Inventory Optimization with Reinforcement Learning

Import Libraries

In [ ]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential, Model
from keras.layers import Dense, Input, concatenate
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 22})
In [ ]:
class Environment:
    
    def __init__(self, max_inventory):
        self.fixed_cost = 0
        self.max_inventory = max_inventory
        self.lost_revenue_factor = 0.3   # not used in this notebook
    
    # returns initial state
    def reset(self):
        return self.max_inventory
    
    # order cost = fixed cost + variable cost
    def order_cost(self, x):
        
        if x > 0:
            oc = self.fixed_cost + 2*x   # variable cost: 2 per unit ordered
        else: 
            oc = 0
        return oc
    
    # Holding cost per unit held; a negative inventory level (backorders) incurs a backorder penalty instead
    def holding_cost(self, x):
        if x < 0:
            return -2*x
        else:
            return 1*x
    
    # revenue generated: 8 per unit sold
    def revenue(self, x):
        if x < 0:
            return 0
        else:
            return 8*x
    
    # Calculate next state and reward
    def step(self, state, action, demand):
        
        backlog_orders = 0
        reward = 0
        
        # serve backlog orders 
        if state < 0:
            backlog_orders = np.abs(state)
            
            if backlog_orders > action:
                reward += self.revenue(action)
                backlog_orders -= (action)
                
            else:
                reward += self.revenue(backlog_orders)
                backlog_orders = 0
        
        curr_inv = state + action
        
        # serve current demand
        if demand > curr_inv:
            reward += self.revenue(curr_inv)
            
        else:
            reward += self.revenue(demand)
                
        next_state = (curr_inv - demand)
        curr_inv = next_state

        # net reward = revenue - ordering cost - holding/backorder cost
        net_reward = reward - self.order_cost(action) - self.holding_cost(curr_inv)
        
        return next_state, net_reward
    
     

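As a quick sanity check of the reward logic (a minimal illustrative cell, not part of the original notebook), a single step from a full warehouse with no order placed and a demand of 60 should earn revenue 8*60 = 480, pay no order cost, and pay a holding cost of 40 on the 40 leftover units, for a net reward of 440:

In [ ]:
# Illustrative sanity check of Environment.step (not part of the training run)
env_check = Environment(max_inventory = 100)
state = env_check.reset()                           # start at full inventory (100)
next_state, reward = env_check.step(state, 0, 60)   # no order placed, demand of 60
print(next_state, reward)                           # expected: 40 440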
Create our agent: a DQN trained with the Q-learning update and an epsilon-greedy policy
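The agent below is a small feed-forward Q-network. In the replay step it is fitted toward the one-step Q-learning target, reward + gamma * max_a' Q(next_state, a'), and toward the raw reward at the terminal step. Two variants are built: one that also receives the current demand as an input ('withdemand') and one that only sees the inventory level ('nodemand').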

In [ ]:
class Agent:
    
    def __init__(self, max_inventory, model, learning_rate = 0.001):
        self.max_inventory = max_inventory
        self.learning_rate = learning_rate
        self.memory = deque(maxlen=1000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_max = 1.0
        self.epsilon_decay = 0.03
        self.type = model
        self.model = self._build_model()
        
    def _build_model(self):
        
        if self.type == 'withdemand':
            ## Agent knows the demand
            state_input = Input(shape=(1,), name = 'state')
            demand_input = Input(shape=(1,), name = 'demand')
            x = concatenate([state_input, demand_input], name = 'concatenate')
            x = Dense(5, activation='relu', name = 'hidden1')(x)
            x = Dense(8, activation='relu', name = 'hidden2')(x)
            action = Dense(self.max_inventory+1, activation='linear', name = 'actions')(x)
            model = Model(inputs = [state_input, demand_input], outputs = action)
        
        if self.type == "nodemand":
            ## Agent doesn't know the demand
            state_input = Input(shape=(1,), name = 'state')
            x = Dense(5, activation='relu', name = 'hidden1')(state_input)
            x = Dense(8, activation='relu', name = 'hidden2')(x)
            action = Dense(self.max_inventory+1, activation='linear', name = 'actions')(x)
            model = Model(inputs = state_input, outputs = action)
        
        model.summary()
        model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done, demand):
        
        if self.type == "withdemand":
            ## Knows demand
            self.memory.append((state, action, reward, next_state, done, demand))
        if self.type == "nodemand":
            ## No demand
            self.memory.append((state, action, reward, next_state, done))

            
    def get_action(self, state, demand):
        
        if np.random.rand() <= self.epsilon:
            if state < 0:
                return random.randint(0, self.max_inventory)
            else:
                return random.randint(0, (self.max_inventory - state))
        
        if self.type == "withdemand":
            ## Knows demand
            act_values = self.model.predict([np.array([state]), np.array([demand])])
        if self.type == "nodemand":
            ## No demand
            act_values = self.model.predict(np.array([state]))
        
        action = np.argmax(act_values[0])
        
        if 0 <= action <= (self.max_inventory - state) :
            return action            # returns action within limit
        else:
            return 0
            

    def replay(self, batch_size, episode):
        minibatch = random.sample(self.memory, batch_size)
        if self.type == "withdemand":
            ## knows demand
            for state, action, reward, next_state, done, demand in minibatch:
                target = reward
                if not done:
                    target = reward + self.gamma * np.amax(self.model.predict([np.array([next_state]), np.array([demand])]))
                
                target_f = self.model.predict([np.array([state]), np.array([demand])])
                target_f[0][action] = target
                self.model.fit([np.array([state]), np.array([demand])], target_f, epochs=1, verbose=0)
        
        if self.type == "nodemand":
            ## No demand
            for state, action, reward, next_state, done in minibatch:
                target = reward
                if not done:
                    target = reward + self.gamma * np.amax(self.model.predict(np.array([next_state])))

                target_f = self.model.predict(np.array([state]))
                target_f[0][action] = target
                self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min)*np.exp(-self.epsilon_decay*episode)
            #self.epsilon *= self.epsilon_decay

        
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
    

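A minimal smoke test (illustrative only; test_agent and the input values below are made up for the check) confirms that the 'withdemand' network emits one Q-value per possible order quantity, i.e. max_inventory + 1 outputs:

In [ ]:
# Illustrative smoke test: the Q-network should output one value per order quantity
test_agent = Agent(max_inventory = 100, model = 'withdemand')
q_values = test_agent.model.predict([np.array([50]), np.array([30])])   # state 50, demand 30
print(q_values.shape)    # expected: (1, 101)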
Training our agent

In [ ]:
max_inventory = 100

env = Environment(max_inventory = max_inventory)
agent = Agent(max_inventory = max_inventory, model = 'withdemand')
#agent.load("withdemand_backlog_200.h5")
max_episodes = 200
max_timesteps = 100  
episode_cost =[]
batch_size = 16
random.seed(2804)


for episode in range(max_episodes):
    done = False
    net_cost = []
    state = env.reset()
    #print('initial state:', state)
    #backlog_orders = 0
    
    for timestep in range(max_timesteps):
        #print('state:', state)
        demand = random.randint(1,max_inventory)
        #print('demand:', demand)
        
        action = agent.get_action(state, demand)
        #print('action:', action)
        
        next_state, reward = env.step(state, action, demand)
        #print('next_state:', next_state)
        #print('cost:', reward)
        #print('--------------')
        if timestep == max_timesteps-1:
            reward = 0    # zero out the reward at the terminal state
            done = True
        
        agent.remember(state, action, reward, next_state, done, demand)
        
        net_cost.append(reward)
        state = next_state
        
        if done:
            print("episode: {}/{}, cost: {}".format(episode+1, max_episodes, np.mean(net_cost)))
            break
            
        if len(agent.memory) > batch_size:
            agent.replay(batch_size, episode)
        
    episode_cost.append(np.mean(net_cost))
    print('epsilon:', agent.epsilon)
    print('----------------------------------------------')
    #if episode % 100 == 0:
        #agent.save('withdemand_backlog_{}.h5'.format(episode))
    
#agent.save('withdemand_backlog_{}.h5'.format(max_episodes))
In [ ]:
plt.figure(figsize=(12,8))
plt.plot(range(max_episodes),episode_cost)
plt.axhline(y = 0, color = 'black', linestyle = '--')
plt.xlabel('Episode')
plt.ylabel('Cost')
plt.show()
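The per-episode curve is quite noisy; a simple moving average (an optional illustrative addition, with the window size chosen arbitrarily) makes the learning trend easier to see:

In [ ]:
# Optional: smooth the noisy per-episode curve with a moving average (window chosen arbitrarily)
window = 10
smoothed = np.convolve(episode_cost, np.ones(window)/window, mode='valid')
plt.figure(figsize=(12,8))
plt.plot(range(len(smoothed)), smoothed)
plt.axhline(y = 0, color = 'black', linestyle = '--')
plt.xlabel('Episode')
plt.ylabel('Cost (moving average)')
plt.show()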
In [ ]:
max_inventory = 100
env = Environment(max_inventory=max_inventory)
agent = Agent(max_inventory=max_inventory, model = 'withdemand')
#agent.load("dqn_inventory_199.h5")
agent.load("withdemand_backlog_200.h5")
agent.epsilon = 0.0001
max_episodes = 10
max_timesteps = 100
episode_cost =[]
batch_size = 16
STATES = []
DEMAND = []
ACTIONS = []

for episode in range(max_episodes):
    done = False
    net_cost = []
    state = env.reset()
    #print('initial state:', state)
    
    for timestep in range(max_timesteps):
        #print('state:', state)
        STATES.append(state)
        demand = random.randint(1,max_inventory)
        #print('demand:', demand)
        DEMAND.append(demand)
        action = agent.get_action(state, demand)
        #print('action:', action)
        ACTIONS.append(action)
        #print('----------------')
        next_state, reward = env.step(state, action, demand)
        #print('next_state:', next_state)
        #print('cost:', reward)
        
        if timestep == max_timesteps-1:
            reward = 0    # zero out the reward at the terminal state
            done = True
        
        net_cost.append(reward)
        state = next_state
        
        if done:
            print("episode: {}/{}, cost: {}".format(episode, max_episodes-1, np.mean(net_cost)))
            break
            
    episode_cost.append(np.mean(net_cost))
    print('----------------------------------------------')


    
In [ ]:
plt.figure(figsize=(12,8))

# STATES holds every evaluation episode; plot only the first episode's trajectory
plt.plot(range(max_timesteps), STATES[:max_timesteps], 'red')
#plt.plot(range(max_timesteps), DEMAND[:max_timesteps], 'green')
#plt.plot(range(max_timesteps), ACTIONS[:max_timesteps], 'orange')
plt.axhline(y = 0, color = 'black', linestyle = '--')
#plt.axhline(y = 25, color = 'purple', linestyle = '-')
plt.xlabel('Timestep')
plt.ylabel('Inventory level')
plt.savefig('withdemand.png')
plt.show()