Inventory Optimization with Reinforcement Learning (With Leadtime)

Import Libraries

In [ ]:
import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential, Model
from keras.layers import Dense, Input, concatenate
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import matplotlib
In [ ]:
class Environment:
    
    def __init__(self, max_inventory, leadtime):
        self.fixed_cost = 0
        self.max_inventory = max_inventory
        self.lost_revenue_factor = 0.3      # penalty factor for lost sales (not used elsewhere in this notebook)
        self.leadtime = leadtime
        self.inv_list = [0] * self.leadtime
    
    # Reset the in-transit order queue and return the initial state (a full inventory)
    def reset(self):
        self.inv_list = [0] * self.leadtime
        return self.max_inventory
    
    # Order cost = Fixed cost + variable cost
    def order_cost(self, x):
        
        if x>0:
            oc = self.fixed_cost +  2*x  
        else: 
            oc = 0
        return oc
    
    # Holding cost; a negative input means backorders, which are penalized at a higher rate
    def holding_cost(self, x):
        if x < 0:
            return -2*x
        else:
            return 1*x
    
    def revenue(self, x):
        if x < 0:
            return 0
        else:
            return 8*x
    
    # Calculate next state and reward
    def step(self, state, action, demand):
        
        # Lead-time queue: the new order joins the back of the queue, while the
        # order placed `leadtime` steps ago arrives now and becomes usable.
        self.inv_list.append(action)
        action = self.inv_list[0]
        self.inv_list.pop(0)
        
        backlog_orders = 0
        reward = 0
        
        # serve backlog orders 
        if state < 0:
            backlog_orders = np.abs(state)
            
            if backlog_orders > action:
                reward += self.revenue(action)
                backlog_orders -= (action)
                
            else:
                reward += self.revenue(backlog_orders)
                backlog_orders = 0
        
        curr_inv = state + action
        
        # serve current demand
        if demand > curr_inv:
            reward += self.revenue(curr_inv)
            
        else:
            reward += self.revenue(demand)
                
        next_state = (curr_inv - demand)
        curr_inv = next_state

        net_reward = reward - self.order_cost(action) - self.holding_cost(curr_inv)    # revenue minus ordering and holding/backorder costs
        return next_state, net_reward
    
     

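Before training, a quick sanity check of the lead-time mechanics can be useful: an order placed now only becomes available `leadtime` steps later. The cell below is a minimal illustrative sketch using the Environment class above; the fixed action and demand values are arbitrary, not part of the original experiment.

In [ ]:
# Illustrative check of the lead-time queue: the first few steps see no
# replenishment because earlier "orders" in the queue are all zero.
check_env = Environment(max_inventory=100, leadtime=3)
state = check_env.reset()                      # episode starts with a full inventory
for t in range(5):
    next_state, net_reward = check_env.step(state, action=10, demand=20)
    print('t={}  state={}  next_state={}  net_reward={}'.format(t, state, next_state, net_reward))
    state = next_state
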
Create our DQN Agent with Q-Learning policy
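For reference, the `replay` method below fits the network toward the standard one-step Q-learning target, target = reward + gamma * max_a' Q(next_state, a'), using a mean-squared-error loss, while `get_action` follows an epsilon-greedy policy whose exploration rate decays per episode.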

In [ ]:
class Agent:
    
    def __init__(self, max_inventory, max_orders, leadtime, model, learning_rate = 0.001):
        self.max_inventory = max_inventory
        self.leadtime = leadtime
        self.max_orders = max_orders
        self.learning_rate = learning_rate
        self.memory = deque(maxlen=1000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_max = 1.0
        self.epsilon_decay = 0.03
        self.type = model
        
        self.hl1 = 5
        self.hl2 = 8
        
        self.model = self._build_model()
        
    def set_epsilon(self, epsilon=0.001):
        self.epsilon = epsilon
        
    def _build_model(self):
        
        if self.type == 'withdemand':
            ## Agent knows the demand
            state_input = Input(shape=(1,), name = 'state')
            demand_input = Input(shape=(1,), name = 'demand')
            x = concatenate([state_input, demand_input], name = 'concatenate')
            x = Dense(self.hl1, activation='relu', name = 'hidden1')(x)
            x = Dense(self.hl2, activation='relu', name = 'hidden2')(x)
            action = Dense(self.max_inventory+1, activation='linear', name = 'actions')(x)
            model = Model(inputs = [state_input, demand_input], outputs = action)
        
        if self.type == "nodemand":
            ## Agent doesn't know the demand
            state_input = Input(shape=(1,), name = 'state')
            x = Dense(self.hl1, activation='relu', name = 'hidden1')(state_input)
            x = Dense(self.hl2, activation='relu', name = 'hidden2')(x)
            action = Dense(self.max_inventory+1, activation='linear', name = 'actions')(x)
            model = Model(inputs = state_input, outputs = action)
        
        if self.type == 'leadtime':
            ## Agent knows the demand and the orders still in transit
            state_input = Input(shape=(1,), name = 'state')
            demand_input = Input(shape=(1,), name = 'demand')
            actions_input = Input(shape=(self.leadtime,), name = 'transit_actions')
            x = concatenate([state_input, demand_input, actions_input], name = 'concatenate')
            x = Dense(self.hl1, activation='relu', name = 'hidden1')(x)
            x = Dense(self.hl2, activation='relu', name = 'hidden2')(x)
            action = Dense(self.max_inventory+1, activation='linear', name = 'actions')(x)
            model = Model(inputs = [state_input, demand_input, actions_input], outputs = action)
        
        
        model.summary()
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    
    def remember(self, state, action, reward, next_state, done, demand):
        # Note: only the 'withdemand' and 'nodemand' variants are handled here and in
        # get_action/replay; the 'leadtime' network above is not trained in this notebook.
        if self.type == "withdemand":
            ## Knows demand
            self.memory.append((state, action, reward, next_state, done, demand))
        if self.type == "nodemand":
            ## No demand
            self.memory.append((state, action, reward, next_state, done))

            
    def get_action(self, state, demand):
        
        if np.random.rand() <= self.epsilon:
            # Explore: pick a random order quantity up to max_orders
            return random.randint(0, self.max_orders)
            
        if self.type == "withdemand":
            ## Knows demand
            act_values = self.model.predict([np.array([state]), np.array([demand])])
        if self.type == "nodemand":
            ## No demand
            act_values = self.model.predict(np.array([state]))
        
        action = np.argmax(act_values[0])
        if 0 <= action <= (self.max_inventory - state):
            return action            # only order if it keeps inventory within max_inventory
        else:
            return 0
            

    def replay(self, batch_size, episode):
        minibatch = random.sample(self.memory, batch_size)
        if self.type == "withdemand":
            ## knows demand
            for state, action, reward, next_state, done, demand in minibatch:
                target = reward
                if not done:
                    target = reward + self.gamma * np.amax(self.model.predict([np.array([next_state]), np.array([demand])]))
                
                target_f = self.model.predict([np.array([state]), np.array([demand])])
                target_f[0][action] = target
                self.model.fit([np.array([state]), np.array([demand])], target_f, epochs=1, verbose=0)
        
        if self.type == "nodemand":
            ## No demand
            for state, action, reward, next_state, done in minibatch:
                target = reward
                if not done:
                    target = reward + self.gamma * np.amax(self.model.predict(np.array([next_state])))

                target_f = self.model.predict(np.array([state]))
                target_f[0][action] = target
                self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min)*np.exp(-self.epsilon_decay*episode)
            #self.epsilon *= self.epsilon_decay

        
    def load(self, name):
        self.model.load_weights(name)

    def save(self, name):
        self.model.save_weights(name)
    

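As a minimal illustrative sketch (assuming the Agent class above and a working Keras/TensorFlow install), the cell below builds the 'withdemand' variant and queries its Q-values for a single (state, demand) pair. The network is untrained at this point, so the greedy action it returns is arbitrary.

In [ ]:
# Illustrative only: the output has shape (1, max_inventory + 1),
# i.e. one Q-value per possible order quantity from 0 to max_inventory.
demo_agent = Agent(max_inventory=100, max_orders=30, leadtime=3, model='withdemand')
q_values = demo_agent.model.predict([np.array([50]), np.array([20])])
print(q_values.shape)
print('greedy order quantity:', np.argmax(q_values[0]))
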
Training our agent

In [ ]:
max_inventory = 100
max_order = 30              # upper bound for both the order size and the per-step demand

env = Environment(max_inventory = max_inventory, leadtime = 3)
agent = Agent(max_inventory = max_inventory, max_orders = max_order, leadtime = 3, model = 'withdemand')
#agent.load("withdemandbacklog_100maxinv_30maxorder_3leadtime_500.h5")
#agent.set_epsilon(0.0001)
max_episodes = 10
max_timesteps = 500  
episode_cost =[]
batch_size = 16
random.seed(2804)


for episode in range(max_episodes):
    done = False
    net_cost = []
    state = env.reset()
    #print('initial state:', state)
    
    for timestep in range(max_timesteps):
        #print('state:', state)
        demand = random.randint(1,max_order)
        #print('demand:', demand)
        
        action = agent.get_action(state, demand)
        #print('action:', action)
        
        next_state, reward = env.step(state, action, demand)
        #print('next_state:', next_state)
        #print('cost:', reward)
        #print('--------------')
        if timestep == max_timesteps-1:
            reward = 0              # zero reward at the terminal state
            done = True
        
        agent.remember(state, action, reward, next_state, done, demand)
        
        net_cost.append(reward)
        state = next_state
        
        if done:
            print("episode: {}/{}, cost: {}".format(episode+1, max_episodes, np.mean(net_cost)))
            break
            
        if len(agent.memory) > batch_size:
            agent.replay(batch_size, episode)
        
    episode_cost.append(np.mean(net_cost))
    print('epsilon:', agent.epsilon)
    print('----------------------------------------------')
    #if episode % 100 == 0:
        #agent.save('withdemand_backlog_{}.h5'.format(episode))
    
#agent.save('withdemand_backlog_{}.h5'.format(max_episodes))

Plot Episode vs Cost

In [ ]:
plt.plot(range(max_episodes),episode_cost)

Testing our agent

In [ ]:
max_inventory = 100
max_order = 30

env = Environment(max_inventory = max_inventory, leadtime = 3)
agent = Agent(max_inventory = max_inventory, max_orders = max_order, leadtime = 3, model = 'withdemand')

agent.load("withdemandbacklog_100maxinv_30maxorder_3leadtime_300.h5")
agent.set_epsilon(0.0001)
max_episodes = 1
max_timesteps = 100
episode_cost =[]
batch_size = 16
STATES = []
DEMAND = []
ACTIONS = []


for episode in range(max_episodes):
    done = False
    net_cost = []
    state = env.reset()
    #print('initial state:', state)
    
    for timestep in range(max_timesteps):
        
        STATES.append(state)
        #print('state:', state)
        demand = random.randint(1,max_order)
        DEMAND.append(demand)
        #print('demand:', demand)
        
        action = agent.get_action(state, demand)
        ACTIONS.append(action)
        #print('action:', action)
        
        next_state, reward = env.step(state, action, demand)
        #print('next_state:', next_state)
        #print('cost:', reward)
        #print('--------------')
        if timestep == max_timesteps-1:
            reward = 0              # zero reward at the terminal state
            done = True
        
        
        net_cost.append(reward)
        state = next_state
        
        if done:
            print("episode: {}/{}, cost: {}".format(episode+1, max_episodes, np.mean(net_cost)))
            break
            
        
    episode_cost.append(np.mean(net_cost))
    print('----------------------------------------------')

Plot States, Actions, Demand, Cost wrt timesteps

In [ ]:
matplotlib.rcParams.update({'font.size': 22})
plt.figure(figsize=(12,8))
#plt.plot(range(max_timesteps),net_cost, 'blue')
plt.plot(range(max_timesteps), STATES, color='red', label='inventory on hand')
#plt.plot(range(max_timesteps), DEMAND, color='green', label='demand')
plt.plot(range(max_timesteps), ACTIONS, color='orange', label='order quantity')
plt.axhline(y=0, color='black', linestyle='--')
plt.xlabel('timesteps')
plt.ylabel('units')
plt.legend()
plt.savefig('invopt-rl_leadtime3.png')
plt.show()
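
An optional, illustrative diagnostic using the STATES and ACTIONS lists collected during testing: how often the recorded inventory is negative (i.e. the agent is carrying backlog) and the average order size it places. These summary statistics are an addition for inspection, not part of the original experiment.

In [ ]:
# Illustrative diagnostic over the test episode
states_arr = np.array(STATES)
print('backlog frequency:', np.mean(states_arr < 0))
print('average order size:', np.mean(ACTIONS))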