import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential, Model
from keras.layers import Dense, Input, concatenate
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import matplotlib
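# Deep Q-learning for a single-item inventory problem with order lead time,
# backlogged demand (negative inventory), and per-period revenue, ordering and holding costs.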
class Environment:
def __init__(self, max_inventory, leadtime):
self.fixed_cost = 0
self.max_inventory = max_inventory
        self.lost_revenue_factor = 0.3  # (not used elsewhere in this script)
self.leadtime = leadtime
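        # Orders already placed but not yet delivered, one slot per lead-time period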
self.inv_list = [0] * self.leadtime
# Returns initial state
def reset(self):
self.inv_list = [0] * self.leadtime
return self.max_inventory
# Order cost = Fixed cost + variable cost
def order_cost(self, x):
if x>0:
oc = self.fixed_cost + 2*x
else:
oc = 0
return oc
    # Holding cost; a negative input means backorders, which are penalised at a higher per-unit rate
def holding_cost(self, x):
if x < 0:
return -2*x
else:
return 1*x
def revenue(self, x):
if x < 0:
return 0
else:
return 8*x
# Calculate next state and reward
def step(self, state, action, demand):
        # The new order joins the in-transit pipeline; the order placed
        # `leadtime` steps ago arrives now and becomes available this period.
        self.inv_list.append(action)
        action = self.inv_list.pop(0)
backlog_orders = 0
reward = 0
# serve backlog orders
if state < 0:
backlog_orders = np.abs(state)
if backlog_orders > action:
reward += self.revenue(action)
backlog_orders -= (action)
else:
reward += self.revenue(backlog_orders)
backlog_orders = 0
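        # On-hand inventory after the arriving order (still negative if backlog remains)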
curr_inv = state + action
# serve current demand
if demand > curr_inv:
reward += self.revenue(curr_inv)
else:
reward += self.revenue(demand)
next_state = (curr_inv - demand)
curr_inv = next_state
        net_reward = reward - self.order_cost(action) - self.holding_cost(curr_inv)  # revenue minus ordering and holding/backlog costs
return next_state, net_reward
class Agent:
def __init__(self, max_inventory, max_orders, leadtime, model, learning_rate = 0.001):
self.max_inventory = max_inventory
self.leadtime = leadtime
self.max_orders = max_orders
self.learning_rate = learning_rate
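        # Experience replay buffer; the oldest transitions are discarded once it is full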
self.memory = deque(maxlen=1000)
self.gamma = 0.95 # discount rate
self.epsilon = 1.0 # exploration rate
self.epsilon_min = 0.01
self.epsilon_max = 1.0
self.epsilon_decay = 0.03
self.type = model
self.hl1 = 5
self.hl2 = 8
self.model = self._build_model()
def set_epsilon(self, epsilon=0.001):
self.epsilon = epsilon
def _build_model(self):
if self.type == 'withdemand':
            ## Agent observes the current demand
state_input = Input(shape=(1,), name = 'state')
demand_input = Input(shape=(1,), name = 'demand')
x = concatenate([state_input, demand_input], name = 'concatenate')
x = Dense(self.hl1, activation='relu', name = 'hidden1')(x)
x = Dense(self.hl2, activation='relu', name = 'hidden2')(x)
action = Dense(self.max_inventory+1, activation='linear', name = 'actions')(x)
model = Model(inputs = [state_input, demand_input], outputs = action)
if self.type == "nodemand":
            ## Agent does not observe the demand
state_input = Input(shape=(1,), name = 'state')
x = Dense(self.hl1, activation='relu', name = 'hidden1')(state_input)
x = Dense(self.hl2, activation='relu', name = 'hidden2')(x)
action = Dense(self.max_inventory+1, activation='linear', name = 'actions')(x)
model = Model(inputs = state_input, outputs = action)
if self.type == 'leadtime':
            ## Agent observes the demand and the orders still in transit
state_input = Input(shape=(1,), name = 'state')
demand_input = Input(shape=(1,), name = 'demand')
            actions_input = Input(shape=(self.leadtime,), name = 'transit_actions')
x = concatenate([state_input, demand_input, actions_input], name = 'concatenate')
x = Dense(self.hl1, activation='relu', name = 'hidden1')(x)
x = Dense(self.hl2, activation='relu', name = 'hidden2')(x)
action = Dense(self.max_inventory+1, activation='linear', name = 'actions')(x)
model = Model(inputs = [state_input, demand_input, actions_input], outputs = action)
model.summary()
model.compile(loss='mse',
                      optimizer=Adam(learning_rate=self.learning_rate))
return model
def remember(self, state, action, reward, next_state, done, demand):
if self.type == "withdemand":
## Knows demand
self.memory.append((state, action, reward, next_state, done, demand))
if self.type == "nodemand":
## No demand
self.memory.append((state, action, reward, next_state, done))
def get_action(self, state, demand):
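        # Epsilon-greedy selection: with probability epsilon order a random quantity,
        # otherwise take the argmax of the predicted Q-values.
        # (Only the 'withdemand' and 'nodemand' variants are handled below.)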
if np.random.rand() <= self.epsilon:
return random.randint(0, self.max_orders)
if self.type == "withdemand":
## Knows demand
act_values = self.model.predict([np.array([state]), np.array([demand])])
if self.type == "nodemand":
## No demand
act_values = self.model.predict(np.array([state]))
action = np.argmax(act_values[0])
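        # Only place the order if it keeps inventory within the warehouse capacity; otherwise order nothing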
if 0 <= action <= (self.max_inventory - state) :
return action # returns action within limit
else:
return 0
def replay(self, batch_size, episode):
minibatch = random.sample(self.memory, batch_size)
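        # Standard DQN update: target = r + gamma * max_a' Q(s', a'); only the Q-value
        # of the action actually taken is moved toward this target.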
if self.type == "withdemand":
## knows demand
for state, action, reward, next_state, done, demand in minibatch:
target = reward
if not done:
target = reward + self.gamma * np.amax(self.model.predict([np.array([next_state]), np.array([demand])]))
target_f = self.model.predict([np.array([state]), np.array([demand])])
target_f[0][action] = target
self.model.fit([np.array([state]), np.array([demand])], target_f, epochs=1, verbose=0)
if self.type == "nodemand":
## No demand
for state, action, reward, next_state, done in minibatch:
target = reward
if not done:
target = reward + self.gamma * np.amax(self.model.predict(np.array([next_state])))
target_f = self.model.predict(np.array([state]))
target_f[0][action] = target
self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)
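        # Decay epsilon exponentially from epsilon_max toward epsilon_min over the episodes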
if self.epsilon > self.epsilon_min:
self.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min)*np.exp(-self.epsilon_decay*episode)
#self.epsilon *= self.epsilon_decay
def load(self, name):
self.model.load_weights(name)
def save(self, name):
self.model.save_weights(name)
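# Train a demand-aware agent on an environment with a lead time of 3 periods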
max_inventory = 100
max_order = 30  # maximum order size per period; also the upper bound on random demand
env = Environment(max_inventory = max_inventory, leadtime = 3)
agent = Agent(max_inventory = max_inventory, max_orders = max_order, leadtime = 3, model = 'withdemand')
#agent.load("withdemandbacklog_100maxinv_30maxorder_3leadtime_500.h5")
#agent.set_epsilon(0.0001)
max_episodes = 10
max_timesteps = 500
episode_cost =[]
batch_size = 16
random.seed(2804)
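# Each episode runs max_timesteps periods with demand drawn uniformly from [1, max_order];
# a replay update is performed once the buffer holds more than batch_size transitions.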
for episode in range(max_episodes):
done = False
net_cost = []
state = env.reset()
#print('initial state:', state)
for timestep in range(max_timesteps):
#print('state:', state)
demand = random.randint(1,max_order)
#print('demand:', demand)
action = agent.get_action(state, demand)
#print('action:', action)
next_state, reward = env.step(state, action, demand)
#print('next_state:', next_state)
#print('cost:', reward)
#print('--------------')
            if timestep == max_timesteps-1:
                reward = 0  # zero reward at the terminal timestep
                done = True
agent.remember(state, action, reward, next_state, done, demand)
net_cost.append(reward)
state = next_state
if done:
print("episode: {}/{}, cost: {}".format(episode+1, max_episodes, np.mean(net_cost)))
break
if len(agent.memory) > batch_size:
agent.replay(batch_size, episode)
episode_cost.append(np.mean(net_cost))
print('epsilon:', agent.epsilon)
print('----------------------------------------------')
#if episode % 100 == 0:
#agent.save('nodemand_backlog_{}.h5.'.format(episode))
#agent.save('nodemand_backlog_{}.h5.'.format(max_episodes))
plt.plot(range(max_episodes),episode_cost)
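plt.xlabel('episode')
plt.ylabel('mean net reward per timestep')
plt.show()
# Evaluation: reload trained weights, act (almost) greedily, and record the trajectory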
max_inventory = 100
max_order = 30
env = Environment(max_inventory = max_inventory, leadtime = 3)
agent = Agent(max_inventory = max_inventory, max_orders = max_order, leadtime = 3, model = 'withdemand')
agent.load("withdemandbacklog_100maxinv_30maxorder_3leadtime_300.h5")
agent.set_epsilon(0.0001)
max_episodes = 1
max_timesteps = 100
episode_cost =[]
batch_size = 16
STATES = []
DEMAND = []
ACTIONS = []
for episode in range(max_episodes):
done = False
net_cost = []
state = env.reset()
#print('initial state:', state)
for timestep in range(max_timesteps):
STATES.append(state)
#print('state:', state)
demand = random.randint(1,max_order)
DEMAND.append(demand)
#print('demand:', demand)
action = agent.get_action(state, demand)
ACTIONS.append(action)
#print('action:', action)
next_state, reward = env.step(state, action, demand)
#print('next_state:', next_state)
#print('cost:', reward)
#print('--------------')
            if timestep == max_timesteps-1:
                reward = 0  # zero reward at the terminal timestep
                done = True
net_cost.append(reward)
state = next_state
if done:
print("episode: {}/{}, cost: {}".format(episode+1, max_episodes, np.mean(net_cost)))
break
episode_cost.append(np.mean(net_cost))
print('----------------------------------------------')
matplotlib.rcParams.update({'font.size': 22})
plt.figure(figsize=(12,8))
#plt.plot(range(max_timesteps),net_cost, 'blue')
plt.plot(range(max_timesteps),STATES, 'red')
#plt.plot(range(max_timesteps),DEMAND, 'green')
plt.plot(range(max_timesteps),ACTIONS, 'orange')
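# Red line = inventory level (state), orange line = order quantity (action)
plt.legend(['inventory level', 'order quantity'])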
plt.axhline(y = 0, color = 'black', linestyle = '--')
plt.xlabel('timesteps')
plt.ylabel('inventory level')
plt.savefig('invopt-rl_leadtime3.png')
plt.show()