import random
import gym
import numpy as np
from collections import deque
from keras.models import Sequential, Model
from keras.layers import Dense, Input, concatenate
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 22})
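# Deep Q-learning for a single-item inventory-control problem with backorders:
# each period the agent chooses an order quantity, demand is realised, and
# negative inventory represents backlogged orders.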
class Environment:
    def __init__(self, max_inventory):
        self.fixed_cost = 0
        self.max_inventory = max_inventory
        self.lost_revenue_factor = 0.3
    # returns the initial state (inventory starts full)
    def reset(self):
        return self.max_inventory
    # order cost = fixed cost + variable cost
    def order_cost(self, x):
        if x > 0:
            oc = self.fixed_cost + 2*x  # variable cost of 2 per unit ordered
        else:
            oc = 0
        return oc
    # holding cost; a negative input means backorders, penalised more per unit
    def holding_cost(self, x):
        if x < 0:
            return -2*x
        else:
            return 1*x
    # revenue generated: 8 per unit sold
    def revenue(self, x):
        if x < 0:
            return 0
        else:
            return 8*x
    # calculate the next state and the reward
    def step(self, state, action, demand):
        backlog_orders = 0
        reward = 0
        # serve backlogged orders first
        if state < 0:
            backlog_orders = np.abs(state)
            if backlog_orders > action:
                reward += self.revenue(action)
                backlog_orders -= action
            else:
                reward += self.revenue(backlog_orders)
                backlog_orders = 0
        curr_inv = state + action
        # serve current demand
        if demand > curr_inv:
            reward += self.revenue(curr_inv)
        else:
            reward += self.revenue(demand)
        next_state = curr_inv - demand
        curr_inv = next_state
        net_reward = reward - self.order_cost(action) - self.holding_cost(curr_inv)
        return next_state, net_reward
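# DQN agent. With model='withdemand' the Q-network takes (state, demand) as
# input; with model='nodemand' it sees only the inventory state.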
class Agent:
    def __init__(self, max_inventory, model, learning_rate=0.001):
        self.max_inventory = max_inventory
        self.learning_rate = learning_rate
        self.memory = deque(maxlen=1000)
        self.gamma = 0.95    # discount rate
        self.epsilon = 1.0   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_max = 1.0
        self.epsilon_decay = 0.03
        self.type = model
        self.model = self._build_model()
    def _build_model(self):
        if self.type == 'withdemand':
            ## the agent knows the demand: the Q-network takes (state, demand)
            state_input = Input(shape=(1,), name='state')
            demand_input = Input(shape=(1,), name='demand')
            x = concatenate([state_input, demand_input], name='concatenate')
            x = Dense(5, activation='relu', name='hidden1')(x)
            x = Dense(8, activation='relu', name='hidden2')(x)
            action = Dense(self.max_inventory+1, activation='linear', name='actions')(x)
            model = Model(inputs=[state_input, demand_input], outputs=action)
        if self.type == 'nodemand':
            ## the agent does not know the demand: the Q-network only sees the state
            state_input = Input(shape=(1,), name='state')
            x = Dense(5, activation='relu', name='hidden1')(state_input)
            x = Dense(8, activation='relu', name='hidden2')(x)
            action = Dense(self.max_inventory+1, activation='linear', name='actions')(x)
            model = Model(inputs=state_input, outputs=action)
        model.summary()
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model
    def remember(self, state, action, reward, next_state, done, demand):
        if self.type == 'withdemand':
            ## store the observed demand with the transition
            self.memory.append((state, action, reward, next_state, done, demand))
        if self.type == 'nodemand':
            ## demand is not part of the observation
            self.memory.append((state, action, reward, next_state, done))
    def get_action(self, state, demand):
        if np.random.rand() <= self.epsilon:
            # explore: random order quantity that keeps inventory within the cap
            if state < 0:
                return random.randint(0, self.max_inventory)
            else:
                return random.randint(0, self.max_inventory - state)
        if self.type == 'withdemand':
            act_values = self.model.predict([np.array([state]), np.array([demand])])
        if self.type == 'nodemand':
            act_values = self.model.predict(np.array([state]))
        action = np.argmax(act_values[0])
        if 0 <= action <= (self.max_inventory - state):
            return action  # action within the inventory limit
        else:
            return 0
    def replay(self, batch_size, episode):
        minibatch = random.sample(self.memory, batch_size)
        if self.type == 'withdemand':
            for state, action, reward, next_state, done, demand in minibatch:
                target = reward
                if not done:
                    target = reward + self.gamma * np.amax(self.model.predict([np.array([next_state]), np.array([demand])]))
                target_f = self.model.predict([np.array([state]), np.array([demand])])
                target_f[0][action] = target
                self.model.fit([np.array([state]), np.array([demand])], target_f, epochs=1, verbose=0)
        if self.type == 'nodemand':
            for state, action, reward, next_state, done in minibatch:
                target = reward
                if not done:
                    target = reward + self.gamma * np.amax(self.model.predict(np.array([next_state])))
                target_f = self.model.predict(np.array([state]))
                target_f[0][action] = target
                self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)
        if self.epsilon > self.epsilon_min:
            # exponential decay of the exploration rate with the episode number
            self.epsilon = self.epsilon_min + (self.epsilon_max - self.epsilon_min)*np.exp(-self.epsilon_decay*episode)
            #self.epsilon *= self.epsilon_decay
    def load(self, name):
        self.model.load_weights(name)
    def save(self, name):
        self.model.save_weights(name)
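# --- Training: 'withdemand' agent with epsilon-greedy exploration and experience replay ---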
max_inventory = 100
env = Environment(max_inventory = max_inventory)
agent = Agent(max_inventory = max_inventory, model = 'withdemand')
#agent.load("withdemand_backlog_200.h5")
max_episodes = 200
max_timesteps = 100
episode_cost =[]
batch_size = 16
random.seed(2804)
for episode in range(max_episodes):
    done = False
    net_cost = []
    state = env.reset()
    #print('initial state:', state)
    #backlog_orders = 0
    for timestep in range(max_timesteps):
        #print('state:', state)
        demand = random.randint(1, max_inventory)
        #print('demand:', demand)
        action = agent.get_action(state, demand)
        #print('action:', action)
        next_state, reward = env.step(state, action, demand)
        #print('next_state:', next_state)
        #print('cost:', reward)
        #print('--------------')
        if timestep == max_timesteps-1:
            reward = 0
            done = True  # zero reward at the terminal state
        agent.remember(state, action, reward, next_state, done, demand)
        net_cost.append(reward)
        state = next_state
        if done:
            print("episode: {}/{}, cost: {}".format(episode+1, max_episodes, np.mean(net_cost)))
            break
        if len(agent.memory) > batch_size:
            agent.replay(batch_size, episode)
    episode_cost.append(np.mean(net_cost))
    print('epsilon:', agent.epsilon)
    print('----------------------------------------------')
    #if episode % 100 == 0:
    #    agent.save('withdemand_backlog_{}.h5'.format(episode))
#agent.save('withdemand_backlog_{}.h5'.format(max_episodes))
plt.figure(figsize=(12,8))
plt.plot(range(max_episodes),episode_cost)
plt.axhline(y = 0, color = 'black', linestyle = '--')
plt.xlabel('Episode')
plt.ylabel('Cost')
plt.show()
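# --- Evaluation: reload the trained weights and roll out a near-greedy policy ---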
max_inventory = 100
env = Environment(max_inventory=max_inventory)
agent = Agent(max_inventory=max_inventory, model = 'withdemand')
#agent.load("dqn_inventory_199.h5")
agent.load("withdemand_backlog_200.h5")
agent.epsilon = 0.0001
max_episodes = 10
max_timesteps = 100
episode_cost =[]
batch_size = 16
STATES = []
DEMAND = []
ACTIONS = []
for episode in range(max_episodes):
    done = False
    net_cost = []
    state = env.reset()
    #print('initial state:', state)
    for timestep in range(max_timesteps):
        #print('state:', state)
        STATES.append(state)
        demand = random.randint(1, max_inventory)
        #print('demand:', demand)
        DEMAND.append(demand)
        action = agent.get_action(state, demand)
        #print('action:', action)
        ACTIONS.append(action)
        #print('----------------')
        next_state, reward = env.step(state, action, demand)
        #print('next_state:', next_state)
        #print('cost:', reward)
        if timestep == max_timesteps-1:
            reward = 0
            done = True  # zero reward at the terminal state
        net_cost.append(reward)
        state = next_state
        if done:
            print("episode: {}/{}, cost: {}".format(episode+1, max_episodes, np.mean(net_cost)))
            break
    episode_cost.append(np.mean(net_cost))
    print('----------------------------------------------')
plt.figure(figsize=(12,8))
# plot the inventory trajectory of the first evaluation episode
plt.plot(range(max_timesteps), STATES[:max_timesteps], color='red')
#plt.plot(range(max_timesteps), DEMAND[:max_timesteps], color='green')
#plt.plot(range(max_timesteps), ACTIONS[:max_timesteps], color='orange')
plt.axhline(y = 0, color = 'black', linestyle = '--')
#plt.axhline(y = 25, color = 'purple', linestyle = '-')
plt.xlabel('timesteps')
plt.ylabel('inventory level')
plt.savefig('withdemand.png')
plt.show()