import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams.update({'font.size': 22})
class Environment:
    # Initializes an object of class Environment
    def __init__(self, max_inventory):
        self.fixed_cost = 0
        self.max_inventory = max_inventory
        self.lost_revenue_factor = 0.3  # currently unused
    # Returns the initial state (start with a full inventory)
    def reset(self):
        return self.max_inventory
    # Order cost: fixed cost + variable (per-unit) cost
    def order_cost(self, x):
        if x > 0:
            oc = self.fixed_cost + 2*x
        else:
            oc = 0
        return oc
    # Holding cost: a negative input means backorders, which are
    # penalized at twice the per-unit holding rate
    def holding_cost(self, x):
        if x < 0:
            return -2*x
        else:
            return 1*x
    # Revenue generated: 8 per unit sold
    def revenue(self, x):
        if x < 0:
            return 0
        else:
            return 8*x
    # Probability of a given demand level
    def prob(self, x):
        p = 1/(self.max_inventory+1)  # ASSUMPTION: demand is uniform on {0, 1, ..., max_inventory}
        return p
    # Expected revenue when taking action 'a' in state 's', i.e. with
    # inventory level I = state + action:
    #   E[revenue] = sum_{j < I} revenue(j)*prob(j) + revenue(I)*P(demand >= I)
    def expected_reward(self, state, action):
        a = 0
        p = 0
        # Demand below the inventory level: each demand level j is met
        # in full and occurs with probability prob(j)
        for j in range(state + action):
            a += self.revenue(j)*self.prob(j)
            p += self.prob(j)
        p = 1 - p
        # Demand at or above the inventory level: only state+action units
        # can be sold; this happens with the remaining probability 1-p
        b = self.revenue(state + action)*p
        return a + b
    # Calculate next state and reward (expected revenue - order cost - holding cost)
    def step(self, state, action, demand):
        next_state = state + action - demand
        reward = self.expected_reward(state, action) - self.order_cost(action) - self.holding_cost(state) - self.holding_cost(action)
        return next_state, reward
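# Optional sanity check (an addition, not part of the original script): the
# closed form in Environment.expected_reward should agree with a Monte Carlo
# average of revenue(min(demand, inventory)) under the assumed uniform demand
# on {0, ..., max_inventory}. The helper name and sample size are illustrative.
def mc_expected_revenue(env, inventory, n_samples=100000, seed=0):
    rng = random.Random(seed)
    total = 0.0
    for _ in range(n_samples):
        d = rng.randint(0, env.max_inventory)   # one uniform demand sample
        total += env.revenue(min(d, inventory)) # can only sell what is in stock
    return total / n_samples
# e.g. mc_expected_revenue(Environment(100), 50) should be close to
# Environment(100).expected_reward(50, 0)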
class Agent:
    def __init__(self, max_inventory, min_stock=0):
        self.max_inventory = max_inventory
        self.min_stock = min_stock
    def get_action(self, state):
        # Order-up-to policy: whenever inventory drops below min_stock,
        # order enough to bring it back up to max_inventory (covering any
        # backorders first); otherwise order nothing
        if state < self.min_stock:
            if state < 0:
                action = self.max_inventory + np.abs(state)
            else:
                action = self.max_inventory - state
        else:
            action = 0
        return action
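# A quick illustration of the policy above (values assume the
# max_inventory=100, min_stock=40 configuration used below):
#   Agent(100, 40).get_action(55)  -> 0    (at or above min_stock: no order)
#   Agent(100, 40).get_action(10)  -> 90   (reorder up to max_inventory)
#   Agent(100, 40).get_action(-20) -> 120  (cover backorders, then refill)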
max_inventory = 100
min_stock = 40
env = Environment(max_inventory = max_inventory)
agent = Agent(max_inventory = max_inventory, min_stock = min_stock)
max_episodes = 1
max_timesteps = 100
net_cost = []  # per-timestep rewards (expected revenue - costs)
STATES = []    # inventory level at each timestep
DEMAND = []    # realized demand at each timestep
ACTIONS = []   # order quantity at each timestep
#random.seed(2804)
for episode in range(max_episodes):
    state = env.reset()
    for timestep in range(max_timesteps):
        STATES.append(state)
        # Sample the demand (uniform over 1..max_inventory; note that
        # Environment.prob assumes support 0..max_inventory)
        demand = random.randint(1, max_inventory)
        DEMAND.append(demand)
        # Take action based on the current state (inventory level)
        action = agent.get_action(state)
        ACTIONS.append(action)
        # Calculate reward and next state for the action taken
        next_state, reward = env.step(state, action, demand)
        if timestep == max_timesteps - 1:
            reward = 0  # reward at the terminal state
        net_cost.append(reward)
        state = next_state
    # Show stats at the end of the episode
    print("episode: {}/{}, mean reward: {}, timesteps: {}".format(episode + 1, max_episodes, np.mean(net_cost), timestep + 1))
    print('----------------------------------------------')
plt.figure(figsize=(12, 8))
#plt.plot(range(max_timesteps*max_episodes), net_cost, color='orange')
plt.plot(range(max_timesteps*max_episodes), STATES, color='red')
#plt.plot(range(max_timesteps*max_episodes), DEMAND, color='green')
#plt.plot(range(max_timesteps*max_episodes), ACTIONS, color='blue')
plt.axhline(y=0, color='black', linestyle='--')
plt.axhline(y=min_stock, color='purple', linestyle='-')
plt.xlabel('timesteps')
plt.ylabel('inventory level')
plt.show()
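# Optional diagnostics (an addition, not in the original script): how often
# the simulated inventory went negative (backorders were incurred) and the
# average order size on the timesteps where an order was actually placed.
stockouts = sum(1 for s in STATES if s < 0)
orders = [a for a in ACTIONS if a > 0]
print('stockout timesteps:', stockouts, 'of', len(STATES))
if orders:
    print('average order size:', np.mean(orders))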