import numpy as np
import gym
import random
import time
from IPython.display import clear_output
We start by creating the Frozen Lake environment and rendering its initial state.
env = gym.make('FrozenLake-v0')
env.render()
We see that there are 16 states (one per tile of the 4x4 grid) and 4 possible actions (Left, Down, Right, Up).
states = env.observation_space.n
actions = env.action_space.n
print('States:', states, ' Actions:', actions)
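To make printed actions easier to interpret, we can map FrozenLake's action indices to names; the environment encodes them as 0 = Left, 1 = Down, 2 = Right, 3 = Up. The dictionary below is only for illustration.
# Illustrative mapping from action index to a human-readable name
action_names = {0: 'Left', 1: 'Down', 2: 'Right', 3: 'Up'}
sample_action = env.action_space.sample()
print('Sampled action:', sample_action, '->', action_names[sample_action])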
We initialize our Q-table with zeros; you can also initialize it with random values.
qtable = np.zeros((states, actions))
#qtable = np.random.rand(states, actions)
print(qtable)
Setting up the hyperparameters for our algorithm.
episodes = 8000
# 'speed' is the pause (in seconds) after each rendered frame. A higher value slows the agent down (speed = 0.05 gives a comfortable view)
speed = 0
learning_rate = 0.05
gamma = 0.95
epsilon = 1
min_epsilon = 0.01
max_epsilon = 1
decay_rate = 0.001
max_steps = 99
# To track the rewards for each episode
rewards = []
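As a quick sanity check, we can preview how epsilon will shrink under the exponential decay used in the training loop below. This is only a sketch that evaluates the decay formula; it does not train anything.
# Preview the epsilon decay schedule: epsilon = min_epsilon + (max_epsilon - min_epsilon) * exp(-decay_rate * episode)
for ep in [0, 1000, 2000, 4000, 8000]:
    eps = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * ep)
    print('Episode', ep, '-> epsilon ~', round(eps, 3))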
# Beginning training
for episode in range(episodes):
    # Get the initial state
    state = env.reset()
    # Tracks whether the episode has ended (goal reached or agent fell into a hole)
    done = False
    t = 0
    # Total reward collected in this episode
    total_reward = 0
    for t in range(max_steps):
        # Exploration/exploitation trade-off
        tradeoff_no = random.uniform(0, 1)
        if tradeoff_no > epsilon:
            # Exploit: pick the action with the highest Q-value for this state
            action = np.argmax(qtable[state, :])
        else:
            # Explore: pick a random action
            action = env.action_space.sample()
        clear_output(wait=True)
        # Display the environment (useful for watching the agent, but it slows training)
        env.render()
        # Take the action and receive the reward, the next state, and the episode status
        new_state, reward, done, info = env.step(action)
        total_reward += reward
        # Q-learning update (based on the Bellman equation)
        qtable[state, action] = qtable[state, action] + learning_rate * (reward + gamma * np.max(qtable[new_state, :]) - qtable[state, action])
        state = new_state
        time.sleep(speed)
        print(qtable)
        # Display stats at the end of the episode
        if done:
            print("Episode {} finished after {} timesteps".format(episode, t + 1))
            print('Epsilon:', epsilon, 'Total reward:', total_reward)
            time.sleep(0.5)
            break
    # Decrease epsilon over the episodes (exponential decay towards min_epsilon)
    epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
    # Alternative: multiplicative decay with a factor close to 1, e.g. epsilon = max(min_epsilon, epsilon * 0.999)
    rewards.append(total_reward)
print("Average reward per episode: " + str(sum(rewards) / episodes))
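If matplotlib is available, plotting the average reward over blocks of episodes makes the learning progress much easier to see than a single overall score. This is a minimal sketch and assumes matplotlib is installed.
import matplotlib.pyplot as plt
# Average reward (success rate) per block of 1000 episodes
block = 1000
block_means = [sum(rewards[i:i + block]) / block for i in range(0, episodes, block)]
plt.plot(range(0, episodes, block), block_means)
plt.xlabel('Episode')
plt.ylabel('Average reward per block')
plt.show()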
Now we let the trained agent play 10 episodes, always picking the greedy action from the learned Q-table, and count how often it reaches the goal.
env.reset()
speed = 0.05
goal = 0
for episode in range(10):
    state = env.reset()
    done = False
    # Step counter for this episode
    t = 0
    while not done:
        clear_output(wait=True)
        env.render()
        # Always act greedily with respect to the learned Q-table
        action = np.argmax(qtable[state, :])
        new_state, reward, done, info = env.step(action)
        state = new_state
        t += 1
        time.sleep(speed)
        if done:
            # In FrozenLake, a reward of 1 means the goal was reached
            if reward == 1:
                goal += 1
            print("Episode {} finished after {} timesteps".format(episode, t))
            print('Total reward:', reward)
            time.sleep(speed)
            break
env.close()
print('Reached the goal', goal, 'times out of 10')
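If you want to reuse the learned policy later without retraining, the Q-table can be saved to disk with NumPy. This is a minimal sketch; the filename below is just an example.
# Save the learned Q-table and load it back to verify
np.save('frozenlake_qtable.npy', qtable)
qtable_loaded = np.load('frozenlake_qtable.npy')
print('Q-table saved and reloaded correctly:', np.array_equal(qtable, qtable_loaded))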