# Deep Q-Learning with the Cart and Pole game

# Importing required libraries

import random
import gym
import numpy as np
import time
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

# This is where we create the soul of our agent

class DQNAgent:
    """Deep Q-learning agent with epsilon-greedy exploration and replay memory.

    Q-values are approximated by a small fully connected Keras network
    (two hidden layers of 24 ReLU units and a linear output with one unit
    per action), trained with mean-squared-error loss and the Adam optimizer.
    Transitions are stored in a bounded replay buffer and sampled at random
    for training.
    """

    def __init__(self, state_size, action_size):
        """Create the agent and build its Q-network.

        Args:
            state_size: Number of input features of the network.
            action_size: Number of discrete actions (network outputs).
        """
        self.state_size = state_size             # Input size from the emulator
        self.action_size = action_size           # Number of actions available
        self.memory = deque(maxlen=2000)         # Replay buffer; oldest transitions are dropped once it is full
        self.gamma = 0.95                        # Discount rate
        self.epsilon = 1.0                       # Current exploration rate
        self.epsilon_min = 0.01                  # Floor that the decay in replay() stops at
        self.epsilon_max = 1.0                   # Not read by any method of this class; kept for compatibility
        self.epsilon_decay = 0.995               # Multiplicative decay applied once per replay() call
        self.learning_rate = 0.001               # Learning rate for our model
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the network that maps a state to per-action Q-values."""
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(Dense(24, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        # NOTE(review): `lr` is the optimizer argument name used by this file's
        # Keras version; confirm the name if the Keras dependency is upgraded.
        model.compile(loss='mse',
                      optimizer=Adam(lr=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one transition tuple to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def get_action(self, state):
        """Choose an action with an epsilon-greedy policy.

        With probability ``self.epsilon`` a uniformly random action is
        returned; otherwise the action with the largest predicted Q-value.

        Args:
            state: Input passed unchanged to ``self.model.predict``; only the
                first row of the prediction is used. Callers in this file
                reshape the observation to ``[1, state_size]`` first.

        Returns:
            The index of the chosen action.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state)
        return np.argmax(act_values[0])

    def replay(self, batch_size, episode):
        """Train the network on a random minibatch from the replay memory.

        For each sampled transition the Q-value of the taken action is moved
        towards ``reward`` (terminal transition) or
        ``reward + gamma * max_a Q(next_state, a)`` (non-terminal), with one
        single-sample ``fit`` call per transition. Afterwards the exploration
        rate is decayed once, never dropping below ``epsilon_min``.

        Args:
            batch_size: Number of transitions to sample. If the memory holds
                fewer transitions than this, the call returns without
                training or decaying epsilon.
            episode: Current episode index. Not used by the current decay
                schedule; kept so existing callers keep working.
        """
        # Not enough experience collected yet: random.sample would raise
        # ValueError, so skip this update instead.
        if len(self.memory) < batch_size:
            return

        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Bootstrap from the best predicted Q-value of the next state.
                target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])

            # Only the entry of the taken action is changed, so the other
            # outputs contribute zero error to the fit.
            target_f = self.model.predict(state)
            target_f[0][action] = target
            self.model.fit(state, target_f, epochs=1, verbose=0)

        # Adjust the exploration rate with experience. The clamp keeps the
        # last decay step from pushing epsilon below its configured floor.
        # An epsilon already at or below the floor is left untouched.
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def load(self, name):
        """Load previously saved model weights from the file ``name``."""
        self.model.load_weights(name)

    def save(self, name):
        """Save the current model weights to the file ``name``."""
        self.model.save_weights(name)

# Creating and initializing the Gym environment

# Build the CartPole environment and raise its per-episode step limit
# (by default this is capped at 200).
env = gym.make('CartPole-v0')
env._max_episode_steps = 500

# Network dimensions are read from the environment's spaces:
# the observation vector length and the number of available actions.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

print('state_size:', state_size)
print('action_size:', action_size)

# Start training

# Train a fresh agent on the environment, learning from replayed experience
# after every step once enough transitions have been collected.
agent = DQNAgent(state_size, action_size)
done = False
batch_size = 128
EPISODES = 1000
render = True

for e in range(EPISODES):
    # Get initial state and reshape it to the [1, state_size] batch shape
    # that the model is fed with
    state = env.reset()
    state = np.reshape(state, [1, state_size])

    # The step counter is deliberately NOT called `time`: that name is the
    # module imported at the top of the file, and rebinding it to an int
    # here would break the later `time.sleep(...)` call.
    for t in range(500):

        # Displays the cart and pole environment
        if render:
            env.render()

        # Take action on current state and observe reward
        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)

        # Reshape the next state
        next_state = np.reshape(next_state, [1, state_size])

        # Save the transition into our memory
        agent.remember(state, action, reward, next_state, done)

        state = next_state

        # Update our model by sampling transitions from our memory
        if len(agent.memory) > batch_size:
            agent.replay(batch_size, e)

        # At end of episode show stats (score is the index of the last step)
        if done:
            print("episode: {}/{}, score: {}, e: {:.2}"
                    .format(e, EPISODES, t, agent.epsilon))
            break

    # Saving model after every 100 episodes of training
    if e % 100 == 0:
        agent.save('dqn_cartpole_{}.h5'.format(e))


env.close()

# Testing our agent

# Evaluation run: rebuild the agent, restore the weights saved at episode 100
# and play a few rendered episodes with (almost) no random exploration.
agent = DQNAgent(state_size, action_size)
agent.load("dqn_cartpole_100.h5")
agent.epsilon = 0.0001
done = False
EPISODES = 10

for e in range(EPISODES):
    # Start a new episode; the model is fed a [1, state_size] batch.
    state = np.reshape(env.reset(), [1, state_size])

    for t in range(500):
        env.render()
        time.sleep(0.03)  # slow the loop down so the rendering can be followed

        action = agent.get_action(state)
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        state = next_state

        if not done:
            continue

        # Episode finished: report the index of the last step as the score.
        print("episode: {}/{}, score: {}, e: {:.2}"
                .format(e, EPISODES, t, agent.epsilon))
        break

env.close()