Deep Reinforcement Learning -- Deep Q Network

Date: 2022-10-28 15:22:19

Starting from here, we switch to a different game for the demo: CartPole.


Deep Q Network
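The network in the example below approximates the action-value function Q(s, a): the state is the input and the Q value of each action is the output. Each training step regresses the main network toward a one-step bootstrapped target computed from a separate, periodically synced target network. This is the standard DQN update, and it is exactly what the training code implements:

$$
y = \begin{cases} r & \text{if the episode terminates at } s' \\ r + \gamma \max_{a'} Q_{\text{target}}(s', a') & \text{otherwise} \end{cases}
$$

Here $\gamma$ is the discount_factor (0.99 in the code), and the loss is the mean squared error between the predicted $Q(s, a)$ and the target $y$.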

Example code


import sys
import gym
import pylab
import random
import numpy as np
from collections import deque
from keras.layers import Dense
from keras.optimizers import Adam
from keras.models import Sequential

EPISODES = 300


# DQN agent for the CartPole game.
# It uses a neural network to approximate the Q function of Q-learning,
# plus an experience replay memory and a fixed target Q network.
class DQNAgent:
    def __init__(self, state_size, action_size):
        # if you want to watch CartPole learning, change this to True
        self.render = True
        self.load_model = False

        # sizes of the state and action spaces
        self.state_size = state_size
        self.action_size = action_size

        # hyperparameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.train_start = 1000
        # create the replay memory using a deque
        self.memory = deque(maxlen=2000)

        # create the main model and the target model
        self.model = self.build_model()
        self.target_model = self.build_model()

        # initialize the target model
        self.update_target_model()

        if self.load_model:
            self.model.load_weights("./save_model/cartpole_dqn.h5")

    # approximate the Q function with a neural network:
    # the state is the input, and the Q value of each action is the output
    def build_model(self):
        model = Sequential()
        model.add(Dense(24, input_dim=self.state_size, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(24, activation='relu',
                        kernel_initializer='he_uniform'))
        model.add(Dense(self.action_size, activation='linear',
                        kernel_initializer='he_uniform'))
        model.summary()
        model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
        return model

    # after some time interval, update the target model to match the main model
    def update_target_model(self):
        self.target_model.set_weights(self.model.get_weights())

    # get an action from the model using the epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            # predict q(s, a) for every action and pick the largest as the next action
            q_value = self.model.predict(state)
            return np.argmax(q_value[0])

    # save the sample <s, a, r, s'> to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

    # pick samples randomly from the replay memory (with batch_size)
    def train_model(self):
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        # mini_batch is a list of 64 samples, each of the form
        # (array([[-0.0426, -0.0066, 0.0051, -0.0020]]), 0, 1.0,
        #  array([[-0.0428, -0.2018, 0.0050, 0.2923]]), False)
        mini_batch = random.sample(self.memory, batch_size)

        update_input = np.zeros((batch_size, self.state_size))
        update_target = np.zeros((batch_size, self.state_size))
        action, reward, done = [], [], []

        for i in range(self.batch_size):
            update_input[i] = mini_batch[i][0]
            action.append(mini_batch[i][1])
            reward.append(mini_batch[i][2])
            update_target[i] = mini_batch[i][3]
            done.append(mini_batch[i][4])

        target = self.model.predict(update_input)              # shape (64, 2)
        target_val = self.target_model.predict(update_target)  # shape (64, 2)

        for i in range(self.batch_size):
            # Q-learning: take the maximum Q value at s' from the target model
            if done[i]:
                target[i][action[i]] = reward[i]
            else:
                # off-policy update
                target[i][action[i]] = reward[i] + self.discount_factor * (
                    np.amax(target_val[i]))

        # and do the model fit!
        self.model.fit(update_input, target, batch_size=self.batch_size,
                       epochs=1, verbose=0)


if __name__ == "__main__":
    # in the case of CartPole-v1, the maximum length of an episode is 500
    env = gym.make('CartPole-v1')
    # get the sizes of the state and action spaces from the environment
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n

    agent = DQNAgent(state_size, action_size)

    scores, episodes = [], []

    for e in range(EPISODES):
        done = False
        score = 0
        state = env.reset()
        state = np.reshape(state, [1, state_size])

        while not done:
            if agent.render:
                env.render()

            # get the action for the current state and take one step in the environment
            action = agent.get_action(state)
            next_state, reward, done, info = env.step(action)
            next_state = np.reshape(next_state, [1, state_size])
            # if an action ends the episode early, give a penalty of -100
            reward = reward if not done or score == 499 else -100

            # save the sample <s, a, r, s'> to the replay memory
            agent.append_sample(state, action, reward, next_state, done)
            # train at every time step
            agent.train_model()
            score += reward
            state = next_state

            if done:
                # at the end of every episode, sync the target model with the main model
                agent.update_target_model()

                # every episode, plot the play time
                score = score if score == 500 else score + 100
                scores.append(score)
                episodes.append(e)
                pylab.plot(episodes, scores, 'b')
                pylab.savefig("./save_graph/cartpole_dqn.png")
                print("episode:", e, "  score:", score, "  memory length:",
                      len(agent.memory), "  epsilon:", agent.epsilon)

                # if the mean score of the last 10 episodes is bigger than 490,
                # stop training
                if np.mean(scores[-min(10, len(scores)):]) > 490:
                    sys.exit()

        # save the model every 50 episodes
        if e % 50 == 0:
            agent.model.save_weights("./save_model/cartpole_dqn.h5")
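
The training script saves its weights to ./save_model/cartpole_dqn.h5 every 50 episodes. Below is a minimal evaluation sketch, not part of the original post: it rebuilds a network with the same layer shapes, loads the saved weights, and runs the greedy policy with no exploration. The number of evaluation episodes is an arbitrary choice for illustration, and the paths are assumed to match the ones used above.

import gym
import numpy as np
from keras.layers import Dense
from keras.models import Sequential

env = gym.make('CartPole-v1')
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

# rebuild the same architecture as build_model() so the saved weights fit
model = Sequential()
model.add(Dense(24, input_dim=state_size, activation='relu'))
model.add(Dense(24, activation='relu'))
model.add(Dense(action_size, activation='linear'))
model.load_weights("./save_model/cartpole_dqn.h5")  # assumed path, as saved by the trainer

for e in range(5):  # 5 evaluation episodes (arbitrary)
    state = np.reshape(env.reset(), [1, state_size])
    done, score = False, 0
    while not done:
        # greedy action: no epsilon-greedy exploration during evaluation
        action = np.argmax(model.predict(state)[0])
        next_state, reward, done, _ = env.step(action)
        state = np.reshape(next_state, [1, state_size])
        score += reward
    print("eval episode:", e, " score:", score)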