首页 > 美文阅读

莫烦python强化学习系列-DQN学习（代码）

更新时间:2023-06-02 18:56:26 阅读：评论：0

import numpy as np

import pandas as pd

import tensorflow as tf

# Fix both random number generators so that weight initialisation, exploration
# and replay sampling are repeatable from run to run.
np.random.seed(1)
tf.set_random_seed(1)

# Deep Q Network off-policy

class DeepQNetwork:
    """Off-policy Deep Q-Network agent written against the TensorFlow 1.x graph API.

    Two structurally identical two-layer networks are built:

    * ``eval_net`` is trained on every call to :meth:`learn`;
    * ``target_net`` is never trained directly; its variables are overwritten
      with the ``eval_net`` variables every ``replace_target_iter`` learning steps.

    Transitions are kept in a fixed-size NumPy ring buffer, one row per
    transition, laid out as ``[s, a, r, s_]``.

    Args:
        n_actions: Number of discrete actions (width of the Q-value output).
        n_features: Length of one observation vector.
        learning_rate: Learning rate passed to the RMSProp optimizer.
        reward_decay: Discount factor (gamma) applied to the next-state value.
        e_greedy: Upper bound for ``epsilon``. Note that in this class
            ``epsilon`` is the probability of taking the *greedy* action.
        replace_target_iter: Number of learning steps between target-network
            refreshes.
        memory_size: Number of rows in the replay buffer.
        batch_size: Number of transitions sampled per learning step.
        e_greedy_increment: If not ``None``, ``epsilon`` starts at 0 and grows
            by this amount per learning step until it reaches ``e_greedy``;
            if ``None``, ``epsilon`` is fixed at ``e_greedy``.
        output_graph: If true, write the graph to ``logs/`` for TensorBoard.
    """

    def __init__(
            self,
            n_actions,
            n_features,
            learning_rate=0.01,
            reward_decay=0.9,
            e_greedy=0.9,
            replace_target_iter=300,
            memory_size=500,
            batch_size=32,
            e_greedy_increment=None,
            output_graph=False,
    ):
        self.n_actions = n_actions
        self.n_features = n_features
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max

        # total learning step
        self.learn_step_counter = 0

        # initialize zero memory [s, a, r, s_]
        self.memory = np.zeros((self.memory_size, n_features * 2 + 2))
        # number of transitions stored so far (keeps growing past memory_size)
        self.memory_counter = 0

        # consist of [target_net, evaluate_net]
        self._build_net()

        # tf.get_collection(key) returns a list holding every element that was
        # stored in the collection named `key`.
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        # Ops that copy every eval_net variable onto its target_net counterpart.
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]

        self.sess = tf.Session()

        if output_graph:
            # $ tensorboard --logdir=logs
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        # loss recorded after each learning step, plotted by plot_cost()
        self.cost_his = []

    def _build_net(self):
        """Create placeholders, both networks, the loss and the training op."""
        # ------------------ build evaluate_net ------------------
        self.s = tf.placeholder(tf.float32, [None, self.n_features], name='s')  # input
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')  # for calculating loss
        with tf.variable_scope('eval_net'):
            # c_names(collections_names) are the collections to store variables
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)  # config of layers

            # first layer. collections is used later when assign to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s, w1) + b1)

            # second layer. collections is used later when assign to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.matmul(l1, w2) + b2  # [batch_size, self.n_actions]

        with tf.variable_scope('loss'):
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        # ------------------ build target_net ------------------
        self.s_ = tf.placeholder(tf.float32, [None, self.n_features], name='s_')  # input
        with tf.variable_scope('target_net'):
            # c_names(collections_names) are the collections to store variables
            c_names = ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            # first layer. collections is used later when assign to target net
            with tf.variable_scope('l1'):
                w1 = tf.get_variable('w1', [self.n_features, n_l1], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [1, n_l1], initializer=b_initializer, collections=c_names)
                l1 = tf.nn.relu(tf.matmul(self.s_, w1) + b1)

            # second layer. collections is used later when assign to target net
            with tf.variable_scope('l2'):
                w2 = tf.get_variable('w2', [n_l1, self.n_actions], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [1, self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.matmul(l1, w2) + b2

    def store_transition(self, s, a, r, s_):
        """Write one transition ``[s, a, r, s_]`` into the replay buffer.

        Once the buffer is full the oldest rows are overwritten.
        """
        transition = np.hstack((s, [a, r], s_))

        # replace the old memory with new memory
        index = self.memory_counter % self.memory_size
        self.memory[index, :] = transition

        self.memory_counter += 1

    def choose_action(self, observation):
        """Return an action index for ``observation``.

        With probability ``epsilon`` the action with the highest predicted
        Q-value is returned; otherwise an action is drawn uniformly at random.
        """
        # to have batch dimension when feed into tf placeholder
        observation = observation[np.newaxis, :]  # shape=(1, n_features)

        if np.random.uniform() < self.epsilon:
            # forward feed the observation and get q value for every actions
            actions_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(actions_value)  # no axis given, so a single flat index is returned
        else:
            action = np.random.randint(0, self.n_actions)
        return action

    # Backward-compatible alias: the listing this class was recovered from
    # spelled the method name `choo_action`.
    choo_action = choose_action

    def learn(self):
        """Run one training step of ``eval_net`` on a sampled batch of transitions."""
        # check to replace target parameters
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('\ntarget_params_replaced\n')

        # sample batch memory from all memory
        if self.memory_counter > self.memory_size:
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            # buffer not full yet: only sample from the rows written so far
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                # row layout is [s, a, r, s_]
                self.s_: batch_memory[:, -self.n_features:],  # fixed params
                self.s: batch_memory[:, :self.n_features],  # newest params
            })

        # change q_target w.r.t q_eval's action
        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)
        eval_act_index = batch_memory[:, self.n_features].astype(int)  # action column; astype(int) converts the dtype
        reward = batch_memory[:, self.n_features + 1]  # reward column

        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)

        # For example, in this batch I have 2 samples and 3 actions:
        #   q_eval =
        #   [[1, 2, 3],
        #    [4, 5, 6]]
        #
        #   q_target = q_eval =
        #   [[1, 2, 3],
        #    [4, 5, 6]]
        #
        # Then change q_target with the real q_target value w.r.t. the q_eval's action.
        # For example, in:
        #   sample 0, I took action 0, and the max q_target value is -1;
        #   sample 1, I took action 2, and the max q_target value is -2:
        #   q_target =
        #   [[-1, 2, 3],
        #    [4, 5, -2]]
        #
        # So the (q_target - q_eval) becomes:
        #   [[(-1)-(1), 0, 0],
        #    [0, 0, (-2)-(6)]]
        #
        # We then backpropagate this error w.r.t. the corresponding action to the
        # network, leaving the other actions at error=0 because we didn't choose them.

        # train eval network
        _, self.cost = self.sess.run([self._train_op, self.loss],
                                     feed_dict={self.s: batch_memory[:, :self.n_features],
                                                self.q_target: q_target})
        self.cost_his.append(self.cost)

        # increasing epsilon
        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        """Plot the loss recorded at every learning step."""
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()

from maze_env import Maze

from RL_brain import DeepQNetwork

def run_maze():
    """Train the agent for 300 episodes in the maze, then tear the environment down.

    Relies on the module-level ``env`` (environment) and ``RL`` (agent) objects
    created in the ``__main__`` section.
    """
    step = 0
    for episode in range(300):
        # initial observation
        # NOTE(review): the call on this line was lost in the scraped listing;
        # `env.reset()` is reconstructed from the comment above - confirm
        # against maze_env.
        observation = env.reset()

        while True:
            # refresh env
            # NOTE(review): reconstructed call, the original statement was
            # replaced by unrelated text - confirm against maze_env.
            env.render()

            # RL choose action based on observation
            action = RL.choose_action(observation)

            # RL take action and get next observation and reward
            observation_, reward, done = env.step(action)

            RL.store_transition(observation, action, reward, observation_)

            # start learning only after 200 stored steps, then every 5th step
            if (step > 200) and (step % 5 == 0):
                RL.learn()

            # swap observation
            observation = observation_

            # break while loop when end of this episode
            if done:
                break
            step += 1

    # end of game
    print('game over')
    env.destroy()

if __name__ == "__main__":
    # maze game
    env = Maze()
    RL = DeepQNetwork(env.n_actions, env.n_features,
                      learning_rate=0.01,
                      reward_decay=0.9,
                      e_greedy=0.9,
                      replace_target_iter=200,
                      memory_size=2000,
                      # output_graph=True
                      )
    # Schedule the training loop to start 100 (presumably milliseconds - confirm
    # against maze_env) after the environment's main loop begins.
    env.after(100, run_maze)
    env.mainloop()
    # Reached once the main loop has returned: show the recorded training loss.
    RL.plot_cost()

本文发布于:2023-06-02 18:56:26，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/fan/82/836161.html

上一篇：小学贫困生补助申请书

下一篇：勇于创新作文600字初中(4篇)

标签：学习返回元素股票转换装饰获取

留言与评论（共有 0 条评论）