Reinforcement Learning Algorithm Example: DQN Implemented in PyTorch

Preface
This example is adapted from a reference implementation, rewritten here in PyTorch with a few optimizations added. In the maze environment, the red square is the exploring agent: reaching the yellow circle gives reward = 1, while entering the black square region gives reward = -1.
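For reference, the quantity that learn() in RL_brain.py regresses the Q-network onto is the standard one-step TD target: the reward plus 0.9 times the largest target-network Q-value in the next state. Below is a minimal numeric sketch of that computation; the numbers are made up purely for illustration and are not from the original post.

import torch

# target-network Q-values for the next state, one column per action (made-up numbers)
q_next = torch.tensor([[0.12, -0.30, 0.05, 0.40]])
reward = torch.tensor([[0.0]])

gamma = 0.9  # the discount factor hard-coded as 0.9 in learn()
td_target = reward + gamma * q_next.max(1)[0].unsqueeze(1)
print(td_target)  # tensor([[0.3600]]) -- the label that q_eval is trained towards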
Code
Main program loop
from dqn.maze_env import Maze
from dqn.RL_brain import DQN
import time


def run_maze():
    print("====Game Start====")
    step = 0
    max_episode = 500
    for episode in range(max_episode):
        state = env.reset()  # reset the agent to the start position
        step_every_episode = 0
        epsilon = episode / max_episode  # increases linearly; probability of acting greedily
        while True:
            # slow the animation down at the very beginning and the very end of training
            if episode < 10:
                time.sleep(0.1)
            if episode > 480:
                time.sleep(0.5)
            env.render()  # refresh the display
            action = model.choose_action(state, epsilon)  # choose an action based on the current state
            # the environment returns the next state, the reward, and whether the episode ends
            next_state, reward, terminal = env.step(action)
            model.store_transition(state, action, reward, next_state)  # store the transition in replay memory
            # start learning only after enough experience has been collected,
            # and then learn once every 5 environment steps
            if step > 200 and step % 5 == 0:
                model.learn()
            # move on to the next step
            state = next_state
            if terminal:
                print("episode=", episode, end=",")
                print("step=", step_every_episode)
                break
            step += 1
            step_every_episode += 1
    # the game is over
    print("====Game Over====")
    env.destroy()


if __name__ == "__main__":
    env = Maze()  # environment
    model = DQN(
        n_states=env.n_states,
        n_actions=env.n_actions
    )  # DQN agent
    run_maze()
    env.mainloop()
    model.plot_cost()  # plot the loss curve
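One detail worth noting in the loop above: epsilon = episode / max_episode is the probability of taking the greedy action (see choose_action in RL_brain.py), so the agent explores heavily at the start and becomes almost fully greedy by the final episodes. A tiny sketch of this schedule, separate from the original code:

# print the greedy/random action probabilities at a few points of the 500-episode run
max_episode = 500
for episode in (0, 100, 250, 400, 499):
    epsilon = episode / max_episode
    print("episode %d: greedy with probability %.2f, random with probability %.2f"
          % (episode, epsilon, 1 - epsilon))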
Environment module: maze_env.py
import tkinter as tk
import sys
import numpy as np

UNIT = 40   # pixels per grid cell
MAZE_H = 4  # grid height
MAZE_W = 4  # grid width


class Maze(tk.Tk, object):
    def __init__(self):
        print("<env init>")
        super(Maze, self).__init__()
        # action space (the moves available to the agent), action = 0-3
        self.action_space = ['u', 'd', 'l', 'r']
        # derived attributes
        self.n_actions = len(self.action_space)
        self.n_states = 2
        # window configuration
        self.title('maze')
        # build the maze canvas
        self.__build_maze()

    def render(self):
        # time.sleep(0.1)
        self.update()

    def reset(self):
        # move the agent back to the start position
        # time.sleep(0.1)
        self.update()
        self.canvas.delete(self.rect)
        origin = np.array([20, 20])
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red')
        # return observation
        return (np.array(self.canvas.coords(self.rect)[:2]) - np.array(self.canvas.coords(self.oval)[:2])) / (MAZE_H * UNIT)

    def step(self, action):
        # move the agent one step; return next_state, reward, terminal
        s = self.canvas.coords(self.rect)
        base_action = np.array([0, 0])
        if action == 0:    # up
            if s[1] > UNIT:
                base_action[1] -= UNIT
        elif action == 1:  # down
            if s[1] < (MAZE_H - 1) * UNIT:
                base_action[1] += UNIT
        elif action == 2:  # right
            if s[0] < (MAZE_W - 1) * UNIT:
                base_action[0] += UNIT
        elif action == 3:  # left
            if s[0] > UNIT:
                base_action[0] -= UNIT

        self.canvas.move(self.rect, base_action[0], base_action[1])  # move the agent
        next_coords = self.canvas.coords(self.rect)  # next state
        # reward function
        if next_coords == self.canvas.coords(self.oval):
            reward = 1
            print("victory")
            done = True
        elif next_coords in [self.canvas.coords(self.hell1)]:
            reward = -1
            print("defeat")
            done = True
        else:
            reward = 0
            done = False
        s_ = (np.array(next_coords[:2]) - np.array(self.canvas.coords(self.oval)[:2])) / (MAZE_H * UNIT)
        return s_, reward, done

    def __build_maze(self):
        self.canvas = tk.Canvas(self, bg='white',
                                height=MAZE_H * UNIT,
                                width=MAZE_W * UNIT)
        # create grid lines
        for c in range(0, MAZE_W * UNIT, UNIT):
            x0, y0, x1, y1 = c, 0, c, MAZE_H * UNIT
            self.canvas.create_line(x0, y0, x1, y1)
        for r in range(0, MAZE_H * UNIT, UNIT):
            x0, y0, x1, y1 = 0, r, MAZE_W * UNIT, r
            self.canvas.create_line(x0, y0, x1, y1)
        origin = np.array([20, 20])
        # black square: entering it gives reward = -1
        hell1_center = origin + np.array([UNIT * 2, UNIT])
        self.hell1 = self.canvas.create_rectangle(
            hell1_center[0] - 15, hell1_center[1] - 15,
            hell1_center[0] + 15, hell1_center[1] + 15,
            fill='black')
        # yellow circle: reaching it gives reward = 1
        oval_center = origin + UNIT * 2
        self.oval = self.canvas.create_oval(
            oval_center[0] - 15, oval_center[1] - 15,
            oval_center[0] + 15, oval_center[1] + 15,
            fill='yellow')
        # red square: the agent
        self.rect = self.canvas.create_rectangle(
            origin[0] - 15, origin[1] - 15,
            origin[0] + 15, origin[1] + 15,
            fill='red')
        self.canvas.pack()
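The environment can be exercised on its own before wiring in the DQN. The following smoke test is not from the original post; it assumes the module is saved as dqn/maze_env.py and that a display is available for tkinter, and it simply drives the maze with random actions to show the normalized 2-D observations that step() returns:

import random
from dqn.maze_env import Maze

env = Maze()
state = env.reset()
for _ in range(50):
    env.render()
    action = random.randrange(env.n_actions)  # pick one of the four moves at random
    next_state, reward, done = env.step(action)
    print("state:", next_state, "reward:", reward, "done:", done)
    state = env.reset() if done else next_state
env.destroy()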
DQN model: RL_brain.py
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import matplotlib.pyplot as plt


class Net(nn.Module):
    def __init__(self, n_states, n_actions):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(n_states, 10)
        self.fc2 = nn.Linear(10, n_actions)
        self.fc1.weight.data.normal_(0, 0.1)
        self.fc2.weight.data.normal_(0, 0.1)

    def forward(self, x):
        x = self.fc1(x)
        x = F.relu(x)
        out = self.fc2(x)
        return out


class DQN:
    def __init__(self, n_states, n_actions):
        print("<DQN init>")
        # DQN keeps two networks, a target net and an eval net, and provides three
        # basic operations: choosing actions, storing transitions and learning
        self.eval_net, self.target_net = Net(n_states, n_actions), Net(n_states, n_actions)
        self.loss = nn.MSELoss()
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=0.01)
        self.n_actions = n_actions
        self.n_states = n_states
        # bookkeeping
        self.learn_step_counter = 0  # counts learning steps, used to schedule target-net updates
        self.memory_counter = 0      # counts stored transitions
        # replay memory: each row holds (state, action, reward, next_state) = 2 + 1 + 1 + 2 values;
        # 2000 rows are allocated, but the code below only cycles through the first 200
        self.memory = np.zeros((2000, 2 * 2 + 2))
        self.cost = []  # records the loss values

    def choose_action(self, x, epsilon):
        # print("<choose_action>")
        x = torch.unsqueeze(torch.FloatTensor(x), 0)  # shape (1, 2)
        if np.random.uniform() < epsilon:
            # greedy action: the action with the highest predicted Q-value
            action_value = self.eval_net.forward(x)
            action = torch.max(action_value, 1)[1].data.numpy()[0]
        else:
            # random exploratory action
            action = np.random.randint(0, self.n_actions)
        # print("action=", action)
        return action

    def store_transition(self, state, action, reward, next_state):
        # print("<store_transition>")
        transition = np.hstack((state, [action, reward], next_state))
        index = self.memory_counter % 200  # overwrite the oldest transitions once the buffer is full
        self.memory[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        # print("<learn>")
        # the target net is used only for prediction and is not updated every step;
        # its parameters are copied from the eval net every 100 learning steps
        if self.learn_step_counter % 100 == 0:
            self.target_net.load_state_dict(self.eval_net.state_dict())
        self.learn_step_counter += 1
        # sample a random mini-batch of 16 transitions from the 200 memory slots
        sample_index = np.random.choice(200, 16)
        memory = self.memory[sample_index, :]
        state = torch.FloatTensor(memory[:, :2])
        action = torch.LongTensor(memory[:, 2:3])
        reward = torch.FloatTensor(memory[:, 3:4])
        next_state = torch.FloatTensor(memory[:, 4:6])
        # compute the loss: q_eval is the predicted value of the action taken,
        # q_target is the TD target for that action
        q_eval = self.eval_net(state).gather(1, action)  # eval_net -> (batch, n_actions), indexed by action
        q_next = self.target_net(next_state).detach()
        # torch.max returns (values, indices); max(1)[0] takes the values
        q_target = reward + 0.9 * q_next.max(1)[0].unsqueeze(1)  # label
        loss = self.loss(q_eval, q_target)
        self.cost.append(loss.item())  # record the loss for the cost curve
        # backpropagate and update
        self.optimizer.zero_grad()  # reset gradients
        loss.backward()             # backward pass
        self.optimizer.step()       # update the model parameters

    def plot_cost(self):
        plt.plot(np.arange(len(self.cost)), self.cost)
        plt.xlabel("step")
        plt.ylabel("cost")
        plt.show()
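As a quick sanity check of the model class in isolation (again not part of the original post, and assuming the module is saved as dqn/RL_brain.py), one can feed the agent random transitions shaped like the maze's 2-D states and confirm that the loss curve gets populated:

import numpy as np
from dqn.RL_brain import DQN

model = DQN(n_states=2, n_actions=4)
for i in range(500):
    state = np.random.uniform(-1, 1, size=2)
    next_state = np.random.uniform(-1, 1, size=2)
    action = np.random.randint(0, 4)
    reward = np.random.choice([-1, 0, 1])
    model.store_transition(state, action, reward, next_state)
    if i > 200:  # mirror the main loop: learn only after enough transitions are stored
        model.learn()
print("recorded", len(model.cost), "loss values")
model.plot_cost()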