Reinforcement Learning: DQN Code (PyTorch Implementation)
Personally I find the code here rather long and scattered; it is worth reorganizing it into a code structure in your own style, which is an essential part of turning the algorithm into a working implementation.
Import packages
import gym
from gym import wrappers
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
from IPython.display import clear_output
from matplotlib import pyplot as plt
%matplotlib inline
import random
from timeit import default_timer as timer
from datetime import timedelta
import math
from utils.wrappers import make_atari, wrap_deepmind, wrap_pytorch
from utils.hyperparameters import Config
from agents.BaseAgent import BaseAgent
# Without these two lines the Notebook kernel may crash
import os
Hyperparameters
The utils/hyperparameters.py file:
import torch
import math
class Config(object):
    def __init__(self):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # PPO controls
        self.ppo_epoch = 3
        self.num_mini_batch = 32
        self.ppo_clip_param = 0.1

        # a2c controls
        self.num_agents = 8
        self.value_loss_weight = 0.5
        self.USE_GAE = True
        self.gae_tau = 0.95

        # algorithm control
        self.USE_NOISY_NETS = False
        self.USE_PRIORITY_REPLAY = False

        # Multi-step returns
        self.N_STEPS = 1

        # epsilon variables
        self.epsilon_start = 1.0
        self.epsilon_final = 0.01
        self.epsilon_decay = 30000
        self.epsilon_by_frame = lambda frame_idx: self.epsilon_final + (self.epsilon_start - self.epsilon_final) * math.exp(-1. * frame_idx / self.epsilon_decay)

        # misc agent variables
        self.GAMMA = 0.99
        self.LR = 1e-4

        # memory
        self.TARGET_NET_UPDATE_FREQ = 1000
        self.EXP_REPLAY_SIZE = 100000
        self.BATCH_SIZE = 32
        self.PRIORITY_ALPHA = 0.6
        self.PRIORITY_BETA_START = 0.4
        self.PRIORITY_BETA_FRAMES = 100000

        # Noisy Nets
        self.SIGMA_INIT = 0.5

        # Learning control variables
        self.LEARN_START = 10000
        self.MAX_FRAMES = 100000

        # Categorical Params
        self.ATOMS = 51
        self.V_MAX = 10
        self.V_MIN = -10

        # Quantile Regression Parameters
        self.QUANTILES = 51

        # DRQN Parameters
        self.SEQUENCE_LENGTH = 8
Main file code:
# Load the pre-defined hyperparameters
config = Config()
config.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# epsilon is the exploration factor: the smaller it is, the lower the probability of a random
# action. We want the model to explore heavily early in training so it can find relatively
# better trajectories, and then settle into stable updates later on (a quick numeric check of
# this schedule follows the block).
config.epsilon_start = 1.0
config.epsilon_final = 0.01
config.epsilon_decay = 30000
config.epsilon_by_frame = lambda frame_idx: config.epsilon_final + (config.epsilon_start - config.epsilon_final) * math.exp(-1. * frame_idx / config.epsilon_decay)

# Discount factor and learning rate
config.GAMMA = 0.99
config.LR = 1e-4

# memory
config.TARGET_NET_UPDATE_FREQ = 1000
config.EXP_REPLAY_SIZE = 100000
config.BATCH_SIZE = 32

# Learning control variables
config.LEARN_START = 10000
# Maximum number of frames to run
config.MAX_FRAMES = 1000000
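As a quick sanity check on the epsilon schedule above, the short snippet below (a minimal sketch; the chosen frame indices are arbitrary) prints the exploration rate at a few points and shows it decaying from epsilon_start toward epsilon_final:

import math

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 30000
epsilon_by_frame = lambda frame_idx: epsilon_final + \
    (epsilon_start - epsilon_final) * math.exp(-1. * frame_idx / epsilon_decay)

# Epsilon shrinks smoothly with the frame index: lots of random exploration
# early on, mostly greedy behaviour later.
for frame_idx in (0, 10000, 30000, 100000):
    print(frame_idx, round(epsilon_by_frame(frame_idx), 4))
# roughly: 0 -> 1.0, 10000 -> 0.72, 30000 -> 0.37, 100000 -> 0.045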
Experience replay buffer
class ExperienceReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []

    # Append a transition; when the buffer is full, drop the oldest one
    def push(self, transition):
        self.memory.append(transition)
        if len(self.memory) > self.capacity:
            del self.memory[0]

    # Sample batch_size transitions from the buffer
    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)
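A minimal usage sketch of the buffer (the dummy transitions and the (state, action, reward, next_state) layout are assumptions for illustration; the real agent decides what it stores):

import random

memory = ExperienceReplayMemory(capacity=1000)

# Store (state, action, reward, next_state) tuples as they are collected
for step in range(64):
    memory.push((step, random.randrange(4), 0.0, step + 1))  # dummy transition

# Draw a random minibatch and unzip it into separate tuples
batch = memory.sample(32)
states, actions, rewards, next_states = zip(*batch)
print(len(memory), len(batch))  # 64 32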
Neural network
class DQN(nn.Module):
    def __init__(self, input_shape, num_actions):
        super(DQN, self).__init__()

        self.input_shape = input_shape
        self.num_actions = num_actions

        # The input is a (stacked) image; with RGB channels, input_shape[0] = 3
        self.conv1 = nn.Conv2d(self.input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        self.fc1 = nn.Linear(self.feature_size(), 512)
        self.fc2 = nn.Linear(512, self.num_actions)

    def forward(self, x):
        # Three conv layers + flatten + fully connected layer with ReLU + linear output layer
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.relu(self.conv3(x))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    # Run a dummy forward pass through the conv stack to get the flattened feature size
    def feature_size(self):
        return self.conv3(self.conv2(self.conv1(torch.zeros(1, *self.input_shape)))).view(1, -1).size(1)
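A quick shape check on the network (a minimal sketch: the 4x84x84 input shape corresponds to the usual four stacked grayscale Atari frames and the action count of 6 is arbitrary; both are assumptions, not values from the original post):

net = DQN(input_shape=(4, 84, 84), num_actions=6)

# A fake batch of two preprocessed observations
obs = torch.zeros(2, 4, 84, 84)
q_values = net(obs)
print(q_values.shape)  # expected: torch.Size([2, 6]), one Q-value per action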
Agent
The agents/BaseAgent.py file:
import numpy as np
import pickle
import os.path
import torch
import torch.optim as optim
class BaseAgent(object):
    def __init__(self):
        self.model = None
        self.target_model = None
        self.optimizer = None

        self.loss = []
        self.rewards = []
        self.sigma_parameter_mag = []
    # Element-wise Huber loss with delta = 1: quadratic for |x| < 1, linear otherwise
    def huber(self, x):
        cond = (x.abs() < 1.0).float().detach()
        return 0.5 * x.pow(2) * cond + (x.abs() - 0.5) * (1.0 - cond)
    # Save the model and optimizer
    def save_w(self):
        torch.save(self.model.state_dict(), './saved_agents/model.dump')
        torch.save(self.optimizer.state_dict(), './saved_agents/optim.dump')
    # Load the model and optimizer
    def load_w(self):
        fname_model = "./saved_agents/model.dump"
        fname_optim = "./saved_agents/optim.dump"

        if os.path.isfile(fname_model):
            self.model.load_state_dict(torch.load(fname_model))
            self.target_model.load_state_dict(self.model.state_dict())

        if os.path.isfile(fname_optim):
            self.optimizer.load_state_dict(torch.load(fname_optim))
    # Save the replay buffer
    def save_replay(self):
        pickle.dump(self.memory, open('./saved_agents/exp_replay_agent.dump', 'wb'))
    # Load the replay buffer
    def load_replay(self):
        fname = './saved_agents/exp_replay_agent.dump'
        if os.path.isfile(fname):
            self.memory = pickle.load(open(fname, 'rb'))
    # Record the average magnitude of the NoisyNet sigma parameters
    def save_sigma_param_magnitudes(self):
        tmp = []
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                if 'sigma' in name:
                    # Move the data to the CPU, flatten it into 1-D with ravel, and convert to a list
                    tmp += param.data.cpu().numpy().ravel().tolist()
        if tmp:
            # Compute the mean absolute value and append it to the record
            self.sigma_parameter_mag.append(np.mean(np.abs(np.array(tmp))))
    def save_loss(self, loss):
        self.loss.append(loss)

    def save_reward(self, reward):
        self.rewards.append(reward)
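To see how the pieces above are meant to fit together, here is a minimal, hypothetical subclass that wires the DQN network, the optimizer, and the replay buffer into BaseAgent; the class name, input shape, action count, and directory handling are assumptions for illustration, not part of the original code:

import os
import torch.optim as optim

class TinyDQNAgent(BaseAgent):  # hypothetical subclass, for illustration only
    def __init__(self, config, input_shape=(4, 84, 84), num_actions=6):
        super(TinyDQNAgent, self).__init__()
        self.model = DQN(input_shape, num_actions).to(config.device)
        self.target_model = DQN(input_shape, num_actions).to(config.device)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=config.LR)
        self.memory = ExperienceReplayMemory(config.EXP_REPLAY_SIZE)

agent = TinyDQNAgent(config)
os.makedirs('./saved_agents', exist_ok=True)  # save_w/load_w expect this directory
agent.save_w()   # writes model.dump and optim.dump
agent.load_w()   # reloads them and re-syncs the target network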