Building TextRNN for text classification in PyTorch, with a TextRNN + Attention comparison


Dataset source: .
Complete project code: .
The dataset is fairly large: 14 classes, with an average text length of about 900 tokens. I started out with a very simple RNN and immediately hit a problem: the model would not converge. Then I noticed that the baselines other people shared almost all truncate the text, to around 250 tokens.
So I truncated the text as well. The model started to converge a little, but after a few dozen epochs accuracy was still stuck at about 0.3.
Then I looked at someone else's TextRNN architecture and spotted one subtle difference: their LSTM had dropout in it. That confused me. Isn't dropout for preventing overfitting? My model hadn't even converged yet, so what was going on?
Since practice is the sole criterion of truth, I simply added dropout and reran the experiment. To my surprise the model converged quickly, and accuracy headed straight for 0.8. I had no words. What was happening?
So I searched around and found the following explanation:
An RNN can amplify noise, which in some cases hurts the model's ability to learn; adding dropout lets training selectively discard some of that useless information.
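For concreteness, here is a minimal sketch (not the code from this post; sizes are illustrative) of the two usual places dropout can go around an LSTM in PyTorch. One caveat worth knowing: nn.LSTM's dropout argument is only applied between stacked layers, so it needs num_layers >= 2 to have any effect; an explicit nn.Dropout on the LSTM output works even with a single layer.

import torch
import torch.nn as nn

class LstmWithDropout(nn.Module):
    def __init__(self, vocab_size=7000, emb_dim=100, hidden=100, n_class=14):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # dropout= only acts between LSTM layers, so it needs num_layers >= 2
        self.lstm = nn.LSTM(emb_dim, hidden, num_layers=2,
                            dropout=0.5, batch_first=True)
        # explicit dropout on the LSTM output works even with a single layer
        self.drop = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden, n_class)

    def forward(self, x):                   # x: [batch, seq_len] of token ids
        out, _ = self.lstm(self.embedding(x))
        out = self.drop(out[:, -1, :])      # last time step, then dropout
        return self.fc(out)

logits = LstmWithDropout()(torch.randint(0, 7000, (4, 32)))
print(logits.shape)                          # torch.Size([4, 14])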
That was an instant light-bulb moment. Why? Because at the time I was comparing against TextCNN: I had also put together a simple TextCNN for this news-classification dataset, and it converged quickly, with accuracy rising to around 0.8 in no time.
TextCNN effectively works in segments: it splits the text into many local pieces, so in a very long text the useless, distracting fragments simply get discarded by the CNN (via pooling). An RNN, by contrast, reads from left to right and never throws anything away; it keeps looping until the end of the sequence, and in the process it inevitably learns from the useless, distracting information as well, which keeps the model from converging.
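To make that intuition concrete, here is a minimal TextCNN-style sketch (illustrative only, not the TextCNN I actually ran): a 1-D convolution scores every local window of tokens, and global max pooling keeps only the strongest response per filter, so weak (noisy) windows never reach the classifier.

import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyTextCNN(nn.Module):
    def __init__(self, vocab_size=7000, emb_dim=100, n_filters=64, n_class=14):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        # each filter looks at a window of 3 consecutive tokens
        self.conv = nn.Conv1d(emb_dim, n_filters, kernel_size=3)
        self.fc = nn.Linear(n_filters, n_class)

    def forward(self, x):                       # x: [batch, seq_len]
        e = self.embedding(x).transpose(1, 2)   # [batch, emb_dim, seq_len]
        h = F.relu(self.conv(e))                # [batch, n_filters, seq_len - 2]
        # global max pooling: only the best-matching window per filter survives
        h = h.max(dim=2).values                 # [batch, n_filters]
        return self.fc(h)

print(TinyTextCNN()(torch.randint(0, 7000, (4, 256))).shape)  # torch.Size([4, 14])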
Summary
After repeated experiments, my conclusions are:
1. TextCNN is much more robust than TextRNN. Whether the text is long or short, it resists noise well and works well for ordinary text classification. However, for tasks that require analyzing more complex semantics, TextRNN beats TextCNN, because it can mine deeper information; pick the model according to the task.
2. Long text really tests an RNN's learning ability and puts high demands on the model. Splitting a long text into several short texts for training helps a lot (a small sketch follows below).
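Here is a minimal sketch of that chunking idea, assuming the text has already been converted to a list of integer token ids (the data-loading loop in the code below does the same thing inline). Each chunk keeps the label of the document it came from.

def chunk_tokens(tokens, max_len=256, min_tail=150, pad_id=0):
    """Split one long token list into max_len-sized training samples."""
    chunks = [tokens[i:i + max_len] for i in range(0, len(tokens), max_len)]
    # keep the leftover tail only if it is reasonably long, zero-padded to max_len
    if chunks and len(chunks[-1]) < max_len:
        tail = chunks.pop()
        if len(tail) >= min_tail:
            chunks.append(tail + [pad_id] * (max_len - len(tail)))
    return chunks

print([len(c) for c in chunk_tokens(list(range(700)))])  # [256, 256, 256]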
To get results quickly, the code below uses a scaled-down model, so accuracy does not reach 0.80; adjust the model structure yourself if you need better numbers.
Plain TextRNN code
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import KFold
from tqdm import tqdm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# -------------------------- Load the data ----------------------------
df = pd.read_csv('/新闻文本分类/train_set.csv', sep='\t')

vocabs_size = 0          # vocabulary size, computed while scanning the data
n_class = 14             # number of classes
training_step = 20       # number of epochs
batch_size = 256         # batch size

train = []
targets = []
label = df['label'].values
text = df['text'].values

# Split every document into 256-token chunks; each chunk keeps the document's label.
# A leftover tail is kept (zero-padded to 256) only if it has at least 150 tokens.
id = 0
for val in tqdm(text):
    s = val.split(' ')
    single_data = []
    for i in range(len(s)):
        vocabs_size = max(vocabs_size, int(s[i]) + 1)
        single_data.append(int(s[i]) + 1)
        if len(single_data) >= 256:
            train.append(single_data)
            targets.append(int(label[id]))
            single_data = []
    if len(single_data) >= 150:
        single_data = single_data + [0] * (256 - len(single_data))
        train.append(single_data)
        targets.append(int(label[id]))
    id += 1

train = np.array(train)
targets = np.array(targets)

class Bi_Lstm(nn.Module):
    def __init__(self):
        super(Bi_Lstm, self).__init__()
        self.embedding = nn.Embedding(vocabs_size, 100)
        # Note: nn.LSTM's dropout argument only acts between stacked layers,
        # so with num_layers=1 PyTorch warns that it has no effect.
        # Set bidirectional=True if you want the output size doubled.
        self.lstm = nn.LSTM(input_size=100, hidden_size=100, num_layers=1,
                            bidirectional=False, batch_first=True, dropout=0.5)
        self.l1 = nn.BatchNorm1d(100)
        self.l2 = nn.ReLU()
        self.l3 = nn.Linear(100, n_class)   # classification head
        self.l4 = nn.Dropout(0.3)
        self.l5 = nn.BatchNorm1d(n_class)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        # take the output of the last time step
        out = self.l1(out[:, -1, :])
        out = self.l2(out)
        out = self.l3(out)
        out = self.l4(out)
        out = self.l5(out)
        return out

print(train.shape)
print(targets.shape)

kf = KFold(n_splits=5, shuffle=True, random_state=2021)  # 5-fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
    print('-' * 15, '>', f'Fold {fold + 1}', '<', '-' * 15)
    x_train, x_val = train[train_idx], train[test_idx]
    y_train, y_val = targets[train_idx], targets[test_idx]
    M_train = len(x_train) - 1
    M_val = len(x_val)

    x_train = torch.from_numpy(x_train).to(torch.long).to(device)
    x_val = torch.from_numpy(x_val).to(torch.long).to(device)
    y_train = torch.from_numpy(y_train).to(torch.long).to(device)
    y_val = torch.from_numpy(y_val).to(torch.long).to(device)

    model = Bi_Lstm()
    model.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    loss_func = nn.CrossEntropyLoss()  # multi-class classification

    # start training
    for step in range(training_step):
        print('step=', step)
        L_val = -batch_size
        with tqdm(np.arange(0, M_train, batch_size), desc='') as tbar:
            for index in tbar:
                L = index
                R = min(M_train, index + batch_size)
                L_val += batch_size
                L_val %= M_val
                R_val = min(M_val, L_val + batch_size)
                # ----------------- training step -------------------
                train_pre = model(x_train[L:R])      # forward pass on one training batch
                train_loss = loss_func(train_pre, y_train[L:R])
                val_pre = model(x_val[L_val:R_val])  # validate in batches too, otherwise memory blows up
                val_loss = loss_func(val_pre, y_val[L_val:R_val])
                # ----------------- compute accuracy ----------------
                train_acc = np.sum(np.argmax(np.array(train_pre.data.cpu()), axis=1) == np.array(y_train[L:R].data.cpu())) / (R - L)
                val_acc = np.sum(np.argmax(np.array(val_pre.data.cpu()), axis=1) == np.array(y_val[L_val:R_val].data.cpu())) / (R_val - L_val)
                # --------------- show them on the progress bar ------
                tbar.set_postfix(train_loss=float(train_loss.data.cpu()), train_acc=train_acc,
                                 val_loss=float(val_loss.data.cpu()), val_acc=val_acc)
                # ----------------- backpropagation ------------------
                optimizer.zero_grad()   # clear gradients from the previous step
                train_loss.backward()   # backpropagate the training loss
                optimizer.step()        # apply the parameter update
    del model
Training results
TextRNN with Attention code
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import KFold
from tqdm import tqdm

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# -------------------------- Load the data ----------------------------
df = pd.read_csv('/新闻文本分类/train_set.csv', sep='\t')

vocabs_size = 0          # vocabulary size, computed while scanning the data
n_class = 14             # number of classes
training_step = 20       # number of epochs
batch_size = 256         # batch size

train = []
targets = []
label = df['label'].values
text = df['text'].values

# Same chunking as before: 256-token chunks, tails of at least 150 tokens are zero-padded.
id = 0
for val in tqdm(text):
    s = val.split(' ')
    single_data = []
    for i in range(len(s)):
        vocabs_size = max(vocabs_size, int(s[i]) + 1)
        single_data.append(int(s[i]) + 1)
        if len(single_data) >= 256:
            train.append(single_data)
            targets.append(int(label[id]))
            single_data = []
    if len(single_data) >= 150:
        single_data = single_data + [0] * (256 - len(single_data))
        train.append(single_data)
        targets.append(int(label[id]))
    id += 1

train = np.array(train)
targets = np.array(targets)

class Bi_Lstm(nn.Module):
    def __init__(self):
        super(Bi_Lstm, self).__init__()
        self.embedding = nn.Embedding(vocabs_size, 100)
        # As above: dropout only acts between stacked layers (num_layers > 1),
        # and bidirectional=True would double the output size.
        self.lstm = nn.LSTM(input_size=100, hidden_size=100, num_layers=1,
                            bidirectional=False, batch_first=True, dropout=0.5)
        self.l1 = nn.BatchNorm1d(100)
        self.l2 = nn.ReLU()
        self.l3 = nn.Linear(100, n_class)   # classification head
        self.l4 = nn.Dropout(0.3)
        self.l5 = nn.BatchNorm1d(n_class)

    def attention_net(self, lstm_output, final_state):
        # lstm_output: [batch_size, seq_len, n_hidden]
        # final_state: [num_layers * num_directions (=1), batch_size, n_hidden]
        batch_size = len(lstm_output)
        hidden = final_state.view(batch_size, -1, 1)               # [batch_size, n_hidden, 1]
        attn_weights = torch.bmm(lstm_output, hidden).squeeze(2)   # [batch_size, seq_len]
        soft_attn_weights = F.softmax(attn_weights, 1)
        # context: [batch_size, n_hidden]
        context = torch.bmm(lstm_output.transpose(1, 2), soft_attn_weights.unsqueeze(2)).squeeze(2)
        return context, soft_attn_weights

    def forward(self, x):
        x = self.embedding(x)
        out, (final_hidden_state, final_cell_state) = self.lstm(x)
        # instead of taking only the last time step, pool all time steps with attention
        attn_output, attention = self.attention_net(out, final_hidden_state)
        out = self.l3(attn_output)
        out = self.l4(out)
        out = self.l5(out)
        return out

print(train.shape)
print(targets.shape)

kf = KFold(n_splits=5, shuffle=True, random_state=2021)  # 5-fold cross-validation
for fold, (train_idx, test_idx) in enumerate(kf.split(train, targets)):
    print('-' * 15, '>', f'Fold {fold + 1}', '<', '-' * 15)
    x_train, x_val = train[train_idx], train[test_idx]
    y_train, y_val = targets[train_idx], targets[test_idx]
    M_train = len(x_train) - 1
    M_val = len(x_val)
    # ... the rest of the training loop is identical to the plain TextRNN version above.
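As a quick sanity check on attention_net, here is a standalone run on random tensors with the same shapes the model above produces (assuming batch size 4, sequence length 256, hidden size 100, one layer, one direction): the final hidden state acts as a query over every time step's output, and the softmax weights pool those outputs into a single 100-dimensional context vector.

import torch
import torch.nn.functional as F

batch, seq_len, hidden = 4, 256, 100
lstm_output = torch.randn(batch, seq_len, hidden)         # outputs of all time steps
final_state = torch.randn(1, batch, hidden)               # h_n for 1 layer, 1 direction

query = final_state.view(batch, hidden, 1)                # [batch, hidden, 1]
attn_weights = torch.bmm(lstm_output, query).squeeze(2)   # [batch, seq_len]
soft = F.softmax(attn_weights, dim=1)                     # attention over time steps
context = torch.bmm(lstm_output.transpose(1, 2), soft.unsqueeze(2)).squeeze(2)
print(context.shape)                                      # torch.Size([4, 100])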
