基于pytorch使用BI-LSTM模型做中文文本分类
主要涉及使用pandas从csv文件读取文本数据,将dataframe格式数据转为torchtext的iterator,供模型训练和预测使用。
import re

import numpy as np
import pandas as pd
import torch
import torch.utils.data as data_utils
from sklearn.model_selection import train_test_split
from torch import nn
from torchtext.legacy import data
from torchtext.legacy import datasets, vocab
from tqdm import tqdm
1.读取数据与预处理
数据格式为csv,按逗号分隔,主要分为complain_content、label两列,其中complain_content为分词后的中文文本。
# Load the cleaned complaint data; the columns used below are
# `complain_content` and `label`.
df = pd.read_csv("/home/jovyan/work/NLP/kesu_cate/data/kesu_clean_data.csv", encoding='utf-8')

# Strip the list punctuation ("[", "]", "'") and every space from each cell,
# then re-join the comma-separated tokens with single spaces so the text can
# later be tokenised with a plain split on ' '.
# NOTE(review): presumably each cell is the string form of a Python list of
# tokens (e.g. "['a', 'b']") -- confirm against the CSV.
df["complain_content"] = df["complain_content"].apply(
    lambda x: ' '.join(
        x.replace('[', '').replace(']', '').replace("'", '').replace(' ', '').split(',')
    )
)
df["label"] = df["label"].astype(float)

# Split into training, test and validation sets: 20% is held out for testing,
# then 20% of the remainder is held out for validation.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=1024)
train_data, valid_data = train_test_split(train_data, test_size=0.2, random_state=1024)
2.使用torchtext处理dataframe数据,生成词典,并创建iterator
def tokenize(x):
    """Split an already-segmented, space-joined sentence into its tokens."""
    return x.split(' ')


# Text field: variable-length token sequences run through `tokenize`.
TEXT = data.Field(tokenize=tokenize, sequential=True)
# Label field: a single value per example that bypasses the vocabulary.
LABEL = data.Field(sequential=False, use_vocab=False)
# Dataset wrapper class
class DatasetProcess(data.Dataset):
    """torchtext dataset built from a DataFrame.

    One example is created per row from the `complain_content` and `label`
    columns of ``df``.

    Args:
        df: DataFrame providing the `complain_content` and `label` columns.
        text_tield: Field used to process `complain_content`.
        label_field: Field used to process `label`.
    """

    def __init__(self, df, text_tield, label_field):
        fields = [('complain_content', text_tield), ("label", label_field)]
        # `total` lets tqdm show real progress; zip() has no length of its own.
        rows = tqdm(zip(df['complain_content'], df['label']), total=len(df))
        examples = [
            data.Example.fromlist([text, label], fields)
            for text, label in rows
        ]
        super().__init__(examples, fields)


# Backward-compatible alias: existing call sites use this spelling.
DatatProcess = DatasetProcess
# Wrap the three DataFrame splits as torchtext datasets.
train_data = DatatProcess(train_data, TEXT, LABEL)
valid_data = DatatProcess(valid_data, TEXT, LABEL)
test_data = DatatProcess(test_data, TEXT, LABEL)

# Load the Tencent NLP pre-trained embeddings and build the vocabulary.
# NOTE(review): the vectors file name was truncated in the source; the
# trailing "-v0.2.0-s.txt" is reconstructed from the directory name -- confirm.
vectors = vocab.Vectors(name='/home/jovyan/work/NLP/pytoch/.vector_cache/tencent-ailab-embedding-zh-d100-v0.2.0-s/tencent-ailab-embedding-zh-d100-v0.2.0-s.txt')
# Only the training split contributes to the vocabulary; tokens without a
# pre-trained vector are initialised from a normal distribution.
TEXT.build_vocab(train_data, max_size=25000, vectors=vectors, unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

# Bucket examples by token count when forming batches.
train_iter, valid_iter, test_iter = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=256,
    device='cpu',
    sort_key=lambda x: len(x.complain_content),
    sort_within_batch=False,
    repeat=False,
)
3.构建BI-LSTM模型
# Hyper-parameters
vocab_size = len(TEXT.vocab)
# NOTE(review): presumably must equal the pre-trained vector width (the
# vectors path contains "d100") -- confirm.
embedding_size = 100
output_size = 1  # one logit per example
num_layers = 2
dropout = 0.3
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
hidden_size = 100
device = 'cpu'
# Model definition
class RNNModel(nn.Module):
    """Bidirectional LSTM classifier over token-id sequences.

    Token ids are embedded, fed through a bidirectional (optionally stacked)
    LSTM, and the final hidden states of the last layer's two directions are
    concatenated and projected to ``output_size`` values.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each embedding vector.
        output_size: Number of values produced per example.
        hidden_size: LSTM hidden width per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings and to the
            final hidden states.
        pad_idx: Index of the padding token in the embedding table.
    """

    def __init__(self, vocab_size, embedding_size, output_size, hidden_size,
                 num_layers, dropout, pad_idx):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, embedding_size, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_size, hidden_size, bidirectional=True,
                            num_layers=num_layers)
        # The classifier input is the concatenation of exactly two hidden
        # states (one per direction), regardless of how many layers are stacked.
        self.linear = nn.Linear(hidden_size * 2, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return a ``(batch, output_size)`` tensor for ``text``.

        ``text`` holds token ids laid out as ``(seq_len, batch)``, the default
        layout of ``nn.LSTM`` when ``batch_first`` is not set.
        """
        embeded = self.dropout(self.embed(text))
        output, (hidden, cell) = self.lstm(embeded)
        # No squeeze here: it would drop the batch dimension when batch == 1.
        hidden = self.dropout(hidden)
        # hidden[-1] / hidden[-2] are the last layer's two directions.
        hidden = torch.cat([hidden[-1], hidden[-2]], dim=1)
        return self.linear(hidden)
# Metric function and training / validation helpers
def binary_accuracy(preds, y):
    """Return the fraction of predictions that equal the labels ``y``.

    ``preds`` are logits: they are passed through a sigmoid and rounded to
    0/1 before being compared element-wise with ``y``. The result is a
    0-dimensional tensor.
    """
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    # numel() rather than len(): also valid for 0-d input (a single example).
    return correct.sum() / correct.numel()
def train(model, iterator, optimizer, crit):
    """Run one training pass over ``iterator``.

    Args:
        model: Module called on ``batch.complain_content``.
        iterator: Iterable of batches exposing ``complain_content`` and ``label``.
        optimizer: Optimizer stepped once per batch.
        crit: Loss callable taking ``(preds, float_labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each weighted by batch size.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no batches.
    """
    epoch_loss, epoch_acc = 0., 0.
    total_len = 0.
    # evaluate() switches the model to eval mode; switch back so dropout is
    # active again on every training pass.
    model.train()
    for batch in iterator:
        # Squeeze only the trailing dimension so a batch of one example keeps
        # its batch dimension and still lines up with the labels.
        preds = model(batch.complain_content).squeeze(-1)
        loss = crit(preds, batch.label.float())
        acc = binary_accuracy(preds, batch.label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        batch_len = len(batch.label)
        epoch_loss += loss.item() * batch_len
        epoch_acc += acc.item() * batch_len
        total_len += batch_len
        # Running totals after this batch (progress output).
        print(epoch_loss, epoch_acc, batch_len)
    return epoch_loss / total_len, epoch_acc / total_len
def evaluate(model, iterator, crit):
    """Compute loss and accuracy over ``iterator`` without updating the model.

    The model is switched to evaluation mode and is left in that mode on
    return.

    Args:
        model: Module called on ``batch.complain_content``.
        iterator: Iterable of batches exposing ``complain_content`` and ``label``.
        crit: Loss callable taking ``(preds, float_labels)``.

    Returns:
        ``(mean_loss, mean_accuracy)``, each weighted by batch size.

    Raises:
        ZeroDivisionError: If ``iterator`` yields no batches.
    """
    epoch_loss, epoch_acc = 0., 0.
    model.eval()
    total_len = 0.
    # No gradients are needed here; skipping them saves memory and time.
    with torch.no_grad():
        for batch in iterator:
            # Squeeze only the trailing dimension so a batch of one example
            # keeps its batch dimension.
            preds = model(batch.complain_content).squeeze(-1)
            loss = crit(preds, batch.label.float())
            acc = binary_accuracy(preds, batch.label)
            batch_len = len(batch.label)
            epoch_loss += loss.item() * batch_len
            # Accumulated exactly once per batch so the mean stays in [0, 1].
            epoch_acc += acc.item() * batch_len
            total_len += batch_len
    return epoch_loss / total_len, epoch_acc / total_len
# Build the model from the hyper-parameters.
model = RNNModel(vocab_size, embedding_size, output_size, hidden_size, num_layers, dropout, pad_idx)

# Initialise the embedding table from the pre-trained vectors.
# NOTE(review): the source computes `pretrained_embedding` and `unk_idx` but
# never uses them; the copy and the zeroing of the <unk>/<pad> rows below are
# reconstructed -- confirm against the original script.
pretrained_embedding = TEXT.vocab.vectors
model.embed.weight.data.copy_(pretrained_embedding)
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
model.embed.weight.data[unk_idx] = torch.zeros(embedding_size)
model.embed.weight.data[pad_idx] = torch.zeros(embedding_size)

# NOTE(review): `optimizer` and `crit` are used by the training loop but are
# not defined anywhere in the source. The model emits logits (a sigmoid is
# applied only in the metric and at prediction time), hence BCE-with-logits;
# the optimizer choice is an assumption -- confirm.
optimizer = torch.optim.Adam(model.parameters())
crit = nn.BCEWithLogitsLoss()
model = model.to(device)
crit = crit.to(device)
# Train the model, checkpointing whenever validation accuracy improves.
n_epoch = 10
best_valid_acc = 0.
for epoch in range(n_epoch):
    train_loss, train_acc = train(model, train_iter, optimizer, crit)
    valid_loss, valid_acc = evaluate(model, valid_iter, crit)

    # Keep only the weights of the best epoch seen so far.
    if best_valid_acc < valid_acc:
        torch.save(model.state_dict(), 'lstm_model_cn.pth')
        best_valid_acc = valid_acc

    print("Epoch", epoch, "Train loss", train_loss, "Train Acc", train_acc)
    print("Epoch", epoch, "valid_loss", valid_loss, "valid Acc", valid_acc)
# Test the model
def removepad(arr):
    """Return a new list holding the items of ``arr`` that are not ``'<pad>'``."""
    return [x for x in arr if x != '<pad>']
def predict(test_iter):
    """Print the predicted probability and de-padded text of every example.

    Uses the module-level ``model`` and ``TEXT``. For each example in each
    batch of ``test_iter`` one line is printed: the sigmoid of the model
    output, followed by the tokens joined with spaces and stripped of
    ``'<pad>'``.
    """
    # Dropout must be off so that predictions are deterministic.
    model.eval()
    with torch.no_grad():
        for batch in test_iter:
            pred = torch.sigmoid(model(batch.complain_content))
            # Transpose once per batch so each row holds one example's token ids.
            rows = batch.complain_content.t()
            for score, index in zip(pred, rows):
                tmp = [TEXT.vocab.itos[x] for x in index]
                q = ' '.join(removepad(tmp))
                print(score.item(), q)
参考链接