TextCNN (Text Classification)


The TextCNN network structure is shown below: an embedding layer, several parallel convolutions with different kernel heights, max-pooling over each feature map, concatenation, and a fully connected classifier.

[Figure: TextCNN network structure]
The basic workflow for text classification with TextCNN (taking sentence classification as an example):
(1) Split each sentence into words and build a vocabulary from them
(2) Convert the words to vectors (word2vec, GloVe, BERT, nn.Embedding)
(3) Zero-pad the sentences so they all have the same length
(4) Build the TextCNN model, train it, and test it
Below is an example of TextCNN following this workflow. A few caveats:
1. The predictions are not very good, because there are so few sentences.
2. No sophisticated word2vec model is used.
3. With so few sentences, there is no eval function.
import torch
import torch.nn as nn
import torch.nn.functional as F
sentence = [['The movie is great'], ['Tom has a headache today'],
            ['I think the apple is bad'], ['You are so beautiful']]
Label = [1, 0, 0, 1]
test_sentence = ['The apple is great']
class sentence2id:
    def __init__(self, sentence):
        self.sentence = sentence
        self.dic = {}
        self.words = []
        self.words_num = 0

    def sen2sen(self, sentence):  ## uppercase -> lowercase
        sen = []
        if type(sentence[0]) == list:
            for s in sentence:
                sen.append([s[0].lower()])
        else:  # a flat list: wrap it, then recurse to lowercase
            sen.append(sentence)
            sen = self.sen2sen(sen)
        return sen

    def countword(self):  ## count the distinct words
        ### [the vocabulary is built from the training sentences only, not the test set] ###
        for s in self.sentence:
            for word in s[0].split(' '):  ## split on spaces
                self.words.append(word.lower())
        self.words = sorted(list(set(self.words)))
        self.words_num = len(self.words)
        return self.words, self.words_num

    def word2id(self):  ### build the vocabulary: word -> id, ids start at 1 (0 is the pad)
        flag = 1
        for word in self.words:
            if flag <= self.words_num:
                self.dic[word] = flag
                flag += 1
        #print(self.dic)
        return self.dic

    def sen2id(self, sentence):  ### sentence -> list of word ids
        sentence = self.sen2sen(sentence)
        sentoid = []
        for s in sentence:
            sen = []
            for word in s[0].split():
                sen.append(self.dic[word])
            sentoid.append(sen)
        return sentoid
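
A quick sanity check of sentence2id on the toy corpus above (my addition, not in the original post; the ids simply follow sorted vocabulary order):

vocab = sentence2id(sentence)
words, words_num = vocab.countword()
vocab.word2id()
print(words_num)                               # 17 distinct words
print(vocab.sen2id([['The movie is great']]))  # [[13, 11, 10, 6]]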
def padded(sentence, pad_token, max_len=None):  # pad_token: the id of '<pad>', here 0
    # pad every sentence to max_len; when max_len is not given, use the
    # longest sentence in this batch (the test batch must reuse the
    # training max_len, see the main block below)
    if max_len is None:
        max_len = max(len(s) for s in sentence)
    for s in sentence:
        for _ in range(max_len - len(s)):
            s.append(pad_token)
    return sentence
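
For example (my addition; the max_len keyword is the parameter added above):

print(padded([[1, 2], [3, 4, 5]], 0))  # [[1, 2, 0], [3, 4, 5]]
print(padded([[1, 2]], 0, max_len=4))  # [[1, 2, 0, 0]]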
class ModelEmbeddings(nn.Module):
    def __init__(self, words_num, embed_size, pad_token):
        super(ModelEmbeddings, self).__init__()
        self.words_num = words_num
        self.Embedding = nn.Embedding(words_num, embed_size, padding_idx=pad_token)
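
A quick shape check (my addition; the numbers are just illustrative):

emb = ModelEmbeddings(words_num=18, embed_size=10, pad_token=0)
ids = torch.LongTensor([[1, 2, 0]])  # one sentence of length 3, 0 = pad
print(emb.Embedding(ids).size())     # torch.Size([1, 3, 10])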
class textCNN(nn.Module):
    def __init__(self, words_num, embed_size, class_num, dropout_rate=0.1):
        super(textCNN, self).__init__()
        self.words_num = words_num
        self.embed_size = embed_size
        self.class_num = class_num
        # kernel heights 2/3/4; with max_len = 6 the conv outputs have
        # lengths 5/4/3, which the pooling layers below reduce to 1
        self.conv1 = nn.Conv2d(1, 3, (2, embed_size))
        self.conv2 = nn.Conv2d(1, 3, (3, embed_size))
        self.conv3 = nn.Conv2d(1, 3, (4, embed_size))
        self.max_pool1 = nn.MaxPool1d(5)
        self.max_pool2 = nn.MaxPool1d(4)
        self.max_pool3 = nn.MaxPool1d(3)
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(3*3*1, class_num)
        # 3 -> out_channels, 3 -> number of kernel sizes, 1 -> max_pool output length

    def forward(self, sen_embed):              # (batch, max_len, embed_size)
        sen_embed = sen_embed.unsqueeze(1)     # (batch, in_channels=1, max_len, embed_size)
        conv1 = F.relu(self.conv1(sen_embed))  # (batch, out_channels, max_len-1, 1)
        conv2 = F.relu(self.conv2(sen_embed))
        conv3 = F.relu(self.conv3(sen_embed))
        conv1 = torch.squeeze(conv1, dim=3)
        conv2 = torch.squeeze(conv2, dim=3)
        conv3 = torch.squeeze(conv3, dim=3)
        x1 = self.max_pool1(conv1)             # (batch, out_channels, 1)
        x2 = self.max_pool2(conv2)
        x3 = self.max_pool3(conv3)
        x = torch.cat((x1, x2), dim=1)
        x = torch.cat((x, x3), dim=1).squeeze(dim=2)  # (batch, 3*out_channels)
        output = self.linear(self.dropout(x))
        return output
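
A minimal shape trace of the forward pass (my addition; it assumes max_len = 6 and embed_size = 10, which is what the toy data below produces):

net = textCNN(words_num=17, embed_size=10, class_num=2)
dummy = torch.randn(1, 6, 10)  # (batch, max_len, embed_size)
print(net(dummy).size())       # torch.Size([1, 2])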
def train(model, sentence, label):
    optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()
    print("-" * 80)
    for epoch in range(1, 2):  ## a single epoch is enough for this toy example
        for step, x in enumerate(torch.split(sentence, 1, dim=0)):  # batch size 1
            target = torch.tensor([label[step]], dtype=torch.long)
            optimizer.zero_grad()
            output = model(x)
            loss = criterion(output, target)
            # the embeddings are computed once outside this function and their
            # graph is shared by every slice, so it must be retained across steps
            loss.backward(retain_graph=True)
            optimizer.step()
            if step % 2 == 0:
                result = torch.max(output, 1)[1].view(target.size())
                corrects = (result.data == target.data).sum()
                accuracy = corrects * 100.0 / 1  #### 1 is the batch size
                print('Epoch:', epoch, 'step:', step,
                      '- loss: %.6f' % loss.data.item(),
                      'acc: %.4f' % accuracy)
    return model
if __name__ == '__main__':
    ## build the vocabulary and convert sentences to id sequences
    test = sentence2id(sentence)
    word, words_num = test.countword()
    test.word2id()
    sen_train = test.sen2id(sentence)
    sen_test = test.sen2id(test_sentence)
    X_train = torch.LongTensor(padded(sen_train, 0))
    # pad the test batch to the training max_len, otherwise the fixed
    # pooling sizes in textCNN would not fit
    X_test = torch.LongTensor(padded(sen_test, 0, max_len=X_train.size(1)))
    ## embed: words_num + 1 because ids start at 1 and 0 is the pad index
    Embedding = ModelEmbeddings(words_num + 1, 10, 0)
    X_train_embed = Embedding.Embedding(X_train)
    X_test_embed = Embedding.Embedding(X_test)
    print(X_train_embed.size())
    #print(X_test_embed.size())
    ## TextCNN
    textcnn = textCNN(words_num, 10, 2)
    model = train(textcnn, X_train_embed, Label)
    print(torch.max(model(X_test_embed), 1)[1])
When building the convolution layers, nn.ModuleList() could have been used; since I'm not yet comfortable with it, I wrote the three layers out by hand. I'll fill this in properly once I've learned it, but a rough sketch follows below.
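A minimal sketch of that nn.ModuleList variant (my addition, not the original author's code; it keeps out_channels = 3 and kernel heights 2/3/4, but pools each feature map down to length 1 whatever max_len is, so the fixed MaxPool1d sizes disappear):

class textCNN2(nn.Module):
    def __init__(self, embed_size, class_num, out_channels=3,
                 kernel_heights=(2, 3, 4), dropout_rate=0.1):
        super().__init__()
        self.convs = nn.ModuleList(
            [nn.Conv2d(1, out_channels, (k, embed_size)) for k in kernel_heights])
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(out_channels * len(kernel_heights), class_num)

    def forward(self, sen_embed):           # (batch, max_len, embed_size)
        x = sen_embed.unsqueeze(1)          # (batch, 1, max_len, embed_size)
        pooled = []
        for conv in self.convs:
            c = F.relu(conv(x)).squeeze(3)  # (batch, out_channels, L)
            pooled.append(F.max_pool1d(c, c.size(2)).squeeze(2))  # global max pool
        return self.linear(self.dropout(torch.cat(pooled, dim=1)))

textCNN2(10, 2) would then stand in for textCNN(words_num, 10, 2) in the main block above (it no longer needs words_num).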
