Word2Vec Study Notes (Skip-gram Method)
Reference: https://wmathor.com/index.php/archives/1443/
import torch
import numpy as np
import unicodedata
import string
import re
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
Dictionary class
class Dictionary:
    def __init__(self, name):
        self.name = name
        self.words = []

    def addWord(self, word):
        # Record each word only once
        if word not in self.words:
            self.words.append(word)

    def getSize(self):
        return len(self.words)
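A quick illustration of how the class behaves (the sample words here are made up); note that a word's position in words doubles as its integer id later on:

d = Dictionary('demo')
for w in ['hello', 'world', 'hello']:
    d.addWord(w)
print(d.getSize())             # 2 -- duplicate words are stored once
print(d.words.index('world'))  # 1 -- the list position serves as the word id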
String conversion functions that strip punctuation and accents and convert everything to lowercase
(following the tutorial in the PyTorch Chinese documentation)
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalizeString(s):
    s = unicodeToAscii(s.lower())
    s = re.sub(r"([.!?])", r" ", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    s = s.strip()
    return s
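A quick sanity check of the normalizer (the sample input is made up): accents are stripped, punctuation is dropped, and the result is lowercase:

print(normalizeString('Héllo,   World!!'))  # -> 'hello world'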
Read the strings from a file
lines = []
with open('data\\news.txt', encoding='utf-8') as f:
    phrases = f.read().strip().split('\n')
    for phrase in phrases:
        # Split each line into sentences and tokenize the non-empty ones
        phrase = phrase.strip().split('.')
        for p in phrase:
            if p != '':
                lines += normalizeString(p).split(' ')
Build centre/context word pairs
engDictionary = Dictionary('English')
for word in lines:
    engDictionary.addWord(word)

def make_list(lines):
    words_list = []
    # Window size 2: pair each centre word with the two words on either side
    for i in range(2, len(lines) - 2):
        centre = engDictionary.words.index(lines[i])
        context = []
        for t in range(i - 2, i + 3):
            if t != i:
                context.append(engDictionary.words.index(lines[t]))
        for w in context:
            words_list.append([centre, w])
    return words_list

words_list = make_list(lines)
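To sanity-check the pairing, we can decode the first few pairs back into words; with a window of 2, lines[2] should come out paired with its four neighbours (a small inspection snippet I added, not part of the original flow):

for centre, ctx in words_list[:4]:
    print(engDictionary.words[centre], '->', engDictionary.words[ctx])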
Convert to training data
Note: build the one-hot vectors as NumPy ndarrays; passing the values straight to torch.tensor raises an error
input_data = []
output_data = []

def make_data(words_list):
    for w in words_list:
        # One-hot encode the centre word; the context word index is the label
        k = np.zeros(engDictionary.getSize())
        k[w[0]] = 1
        input_data.append(k)
        output_data.append(w[1])

make_data(words_list)
input_data = torch.Tensor(input_data)
output_data = torch.LongTensor(output_data)
dataset = data.TensorDataset(input_data, output_data)
dataloader = data.DataLoader(dataset, batch_size=8, shuffle=True)
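As an aside, newer PyTorch versions can build the one-hot inputs without the NumPy detour via torch.nn.functional.one_hot; this is just an alternative sketch, not what the code above uses:

import torch.nn.functional as F

centres = torch.LongTensor([w[0] for w in words_list])
contexts = torch.LongTensor([w[1] for w in words_list])
one_hot_inputs = F.one_hot(centres, num_classes=engDictionary.getSize()).float()
alt_dataset = data.TensorDataset(one_hot_inputs, contexts)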
Model
Note: no softmax is needed at the end; CrossEntropyLoss applies softmax itself
class Skip_gram(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Skip_gram, self).__init__()
        self.hidden_size = hidden_size
        # W projects the one-hot input to the hidden embedding,
        # V projects the embedding back to vocabulary-sized scores
        self.W = nn.Linear(input_size, hidden_size, bias=False)
        self.V = nn.Linear(hidden_size, input_size, bias=False)

    def forward(self, x):
        x = self.W(x)
        x = self.V(x)
        return x

model = Skip_gram(engDictionary.getSize(), 10)
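To see why the final softmax can be dropped, recall that CrossEntropyLoss is equivalent to log-softmax followed by NLLLoss, so a softmax layer in the model would be redundant. A minimal self-contained check (random data, purely illustrative):

logits = torch.randn(4, 10)              # 4 samples, 10 classes, raw scores
labels = torch.LongTensor([1, 0, 9, 3])  # class indices, not one-hot vectors
a = nn.CrossEntropyLoss()(logits, labels)
b = nn.NLLLoss()(torch.log_softmax(logits, dim=1), labels)
print(torch.allclose(a, b))              # True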
Training
Notes: 1. Use Adam as the optimizer rather than SGD
2. CrossEntropyLoss takes the raw score (logit) vector as one input, and the class label as an integer index, not a vector, as the other
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train(epochs):
    for epoch in range(1, epochs + 1):
        sumloss = 0
        for x, y in dataloader:
            optimizer.zero_grad()
            o = model(x)
            loss = criterion(o, y)
            sumloss += loss.item()
            loss.backward()
            optimizer.step()
        if epoch % 100 == 0:
            print('epoch {}: loss:{:.2f}'.format(epoch, sumloss))

train(1000)
Save the model weights
PATH = './word2vec.pth'
torch.save(model.state_dict(), PATH)
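To use the trained vectors later, reload the weights and read a word's embedding off W. Since W is a bias-free Linear layer, a one-hot input at index i selects column i of W.weight, so that column is word i's vector (word_vector is a helper name I made up for this sketch):

model2 = Skip_gram(engDictionary.getSize(), 10)
model2.load_state_dict(torch.load(PATH))
model2.eval()

def word_vector(word):
    i = engDictionary.words.index(word)
    # W.weight has shape (hidden_size, vocab_size); column i is word i's embedding
    return model2.W.weight[:, i].detach()

print(word_vector(engDictionary.words[0]))  # vector of the first word in the corpus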