Joint Learning of Named Entity Recognition and Relation Extraction Based on CNN-LSTM
Word2vec
Steps for training Word2vec with Gensim
1 Preprocess the corpus: one document or sentence per line, with tokens separated by spaces. English text usually needs no extra segmentation, since words are already separated by spaces; Chinese corpora must be segmented with a word-segmentation tool (common tools include StanfordNLP, ICTCLAS, Ansj, FudanNLP, HanLP, jieba, etc.);
2 Convert the raw training corpus into an iterator of sentences, where each iteration yields a sentence as a list of words (UTF-8). This can be done with the LineSentence() class in Gensim's word2vec.py;
3 Feed the result of the steps above into Gensim's built-in Word2Vec object for training:
import os
import json
from gensim.models.word2vec import LineSentence, Word2Vec

# Convert the JSON corpus into raw text, one sentence per line
def func(fin, fout):
    for line in fin:
        line = line.strip()
        if not line:
            continue
        sentence = json.loads(line)
        sentence = sentence["sentText"].strip().strip('"').lower()
        fout.write(sentence + '\n')

def make_corpus():
    with open('data/NYT_', 'wt', encoding='utf-8') as fout:
        with open('data/NYT_CoType/train.json', 'rt', encoding='utf-8') as fin:
            func(fin, fout)
        with open('data/NYT_CoType/test.json', 'rt', encoding='utf-8') as fin:
            func(fin, fout)

if __name__ == "__main__":
    if not os.path.exists('data/NYT_'):
        make_corpus()
    sentences = LineSentence('data/NYT_')
    '''
    (1) size: dimensionality of the word vectors, default 100; usually chosen according to the size of the corpus.
    (2) workers: number of worker threads used to parallelise training.
    (3) sg: training algorithm, 0 for CBOW, 1 for skip-gram.
    (4) iter: maximum number of epochs for stochastic gradient descent, default 5; can be increased for large corpora.
    (5) negative: number of negative samples when using Negative Sampling, default 5; values in [3, 10] are recommended.
    '''
    model = Word2Vec(sentences, sg=1, size=300, workers=4, iter=8, negative=8)
    word_vectors = model.wv
    word_vectors.save('data/NYT_CoType/word2vec')
    word_vectors.save_word2vec_format('data/NYT_', fvocab='data/NYT_')
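To sanity-check the trained embeddings later, they can be reloaded with Gensim's KeyedVectors; a minimal sketch (the query word is only an example):

from gensim.models import KeyedVectors

word_vectors = KeyedVectors.load('data/NYT_CoType/word2vec')
print(word_vectors.most_similar('president', topn=5))  # example query word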
Tagging scheme
Following the annotation described in the paper from the Chinese Academy of Sciences, an example is given below:
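The original illustration is not reproduced in this copy; the following is a hedged example consistent with the tag format built by make_tag_set below (position-relation-role, where the position is one of B/I/E/S, the role 1/2 marks the first/second entity of the relation, and all other words are tagged O), assuming the NYT relation label /business/person/company:

Steve   -> B-/business/person/company-1
Jobs    -> E-/business/person/company-1
founded -> O
Apple   -> S-/business/person/company-2
in      -> O
1976    -> O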
The preprocessing mainly involves a few points:
1 Convert everything to lowercase.
2 Pad sequences that are shorter than the maximum length.
3 Normalize tokens carrying diacritics (e.g. some French words) by converting them to a decomposed Unicode form and then back to ASCII (see the sketch after this list).
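A minimal sketch of point 3, using Python's unicodedata module (the same normalize('NFKD', ...) call appears in the preprocessing code below):

from unicodedata import normalize

# Decompose accented characters into base character + combining mark,
# then drop everything non-ASCII: 'café' -> 'cafe', 'Señor' -> 'Senor'
ascii_text = normalize('NFKD', 'café').encode('ascii', 'ignore').decode()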
import json
from unicodedata import normalize

# MAX_SENT_LENGTH, MAX_TOKEN_LENGTH, prepare_sequence and search are defined
# elsewhere in the preprocessing script (see the sketches further below).

def make_tag_set(tag_set, relation_label):
    '''
    make_tag_set(tag_set, relation_mention["label"])
    '''
    if relation_label == "None":
        return
    for pos in "BIES":
        for role in "12":
            tag_set.add("-".join([pos, relation_label, role]))  # pos-relation_label-role
def update_tag_seq(em_text, sentence_text, relation_label, role, tag_set, tags_idx):
    '''
    res1 = update_tag_seq(em1_text, sentence_text, relation_mention["label"], 1, tag_set, tags_idx)
    B-begin, I-inside, O-outside, E-end, S-single
    Everything below operates at the word level.
    '''
    overlap = False
    start = search(em_text, sentence_text)  # word index of the mention's first word
    tag = "-".join(["S", relation_label, str(role)])
    if len(em_text) == 1:
        if tags_idx[start] != tag_set["O"]:
            overlap = True
        tags_idx[start] = tag_set[tag]
    else:
        tag = "B" + tag[1:]
        if tags_idx[start] != tag_set["O"]:
            overlap = True
        tags_idx[start] = tag_set[tag]
        tag = "E" + tag[1:]
        end = start + len(em_text) - 1
        if tags_idx[end] != tag_set["O"]:
            overlap = True
        tags_idx[end] = tag_set[tag]
        tag = "I" + tag[1:]
        for index in range(start + 1, end):
            if tags_idx[index] != tag_set["O"]:
                overlap = True
            tags_idx[index] = tag_set[tag]
    return overlap
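The helper search(em_text, sentence_text) used above is not included in this excerpt; a minimal sketch, assuming it returns the word index at which the mention's token sequence first matches the sentence:

def search(em_text, sentence_text):
    # Return the index of the first word of em_text inside sentence_text.
    n, m = len(sentence_text), len(em_text)
    for i in range(n - m + 1):
        if sentence_text[i:i + m] == em_text:
            return i
    return 0  # assumption: fall back to 0 when the mention is not found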
def prepare_data_set(fin, charset, vocab, relation_labels, entity_labels, tag_set, dataset, fout):
    '''
    res = prepare_data_set(fin, charset, vocab, relation_labels, entity_labels, tag_set, train, fout)
    fin: data/NYT_CoType/train.json
    '''
    num_overlap = 0
    for line in fin:
        overlap = False
        line = line.strip()  # strip leading/trailing whitespace (spaces/newlines by default)
        if not line:
            continue
        sentence = json.loads(line)
        for entity_mention in sentence["entityMentions"]:
            entity_labels.add(entity_mention["label"])
        for relation_mention in sentence["relationMentions"]:
            relation_labels.add(relation_mention["label"])
            make_tag_set(tag_set, relation_mention["label"])
        sentence_text = sentence["sentText"].strip().strip('"')
        # Normalize to ASCII, then split on whitespace (split() ignores leading/trailing whitespace)
        sentence_text = normalize('NFKD', sentence_text).encode('ascii', 'ignore').decode().split()
        length_sent = len(sentence_text)
        if length_sent > MAX_SENT_LENGTH:
            continue
        lower_sentence_text = [token.lower() for token in sentence_text]
        sentence_idx = prepare_sequence(lower_sentence_text, vocab)  # list of vocab indices
        tokens_idx = []  # character indices for each token
        for token in sentence_text:
            if len(token) <= MAX_TOKEN_LENGTH:
                # pad short tokens with <pad> up to MAX_TOKEN_LENGTH
                tokens_idx.append(prepare_sequence(token, charset) + [charset["<pad>"]] * (MAX_TOKEN_LENGTH - len(token)))
            else:
                # for overly long tokens, keep the first 13 and last 7 characters
                tokens_idx.append(prepare_sequence(token[0:13] + token[-7:], charset))
        tags_idx = [tag_set["O"]] * length_sent  # tag ids, initialised to "O"
        for relation_mention in sentence["relationMentions"]:
            if relation_mention["label"] == "None":
                continue
            em1_text = normalize('NFKD', relation_mention["em1Text"]).encode('ascii', 'ignore').decode().split()
            res1 = update_tag_seq(em1_text, sentence_text, relation_mention["label"], 1, tag_set, tags_idx)
            em2_text = normalize('NFKD', relation_mention["em2Text"]).encode('ascii', 'ignore').decode().split()
            res2 = update_tag_seq(em2_text, sentence_text, relation_mention["label"], 2, tag_set, tags_idx)
            if res1 or res2:
                num_overlap += 1
                overlap = True
        dataset.append((sentence_idx, tokens_idx, tags_idx))
        # if overlap:
        #     fout.write(line + "\n")
        new_sent = dict()
        new_sent['tokens'] = lower_sentence_text
        new_sent['tags'] = tags_idx
        fout.write(json.dumps(new_sent) + '\n')
    return num_overlap
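The excerpt above also relies on prepare_sequence and on index-like containers (vocab, charset, tag_set) that support both .add() and dict-style lookup; a minimal sketch of what they might look like (assumptions, not the project's actual helpers):

class Index(dict):
    # Maps keys to consecutive integer ids; supports .add(key) and item lookup.
    def add(self, key):
        if key not in self:
            self[key] = len(self)
        return self[key]

def prepare_sequence(tokens, index):
    # Convert a list of words (or the characters of a token) into integer ids.
    return [index.add(token) for token in tokens]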
End2End Model
CNN Encoder
The first convolution layer can be expressed by the formula sketched below; the kernel width is 3, and the output is then passed through two further convolution layers.
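The formula from the original write-up is not preserved in this copy; as a sketch, with x_i denoting the concatenated word/character feature vector of the i-th token, a convolution layer of kernel width 3 can be written as

h_i^{(1)} = \mathrm{ReLU}\left( W^{(1)} [x_{i-1}; x_i; x_{i+1}] + b^{(1)} \right)

where [ ; ] denotes concatenation of the three neighbouring input vectors; the two following layers apply the same operation to h^{(1)} and h^{(2)} respectively.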
LSTM Decoder
import torch
import torch.nn as nn
import torch.nn.functional as F
from conv_net import ConvNet
import numpy as np
import torch.autograd as autograd
from torch.autograd import Variable
class CharEncoder(nn.Module):
    """
    Input:  (batch_size, seq_len)
    Output: (batch_size, conv_size)
    """
    def __init__(self, char_num, embedding_size, channels, kernel_size, padding_idx, dropout, emb_dropout):
        super(CharEncoder, self).__init__()
        # The construction of the embedding and the ConvNet was lost in this excerpt;
        # the two lines below are reconstructed and the ConvNet signature is an assumption.
        self.embed = nn.Embedding(char_num, embedding_size, padding_idx=padding_idx)
        self.conv_net = ConvNet(channels, kernel_size=kernel_size, dropout=dropout)
        self.drop = nn.Dropout(emb_dropout)
        self.init_weights()

    def forward(self, inputs):
        seq_len = inputs.size(1)
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_size) -> (batch_size, embedding_size, seq_len)
        embeddings = self.drop(self.embed(inputs)).transpose(1, 2).contiguous()
        # (batch_size, embedding_size, seq_len) -> (batch_size, conv_size, seq_len)
        # -> (batch_size, conv_size, 1) -> (batch_size, conv_size)
        return F.max_pool1d(self.conv_net(embeddings), seq_len).squeeze()

    def init_weights(self):
        nn.init.kaiming_uniform_(self.embed.weight.data, mode='fan_in', nonlinearity='relu')
class WordEncoder(nn.Module):
    """
    Input:  (batch_size, seq_len), (batch_size, seq_len, char_features)
    Output: (batch_size, seq_len, conv_size + embedding_size + char_features)
    """
    def __init__(self, weight, channels, kernel_size, dropout, emb_dropout):
        super(WordEncoder, self).__init__()
        # The construction of the embedding and the ConvNet was lost in this excerpt; the lines below
        # are reconstructed: the embedding is assumed to be initialised from the pre-trained word2vec
        # weight matrix, and the ConvNet signature is an assumption.
        self.embed = nn.Embedding.from_pretrained(weight, freeze=False)
        self.conv_net = ConvNet(channels, kernel_size=kernel_size, dropout=dropout)
        self.drop = nn.Dropout(emb_dropout)

    def forward(self, word_input, char_input):
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_size)
        # -> (batch_size, seq_len, embedding_size + char_features)
        # -> (batch_size, embedding_size + char_features, seq_len)
        embeddings = torch.cat((self.embed(word_input), char_input), 2).transpose(1, 2).contiguous()
        # print("embeddings:", embeddings.size())
        # (batch_size, embedding_size + char_features, seq_len) -> (batch_size, conv_size, seq_len)
        conv_out = self.conv_net(self.drop(embeddings))
        # (batch_size, conv_size, seq_len) -> (batch_size, conv_size + embedding_size + char_features, seq_len)
        # -> (batch_size, seq_len, conv_size + embedding_size + char_features)
        return torch.cat((embeddings, conv_out), 1).transpose(1, 2).contiguous()
# Decoder input size: char_conv_size + word_embedding_size + word_conv_size; output size: num_tag
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_dim, output_size, NUM_LAYERS):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_dim = hidden_dim
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers=NUM_LAYERS, bidirectional=True)  # update on 5.21
        self.hidden2label = nn.Linear(2 * self.hidden_dim, output_size)  # update on 5.21
        self.init_weight()

    def forward(self, inputs):
        self.lstm.flatten_parameters()
        lstm_out, self.hidden = self.lstm(inputs, None)
        y = self.hidden2label(lstm_out)
        return y

    def init_weight(self):
        nn.init.kaiming_uniform_(self.hidden2label.weight.data, mode='fan_in', nonlinearity='relu')

    def init_hidden(self, batch_size):
        # Not used in forward(), which passes None so the LSTM starts from zero states.
        return (autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)),
                autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)))
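A hedged sketch of how the decoder can be exercised on its own with dummy encoder output (all sizes below are illustrative assumptions, not the settings of the original project; without batch_first, nn.LSTM expects input of shape (seq_len, batch, features)):

# Illustrative sizes only
batch_size, seq_len, feat_dim, hidden_dim, num_tags = 2, 10, 350, 200, 97

decoder = Decoder(feat_dim, hidden_dim, num_tags, NUM_LAYERS=1)
encoder_out = torch.randn(seq_len, batch_size, feat_dim)  # stands in for the WordEncoder output
scores = decoder(encoder_out)                             # -> (seq_len, batch_size, num_tags)
print(scores.shape)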