Joint Learning of Named Entity Recognition and Relation Extraction Based on CNN-LSTM
Word2vec
Steps for training Word2vec with Gensim
1 Preprocess the corpus: one document or sentence per line, with tokens separated by spaces. English text usually needs no extra segmentation, since words are already separated by spaces; Chinese corpora must be segmented with a word-segmentation tool (common tools include StanfordNLP, ICTCLAS, Ansj, FudanNLP, HanLP, jieba, etc.);
2 Convert the raw training corpus into an iterator of sentences, where each iteration yields a sentence as a list of words (UTF-8). This can be done with the LineSentence() class in Gensim's word2vec.py;
3 Feed the result of the steps above into Gensim's built-in Word2Vec object for training:
import os
import json
from gensim.models.word2vec import LineSentence, Word2Vec

# Convert the JSON corpus into raw text, one sentence per line
def func(fin, fout):
    for line in fin:
        line = line.strip()
        if not line:
            continue
        sentence = json.loads(line)
        sentence = sentence["sentText"].strip().strip('"').lower()
        fout.write(sentence + '\n')

def make_corpus():
    with open('data/NYT_', 'wt', encoding='utf-8') as fout:
        with open('data/NYT_CoType/train.json', 'rt', encoding='utf-8') as fin:
            func(fin, fout)
        with open('data/NYT_CoType/test.json', 'rt', encoding='utf-8') as fin:
            func(fin, fout)

if __name__ == "__main__":
    if not os.path.exists('data/NYT_'):
        make_corpus()
    sentences = LineSentence('data/NYT_')
    '''
    (1) size: dimensionality of the word vectors, default 100; usually chosen according to the size of the corpus.
    (2) workers: number of worker threads used to parallelise training.
    (3) sg: training algorithm, 0 for CBOW, 1 for skip-gram.
    (4) iter: maximum number of epochs for stochastic gradient descent, default 5; can be increased for large corpora.
    (5) negative: number of negative samples when using Negative Sampling, default 5; values in [3, 10] are recommended.
    '''
    model = Word2Vec(sentences, sg=1, size=300, workers=4, iter=8, negative=8)
    word_vectors = model.wv
    word_vectors.save('data/NYT_CoType/word2vec')
    word_vectors.save_word2vec_format('data/NYT_', fvocab='data/NYT_')
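To sanity-check the trained embeddings later, they can be reloaded with Gensim's KeyedVectors; a minimal sketch (the query word is only an example):

from gensim.models import KeyedVectors

word_vectors = KeyedVectors.load('data/NYT_CoType/word2vec')
print(word_vectors.most_similar('president', topn=5))  # example query word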
Tagging scheme
Following the annotation described in the paper from the Chinese Academy of Sciences, an example is given below:
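The original illustration is not reproduced in this copy; the following is a hedged example consistent with the tag format built by make_tag_set below (position-relation-role, where the position is one of B/I/E/S, the role 1/2 marks the first/second entity of the relation, and all other words are tagged O), assuming the NYT relation label /business/person/company:

Steve   -> B-/business/person/company-1
Jobs    -> E-/business/person/company-1
founded -> O
Apple   -> S-/business/person/company-2
in      -> O
1976    -> O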
The preprocessing mainly involves a few points:
1 Convert everything to lowercase.
2 Pad sequences that are shorter than the maximum length.
3 Normalize tokens carrying diacritics (e.g. some French words) by converting them to a decomposed Unicode form and then back to ASCII (see the sketch after this list).
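A minimal sketch of point 3, using Python's unicodedata module (the same normalize('NFKD', ...) call appears in the preprocessing code below):

from unicodedata import normalize

# Decompose accented characters into base character + combining mark,
# then drop everything non-ASCII: 'café' -> 'cafe', 'Señor' -> 'Senor'
ascii_text = normalize('NFKD', 'café').encode('ascii', 'ignore').decode()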
import json
from unicodedata import normalize

# MAX_SENT_LENGTH, MAX_TOKEN_LENGTH, prepare_sequence and search are defined
# elsewhere in the preprocessing script (see the sketches further below).

def make_tag_set(tag_set, relation_label):
    '''
    make_tag_set(tag_set, relation_mention["label"])
    '''
    if relation_label == "None":
        return
    for pos in "BIES":
        for role in "12":
            tag_set.add("-".join([pos, relation_label, role]))  # pos-relation_label-role
def update_tag_seq(em_text, sentence_text, relation_label, role, tag_set, tags_idx):
    '''
    res1 = update_tag_seq(em1_text, sentence_text, relation_mention["label"], 1, tag_set, tags_idx)
    B-begin, I-inside, O-outside, E-end, S-single
    Everything below operates at the word level.
    '''
    overlap = False
    start = search(em_text, sentence_text)  # word index of the mention's first word
    tag = "-".join(["S", relation_label, str(role)])
    if len(em_text) == 1:
        if tags_idx[start] != tag_set["O"]:
            overlap = True
        tags_idx[start] = tag_set[tag]
    else:
        tag = "B" + tag[1:]
        if tags_idx[start] != tag_set["O"]:
            overlap = True
        tags_idx[start] = tag_set[tag]
        tag = "E" + tag[1:]
        end = start + len(em_text) - 1
        if tags_idx[end] != tag_set["O"]:
            overlap = True
        tags_idx[end] = tag_set[tag]
        tag = "I" + tag[1:]
        for index in range(start + 1, end):
            if tags_idx[index] != tag_set["O"]:
                overlap = True
            tags_idx[index] = tag_set[tag]
    return overlap
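The helper search(em_text, sentence_text) used above is not included in this excerpt; a minimal sketch, assuming it returns the word index at which the mention's token sequence first matches the sentence:

def search(em_text, sentence_text):
    # Return the index of the first word of em_text inside sentence_text.
    n, m = len(sentence_text), len(em_text)
    for i in range(n - m + 1):
        if sentence_text[i:i + m] == em_text:
            return i
    return 0  # assumption: fall back to 0 when the mention is not found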
def prepare_data_set(fin, charset, vocab, relation_labels, entity_labels, tag_set, dataset, fout):
    '''
    res = prepare_data_set(fin, charset, vocab, relation_labels, entity_labels, tag_set, train, fout)
    fin: data/NYT_CoType/train.json
    '''
    num_overlap = 0
    for line in fin:
        overlap = False
        line = line.strip()  # strip leading/trailing whitespace (spaces/newlines by default)
        if not line:
            continue
        sentence = json.loads(line)
        for entity_mention in sentence["entityMentions"]:
            entity_labels.add(entity_mention["label"])
        for relation_mention in sentence["relationMentions"]:
            relation_labels.add(relation_mention["label"])
            make_tag_set(tag_set, relation_mention["label"])
        sentence_text = sentence["sentText"].strip().strip('"')
        # Normalize to ASCII, then split on whitespace (split() ignores leading/trailing whitespace)
        sentence_text = normalize('NFKD', sentence_text).encode('ascii', 'ignore').decode().split()
        length_sent = len(sentence_text)
        if length_sent > MAX_SENT_LENGTH:
            continue
        lower_sentence_text = [token.lower() for token in sentence_text]
        sentence_idx = prepare_sequence(lower_sentence_text, vocab)  # list of vocab indices
        tokens_idx = []  # character indices for each token
        for token in sentence_text:
            if len(token) <= MAX_TOKEN_LENGTH:
                # pad short tokens with <pad> up to MAX_TOKEN_LENGTH
                tokens_idx.append(prepare_sequence(token, charset) + [charset["<pad>"]] * (MAX_TOKEN_LENGTH - len(token)))
            else:
                # for overly long tokens, keep the first 13 and last 7 characters
                tokens_idx.append(prepare_sequence(token[0:13] + token[-7:], charset))
        tags_idx = [tag_set["O"]] * length_sent  # tag ids, initialised to "O"
        for relation_mention in sentence["relationMentions"]:
            if relation_mention["label"] == "None":
                continue
            em1_text = normalize('NFKD', relation_mention["em1Text"]).encode('ascii', 'ignore').decode().split()
            res1 = update_tag_seq(em1_text, sentence_text, relation_mention["label"], 1, tag_set, tags_idx)
            em2_text = normalize('NFKD', relation_mention["em2Text"]).encode('ascii', 'ignore').decode().split()
            res2 = update_tag_seq(em2_text, sentence_text, relation_mention["label"], 2, tag_set, tags_idx)
            if res1 or res2:
                num_overlap += 1
                overlap = True
        dataset.append((sentence_idx, tokens_idx, tags_idx))
        # if overlap:
        #     fout.write(line + "\n")
        new_sent = dict()
        new_sent['tokens'] = lower_sentence_text
        new_sent['tags'] = tags_idx
        fout.write(json.dumps(new_sent) + '\n')
    return num_overlap
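The excerpt above also relies on prepare_sequence and on index-like containers (vocab, charset, tag_set) that support both .add() and dict-style lookup; a minimal sketch of what they might look like (assumptions, not the project's actual helpers):

class Index(dict):
    # Maps keys to consecutive integer ids; supports .add(key) and item lookup.
    def add(self, key):
        if key not in self:
            self[key] = len(self)
        return self[key]

def prepare_sequence(tokens, index):
    # Convert a list of words (or the characters of a token) into integer ids.
    return [index.add(token) for token in tokens]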
End2End Model
CNN Encoder
The first convolution layer can be expressed by the formula sketched below; the kernel width is 3, and the output is then passed through two further convolution layers.
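The formula from the original write-up is not preserved in this copy; as a sketch, with x_i denoting the concatenated word/character feature vector of the i-th token, a convolution layer of kernel width 3 can be written as

h_i^{(1)} = \mathrm{ReLU}\left( W^{(1)} [x_{i-1}; x_i; x_{i+1}] + b^{(1)} \right)

where [ ; ] denotes concatenation of the three neighbouring input vectors; the two following layers apply the same operation to h^{(1)} and h^{(2)} respectively.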
LSTM Decoder
import torch
import torch.nn as nn
import torch.nn.functional as F
from conv_net import ConvNet
import numpy as np
import torch.autograd as autograd
from torch.autograd import Variable
class CharEncoder(nn.Module):
    """
    Input:  (batch_size, seq_len)
    Output: (batch_size, conv_size)
    """
    def __init__(self, char_num, embedding_size, channels, kernel_size, padding_idx, dropout, emb_dropout):
        super(CharEncoder, self).__init__()
        # The construction of the embedding and the ConvNet was lost in this excerpt;
        # the two lines below are reconstructed and the ConvNet signature is an assumption.
        self.embed = nn.Embedding(char_num, embedding_size, padding_idx=padding_idx)
        self.conv_net = ConvNet(channels, kernel_size=kernel_size, dropout=dropout)
        self.drop = nn.Dropout(emb_dropout)
        self.init_weights()

    def forward(self, inputs):
        seq_len = inputs.size(1)
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_size) -> (batch_size, embedding_size, seq_len)
        embeddings = self.drop(self.embed(inputs)).transpose(1, 2).contiguous()
        # (batch_size, embedding_size, seq_len) -> (batch_size, conv_size, seq_len)
        # -> (batch_size, conv_size, 1) -> (batch_size, conv_size)
        return F.max_pool1d(self.conv_net(embeddings), seq_len).squeeze()

    def init_weights(self):
        nn.init.kaiming_uniform_(self.embed.weight.data, mode='fan_in', nonlinearity='relu')
class WordEncoder(nn.Module):
    """
    Input:  (batch_size, seq_len), (batch_size, seq_len, char_features)
    Output: (batch_size, seq_len, conv_size + embedding_size + char_features)
    """
    def __init__(self, weight, channels, kernel_size, dropout, emb_dropout):
        super(WordEncoder, self).__init__()
        # The construction of the embedding and the ConvNet was lost in this excerpt; the lines below
        # are reconstructed: the embedding is assumed to be initialised from the pre-trained word2vec
        # weight matrix, and the ConvNet signature is an assumption.
        self.embed = nn.Embedding.from_pretrained(weight, freeze=False)
        self.conv_net = ConvNet(channels, kernel_size=kernel_size, dropout=dropout)
        self.drop = nn.Dropout(emb_dropout)

    def forward(self, word_input, char_input):
        # (batch_size, seq_len) -> (batch_size, seq_len, embedding_size)
        # -> (batch_size, seq_len, embedding_size + char_features)
        # -> (batch_size, embedding_size + char_features, seq_len)
        embeddings = torch.cat((self.embed(word_input), char_input), 2).transpose(1, 2).contiguous()
        # print("embeddings:", embeddings.size())
        # (batch_size, embedding_size + char_features, seq_len) -> (batch_size, conv_size, seq_len)
        conv_out = self.conv_net(self.drop(embeddings))
        # (batch_size, conv_size, seq_len) -> (batch_size, conv_size + embedding_size + char_features, seq_len)
        # -> (batch_size, seq_len, conv_size + embedding_size + char_features)
        return torch.cat((embeddings, conv_out), 1).transpose(1, 2).contiguous()
# Decoder input size: char_conv_size + word_embedding_size + word_conv_size; output size: num_tag
class Decoder(nn.Module):
    def __init__(self, input_size, hidden_dim, output_size, NUM_LAYERS):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_dim = hidden_dim
        self.output_size = output_size
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers=NUM_LAYERS, bidirectional=True)  # update on 5.21
        self.hidden2label = nn.Linear(2 * self.hidden_dim, output_size)  # update on 5.21
        self.init_weight()

    def forward(self, inputs):
        self.lstm.flatten_parameters()
        lstm_out, self.hidden = self.lstm(inputs, None)
        y = self.hidden2label(lstm_out)
        return y

    def init_weight(self):
        nn.init.kaiming_uniform_(self.hidden2label.weight.data, mode='fan_in', nonlinearity='relu')

    def init_hidden(self, batch_size):
        # Not used in forward(), which passes None so the LSTM starts from zero states.
        return (autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)),
                autograd.Variable(torch.randn(1, batch_size, self.hidden_dim)))
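A hedged sketch of how the decoder can be exercised on its own with dummy encoder output (all sizes below are illustrative assumptions, not the settings of the original project; without batch_first, nn.LSTM expects input of shape (seq_len, batch, features)):

# Illustrative sizes only
batch_size, seq_len, feat_dim, hidden_dim, num_tags = 2, 10, 350, 200, 97

decoder = Decoder(feat_dim, hidden_dim, num_tags, NUM_LAYERS=1)
encoder_out = torch.randn(seq_len, batch_size, feat_dim)  # stands in for the WordEncoder output
scores = decoder(encoder_out)                             # -> (seq_len, batch_size, num_tags)
print(scores.shape)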