Constructing training data for BERT+LSTM+CRF entity recognition
1. In entity recognition, BERT+LSTM+CRF is a commonly used approach. BERT can serve either as a fixed embedding layer or be fine-tuned together with the other models. As is well known, the input to BERT must follow a specific format: each sentence needs "[CLS]" prepended and "[SEP]" appended, and an attention mask is required. Below, pad_sequences is used to truncate and pad the sentences so that every input has the same length. After the training set is constructed, a Chinese pre-trained model is downloaded, the model, vocab, and parameter configuration are loaded, and finally ALBERT is used to extract sentence embeddings. This embedding can then be combined with other models as a downstream component to train for a specific task.
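Before the full script, the target layout is easiest to see on a single toy sentence: prepend the [CLS] id (101), append the [SEP] id (102), right-pad with 0 up to the fixed length, and build an attention mask that is 1 over real tokens and 0 over padding. The sketch below only illustrates that layout; the token ids are copied from the printed results for '我爱祖国' further down, not produced by a real tokenizer here.

# Minimal sketch of the input layout BERT expects (illustrative only; no real vocab is loaded).
CLS, SEP, PAD = 101, 102, 0
MAX_LEN = 10

token_ids = [2769, 4263, 4862, 1744]            # ids of the characters of '我爱祖国' (taken from the output below)
ids = [CLS] + token_ids[:MAX_LEN - 2] + [SEP]   # truncate so that [CLS]/[SEP] still fit
ids = ids + [PAD] * (MAX_LEN - len(ids))        # right-pad to the fixed length
mask = [int(i != PAD) for i in ids]             # 1 over real tokens, 0 over padding

print(ids)   # [101, 2769, 4263, 4862, 1744, 102, 0, 0, 0, 0]
print(mask)  # [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]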
import torch
from configs.base import config
from model.modeling_albert import BertConfig, BertModel
from model.tokenization_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
MAX_LEN = 10

if __name__ == '__main__':
    bert_config = BertConfig.from_pretrained(str(config['albert_config_path']), share_type='all')
    base_path = os.getcwd()
    VOCAB = base_path + '/'  # your path for model and vocab
    tokenizer = BertTokenizer.from_pretrained(VOCAB)

    # encode text
    tag2idx = {'[SOS]': 101, '[EOS]': 102, '[PAD]': 0, 'B_LOC': 1, 'I_LOC': 2, 'O': 3}
    sentences = ['我是中华人民共和国国民', '我爱祖国']
    tags = ['O O B_LOC I_LOC I_LOC I_LOC I_LOC I_LOC O O', 'O O O O']

    tokenized_text = [tokenizer.tokenize(sent) for sent in sentences]
    # use pad_sequences to truncate and pad every sequence to the same length;
    # it only accepts 2-d input (more than one sample), so sentences cannot be processed
    # one by one -- loading the whole corpus into memory at once may become an issue
    input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_text],
                              maxlen=MAX_LEN - 2,   # reserve room for [CLS] and [SEP]
                              truncating='post',
                              padding='post',
                              value=0)

    tag_ids = pad_sequences([[tag2idx.get(tok) for tok in tag.split()] for tag in tags],
                            maxlen=MAX_LEN - 2,
                            padding='post',
                            truncating='post',
                            value=0)

    # BERT sentences need [CLS]:101 at the start and [SEP]:102 at the end
    input_ids_cls_sep = []
    for input_id in input_ids:
        linelist = []
        linelist.append(101)
        flag = True
        for tag in input_id:
            if tag > 0:
                linelist.append(tag)
            elif tag == 0 and flag:
                linelist.append(102)
                linelist.append(tag)
                flag = False
            else:
                linelist.append(tag)
        if tag > 0:  # no padding was hit, so [SEP] has not been inserted yet
            linelist.append(102)
        input_ids_cls_sep.append(linelist)

    tag_ids_cls_sep = []
    for tag_id in tag_ids:
        linelist = []
        linelist.append(101)
        flag = True
        for tag in tag_id:
            if tag > 0:
                linelist.append(tag)
            elif tag == 0 and flag:
                linelist.append(102)
                linelist.append(tag)
                flag = False
            else:
                linelist.append(tag)
        if tag > 0:
            linelist.append(102)
        tag_ids_cls_sep.append(linelist)

    attention_masks = [[int(tok > 0) for tok in line] for line in input_ids_cls_sep]

    print('---------------------------')
    print('input_ids:{}'.format(input_ids_cls_sep))
    print('tag_ids:{}'.format(tag_ids_cls_sep))
    print('attention_masks:{}'.format(attention_masks))


    # input_ids = torch.tensor([tokenizer.encode('我是中华人民共和国国民', add_special_tokens=True)])  # add_special_tokens=True adds [CLS] and [SEP] automatically
    # print('input_ids:{}, size:{}'.format(input_ids, len(input_ids)))
    # print('attention_masks:{}, size:{}'.format(attention_masks, len(attention_masks)))

    inputs_tensor = torch.tensor(input_ids_cls_sep)
    tags_tensor = torch.tensor(tag_ids_cls_sep)
    masks_tensor = torch.tensor(attention_masks)

    train_data = TensorDataset(inputs_tensor, tags_tensor, masks_tensor)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=2)

    model = BertModel.from_pretrained(config['bert_dir'], config=bert_config)
    model.to(device)
    model.eval()
    with torch.no_grad():
        '''
        note:
        1. If "output_hidden_states": "True" and "output_attentions": "True" are set in the config,
           the output contains the per-layer sequence_output, pooled_output, (hidden_states), (attentions),
           so: all_hidden_states, all_attentions = model(input_ids)[-2:]
        2. If output_hidden_states and output_attentions are not set,
           only the last layer is returned --> (output_hidden_states, output_attentions)
        '''
        for index, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_labels, b_input_mask = batch  # order matches TensorDataset(inputs, tags, masks)
            last_hidden_state = model(input_ids=b_input_ids, attention_mask=b_input_mask)
            print(len(last_hidden_state))
            all_hidden_states, all_attentions = last_hidden_state[-2:]  # hidden_states and attentions of all layers
            print(all_hidden_states[-2].shape)  # shape of the second-to-last layer's hidden_states
            print(all_hidden_states[-2])
2. Printed results
input_ids:[[101, 2769, 3221, 704, 1290, 782, 3696, 1066, 1469, 102], [101, 2769, 4263, 4862, 1744, 102, 0, 0, 0, 0]]
tag_ids:[[101, 3, 3, 1, 2, 2, 2, 2, 2, 102], [101, 3, 3, 3, 3, 102, 0, 0, 0, 0]]
attention_masks:[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 0, 0, 0, 0]]
4
torch.Size([2, 10, 768])
tensor([[[-1.1074, -0.0047, 0.4608, ..., -0.1816, -0.6379, 0.2295],
[-0.1930, -0.4629, 0.4127, ..., -0.5227, -0.2401, -0.1014],
[ 0.2682, -0.6617, 0.2744, ..., -0.6689, -0.4464, 0.1460],
...,
[-0.1723, -0.7065, 0.4111, ..., -0.6570, -0.3490, -0.5541],
[-0.2028, -0.7025, 0.3954, ..., -0.6566, -0.3653, -0.5655],
[-0.2026, -0.6831, 0.3778, ..., -0.6461, -0.3654, -0.5523]],
[[-1.3166, -0.0052, 0.6554, ..., -0.2217, -0.5685, 0.4270],
[-0.2755, -0.3229, 0.4831, ..., -0.5839, -0.1757, -0.1054],
[-1.4941, -0.1436, 0.8720, ..., -0.8316, -0.5213, -0.3893],
...,
[-0.7022, -0.4104, 0.5598, ..., -0.6664, -0.1627, -0.6270],
[-0.7389, -0.2896, 0.6083, ..., -0.7895, -0.2251, -0.4088],
[-0.0351, -0.9981, 0.0660, ..., -0.4606, 0.4439, -0.6745]]])
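If ALBERT is used only as a fixed embedding layer, the tensor extracted in the loop above (shape [2, 10, 768] as printed) is what gets handed to the downstream model. The following is only a minimal sketch of that hand-off to a BiLSTM whose per-token outputs would serve as emission scores for a CRF; the LstmTagger class, its hidden size, and the random placeholder embeddings are illustrative assumptions rather than part of the original script, and the CRF layer itself is omitted.

import torch
import torch.nn as nn

# Sketch of a downstream BiLSTM over the extracted BERT/ALBERT embeddings.
# emb_dim=768 matches the printed shape above; hidden_dim is illustrative,
# and num_tags=6 corresponds to the six entries of tag2idx.
class LstmTagger(nn.Module):
    def __init__(self, emb_dim=768, hidden_dim=256, num_tags=6):
        super().__init__()
        self.lstm = nn.LSTM(emb_dim, hidden_dim // 2, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_dim, num_tags)   # per-token emission scores for a CRF (or a softmax)

    def forward(self, embeddings):
        out, _ = self.lstm(embeddings)   # [batch, seq_len, hidden_dim]
        return self.fc(out)              # [batch, seq_len, num_tags]

tagger = LstmTagger()
embeddings = torch.randn(2, 10, 768)     # placeholder for all_hidden_states[-2] from the loop above
emissions = tagger(embeddings)
print(emissions.shape)                   # torch.Size([2, 10, 6])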