Datawhale Team Learning NLP: BERT Extractive Question Answering Study Notes
Task: extractive question answering
Dataset: SQuAD
Three keys: "context", "question" and "answers"
# show the first example of the training set
datasets["train"][0]
{'id': '5733be284776f41900661182',
 'title': 'University_of_Notre_Dame',
 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [515]}}
answers stores the start position of the answer and the full answer text.
1 Loading the data
As before, the dataset was downloaded in Colab and then loaded locally:
from datasets import load_from_disk
datasets = load_from_disk("E:/jupyter_notebook/0_learn-nlp-with-transformers-main/docs/篇章4-使⽤Transformers解决NLP任务/datasets/squad")
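If the machine can reach the Hugging Face Hub directly, the same data can also be downloaded in one step instead of taking the Colab detour (a minimal alternative sketch, not what these notes actually ran):

from datasets import load_dataset

datasets = load_dataset("squad")  # downloads SQuAD v1.1 and caches it locally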
2 Data preprocessing
Define a few constants:
squad_v2 = False
model_checkpoint = "distilbert-base-uncased"
batch_size = 16
from transformers import AutoTokenizer
import transformers
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
Data preprocessing for extractive QA has a few tricky points:
First, the context can be very long; how do we handle text longer than max_length?
Second, after tokenization, the start and end labels have to be re-located.
Third, after splitting the text, the labels may become invalid again.
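Besides the tokenizer, the preprocessing function below also relies on max_length, doc_stride and pad_on_right, which these notes never define. A minimal sketch of what is assumed, using the values from the original tutorial:

max_length = 384  # maximum length of one (question, context) input slice
doc_stride = 128  # overlap between two neighbouring slices of a long context
# The context sits on the side where the tokenizer pads; True for BERT-like tokenizers.
pad_on_right = tokenizer.padding_side == "right"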
def prepare_train_features(examples):
    # We need to truncate and pad the examples while still keeping all the information,
    # so overlong texts are split into slices: each overlong example becomes several
    # inputs, and two neighbouring inputs overlap.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",  # the context is appended after the question, i.e. it is the second text, so truncate it with only_second
        max_length=max_length,
        stride=doc_stride,  # doc_stride controls how much two neighbouring slices overlap
        return_overflowing_tokens=True,
        return_offsets_mapping=True,  # gives each token's character span in the original context
        padding="max_length",
    )

    # Note that pop() is used here!
    # overflow_to_sample_mapping maps each slice ID back to its original example ID.
    # For instance, if 2 examples are split into 4 slices, the mapping is [0, 0, 1, 1]:
    # the first two slices come from the first example.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # offset_mapping also has one entry per slice.
    # It maps tokens back to positions in the original input; since the answers are
    # annotated on the original input, it lets us find the answer's start and end positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Re-label the data.
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # i is the slice index; offsets stores, for every token of this slice, its character span in the original context.
        # Samples without an answer are labelled on the CLS token.
        input_ids = tokenized_examples["input_ids"][i]  # input tokens of this slice
        cls_index = input_ids.index(tokenizer.cls_token_id)  # position of [CLS] (token id 101), which is 0

        # Distinguish question from context.
        sequence_ids = tokenized_examples.sequence_ids(i)  # None, 0, 1 mark special tokens, the first sentence and the second sentence

        # Get the index of the original example.
        sample_index = sample_mapping[i]  # which original example slice i comes from
        answers = examples["answers"][sample_index]  # the answer of that original example

        # If there is no answer, use the CLS position as the answer.
        if len(answers["answer_start"]) == 0:
            # This depends on how the dataset is annotated; here an unanswerable sample stores nothing in answer_start.
            tokenized_examples["start_positions"].append(cls_index)  # no answer: both start and end point to CLS
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Character-level start/end positions of the answer.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # Find the token-level start index.
            token_start_index = 0
            # sequence_ids is the list of 0/1/None values that tells the two sentences apart.
            while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                token_start_index += 1

            # Find the token-level end index.
            token_end_index = len(input_ids) - 1
            # After tokenization every input is padded to the maximum length of 384; the padded tail of
            # input_ids is all 0s and its sequence_ids are None, so a padding position is never taken as a context token.
            while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                token_end_index -= 1

            # Check whether the answer falls outside this slice; if so, also use the CLS index as the label.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                # start_char and end_char are the answer's character positions;
                # offsets[token_start_index][0] is the first original-text character covered by the first context token,
                # offsets[token_end_index][1] is the last original-text character covered by the last context token.
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples
Apply the preprocessing:
tokenized_datasets = datasets.map(prepare_train_features, batched=True, remove_columns=datasets["train"].column_names)
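For intuition about the slicing, one can also tokenize a single example by hand and inspect the overflow fields. This is only an illustrative sketch (a short context may produce just one slice, and tokenized_example is my own variable name):

example = datasets["train"][0]
tokenized_example = tokenizer(
    example["question"],
    example["context"],
    truncation="only_second",
    max_length=max_length,
    stride=doc_stride,
    return_overflowing_tokens=True,
    return_offsets_mapping=True,
    padding="max_length",
)
print(len(tokenized_example["input_ids"]))         # number of slices this example was split into
print(tokenized_example["offset_mapping"][0][:5])  # character spans of the first few tokens; (0, 0) for [CLS]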
3 Training and evaluation
Training follows the usual pattern:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)
args = TrainingArguments(
    f"test-squad",
    evaluation_strategy="epoch",
    learning_rate=2e-5,  # learning rate
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=1,  # number of training epochs
    weight_decay=0.01,
)
data_collator = default_data_collator
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)
trainer.train()
trainer.save_model("test-squad-trained")
Evaluation is more involved, because the two output scores may indicate that no answer can be found: for example, the start index can be larger than the end index, or the start and end can point into the question. So we combine the scores (the score of a pair is just its start logit plus its end logit, and we look for the largest sum), check whether each candidate answer is valid, and finally sort the candidates and keep the highest-scoring one.
In short: add the start and end scores, check validity, then sort.
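The code below uses output and n_best_size, which these notes do not define. Presumably they come from running the fine-tuned model on one validation batch, roughly as in the sketch below (hedged; n_best_size = 20 is the value used in the original tutorial):

import torch

n_best_size = 20  # keep the top 20 start and end positions

# Take one batch from the evaluation dataloader and run a forward pass without gradients.
for batch in trainer.get_eval_dataloader():
    break
batch = {k: v.to(trainer.args.device) for k, v in batch.items()}
with torch.no_grad():
    output = trainer.model(**batch)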
import numpy as np

start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
# Gather the positions of the best start and end logits:
# note the [-1 : -n_best_size - 1 : -1] slicing trick!
# argsort() returns the indices that would sort the array in ascending order,
# so this slice picks the n_best_size largest entries in descending order.
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        if start_index <= end_index:  # the pair is only plausible if start comes no later than end
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": "",  # the answer text will be recovered later from the token indices
                }
            )
The above covers adding the start/end scores and the validity check. Next we sort valid_answers by score and keep the best candidate. One step remains: checking that the text spanned by the start and end positions lies inside the context rather than the question.
To do this we need to add two pieces of information to the validation features (slices):
1 The ID of the example that produced the slice. Since one example can produce several slices, each slice needs to know which example it came from.
2 The offset mapping: the mapping from each slice's tokens back to character-level positions in the original text.
So preparing the validation data differs somewhat from preparing the training data.
def prepare_validation_features(examples):
    # Tokenize our examples with truncation and maybe padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        examples["question" if pad_on_right else "context"],
        examples["context" if pad_on_right else "question"],
        truncation="only_second" if pad_on_right else "only_first",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # We keep the example_id that gave us this feature and we will store the offset mappings.
    tokenized_examples["example_id"] = []

    for i in range(len(tokenized_examples["input_ids"])):
        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)  # None, 0, 1 mark special tokens, the question and the context
        context_index = 1 if pad_on_right else 0

        # Step 1: record the example id.
        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]  # which original example slice i comes from
        tokenized_examples["example_id"].append(examples["id"][sample_index])  # store that information in tokenized_examples

        # Step 2: adjust the offset_mapping.
        # Set to None the offset_mapping entries that are not part of the context, so it is easy to determine
        # whether a token position is part of the context or not.
        tokenized_examples["offset_mapping"][i] = [
            (o if sequence_ids[k] == context_index else None)  # keep the span for context tokens, None otherwise
            for k, o in enumerate(tokenized_examples["offset_mapping"][i])
        ]

    return tokenized_examples
Process the validation data:
validation_features = datasets["validation"].map(
    prepare_validation_features,
    batched=True,
    remove_columns=datasets["validation"].column_names
)
Get predictions for all validation features:
raw_predictions = trainer.predict(validation_features)
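raw_predictions.predictions should be a tuple of two NumPy arrays, the start and end logits, with one row per validation feature (a quick check, assuming the setup above):

all_start_logits, all_end_logits = raw_predictions.predictions
print(all_start_logits.shape)  # (number of validation features, max_length)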
The Trainer hides the columns that the model did not use during training (here example_id and offset_mapping, which we need for post-processing), so we have to set them back:
validation_features.set_format(type=validation_features.format["type"], columns=list(validation_features.features.keys()))
When a token position falls in the question part, prepare_validation_features sets its offset mapping to None, so the offset mapping makes it easy to tell whether a token is inside the context. We also use it to discard answers that are far too long.
Putting the steps together (still for a single example), the order is the reverse of before: first preprocess the data, which gives each slice its example id and the corrected offset_mapping values, then use that preprocessed data to check and score the candidates, covering all the corner cases.
max_answer_length = 30

start_logits = output.start_logits[0].cpu().numpy()
end_logits = output.end_logits[0].cpu().numpy()
offset_mapping = validation_features[0]["offset_mapping"]
# The first feature comes from the first example. For the more general case, we will need to match the example_id to
# an example index.
context = datasets["validation"][0]["context"]

# Gather the indices of the best start/end logits:
start_indexes = np.argsort(start_logits)[-1 : -n_best_size - 1 : -1].tolist()
end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist()
valid_answers = []
for start_index in start_indexes:
    for end_index in end_indexes:
        # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond
        # to part of the input_ids that are not in the context.
        if (
            start_index >= len(offset_mapping)
            or end_index >= len(offset_mapping)
            or offset_mapping[start_index] is None
            or offset_mapping[end_index] is None
        ):
            continue
        # Don't consider answers with a length that is either < 0 or > max_answer_length.
        if end_index < start_index or end_index - start_index + 1 > max_answer_length:
            continue
        if start_index <= end_index:  # We need to refine that test to check the answer is inside the context
            start_char = offset_mapping[start_index][0]
            end_char = offset_mapping[end_index][1]
            valid_answers.append(
                {
                    "score": start_logits[start_index] + end_logits[end_index],
                    "text": context[start_char:end_char],
                }
            )

valid_answers = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[:n_best_size]
valid_answers
This yields the candidate answers ranked by score.
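As a quick sanity check, the top-scoring candidate can be compared with the gold answer of this validation example (using the variables defined above):

print(valid_answers[0])
print(datasets["validation"][0]["answers"])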