Computing Sentence Perplexity (PPL) with BERT and GPT-2
Definition
BERT
For a given sentence, mask one token at a time in order, compute the NLL loss of the model's prediction for the masked token, sum the losses over all tokens and take the average, then exponentiate (base e); the result is the sentence's PPL.
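Written as a formula, this masked-LM "pseudo-perplexity" is (with w_i the i-th token and n the number of tokens):

PPL(S) = \exp\Big(-\frac{1}{n}\sum_{i=1}^{n}\log P(w_i \mid w_1,\dots,w_{i-1},w_{i+1},\dots,w_n)\Big)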
A straightforward test implementation:
import numpy as np
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForMaskedLM

# Load pre-trained model (weights)
with torch.no_grad():
    model = BertForMaskedLM.from_pretrained('hfl/chinese-bert-wwm-ext')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = BertTokenizer.from_pretrained('hfl/chinese-bert-wwm-ext')
    sentence = "我不会忘记和你一起奋斗的时光。"
    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    sen_len = len(tokenize_input)
    sentence_loss = 0.
    for i, word in enumerate(tokenize_input):
        # mask the i-th character of the sentence
        tokenize_input[i] = '[MASK]'
        mask_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
        output = model(mask_input)
        prediction_scores = output[0]
        # log-probability the model assigns to the original token at position i
        softmax = nn.Softmax(dim=0)
        ps = softmax(prediction_scores[0, i]).log()
        word_loss = ps[tensor_input[0, i]]
        sentence_loss += word_loss.item()
        # restore the original token before masking the next position
        tokenize_input[i] = word
    ppl = np.exp(-sentence_loss / sen_len)
    print(ppl)
A vectorized (tensor-style) version, which scores all masked copies of the sentence in one batched forward pass instead of looping one token at a time:
def score(model, tokenizer, sentence, mask_token_id=103):
    # token ids: [CLS] tokens ... [SEP]
    tensor_input = tokenizer.encode(sentence, return_tensors='pt')
    # one copy of the sentence per maskable position (all tokens except [CLS]/[SEP])
    repeat_input = tensor_input.repeat(tensor_input.size(-1) - 2, 1)
    # row i has a 1 at column i+1, so each row masks exactly one token
    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, mask_token_id)
    # compute the loss only at the masked positions (label -100 is ignored)
    labels = repeat_input.masked_fill(masked_input != mask_token_id, -100)
    loss, _ = model(masked_input, masked_lm_labels=labels)
    result = np.exp(loss.item())
    return result

s = score(model, tokenizer, '我不会忘记和你一起奋斗的时光。')
print(s)
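The score function above targets the older transformers tuple-return API, where the loss argument is called masked_lm_labels. As a rough sketch of the same idea for transformers 4.x (an assumption about the installed version: the argument is labels, outputs expose .loss, and the mask id can be read from the tokenizer; score_v4 is just an illustrative name):

def score_v4(model, tokenizer, sentence):
    # sketch for transformers >= 4.x; same masking scheme as score() above
    tensor_input = tokenizer.encode(sentence, return_tensors='pt')
    repeat_input = tensor_input.repeat(tensor_input.size(-1) - 2, 1)
    mask = torch.ones(tensor_input.size(-1) - 1).diag(1)[:-2]
    masked_input = repeat_input.masked_fill(mask == 1, tokenizer.mask_token_id)
    labels = repeat_input.masked_fill(masked_input != tokenizer.mask_token_id, -100)
    with torch.no_grad():
        outputs = model(masked_input, labels=labels)
    # outputs.loss is the mean NLL over the masked positions
    return np.exp(outputs.loss.item())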
GPT-2
The official GPT-2 does not support Chinese and uses BPE tokenization. For Chinese, NLP practitioners have trained Chinese GPT-2 models whose tokenization follows the BERT tokenizer instead.
For a given sentence of length n, shift it left by one position to form the labels and drop the last token to form the input, compute the cross-entropy loss between GPT-2's output and the labels, then exponentiate (base e); the result is the sentence's PPL.
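In formula form, this is the standard autoregressive perplexity, averaged over the n-1 predicted positions (matching the shifted labels in the code below):

PPL(S) = \exp\Big(-\frac{1}{n-1}\sum_{i=2}^{n}\log P(w_i \mid w_1,\dots,w_{i-1})\Big)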
import torch
from transformers import BertTokenizer, GPT2LMHeadModel
from torch.nn import CrossEntropyLoss


def cal_ppl_bygpt2():
    sens = ["今天是个好日子。", "天今子日。个是好", "这个婴儿有900000克呢。", "我不会忘记和你一起奋斗的时光。",
            "我不会记忘和你一起奋斗的时光。", "会我记忘和你斗起一奋的时光。"]
    tokenizer = BertTokenizer.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
    model = GPT2LMHeadModel.from_pretrained("uer/gpt2-chinese-cluecorpussmall")
    inputs = tokenizer(sens, padding='max_length', max_length=50, truncation=True, return_tensors="pt")
    bs, sl = inputs['input_ids'].size()
    outputs = model(**inputs, labels=inputs['input_ids'])
    logits = outputs[1]
    # Shift so that tokens < n predict n
    shift_logits = logits[:, :-1, :].contiguous()
    shift_labels = inputs['input_ids'][:, 1:].contiguous()
    shift_attentions = inputs['attention_mask'][:, 1:].contiguous()
    # Flatten the tokens; ignore_index=0 skips the [PAD] positions
    loss_fct = CrossEntropyLoss(ignore_index=0, reduction="none")
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)).detach().reshape(bs, -1)
    # average the per-token loss over the real (non-padded) length of each sentence
    meanloss = loss.sum(1) / shift_attentions.sum(1)
    ppl = torch.exp(meanloss).numpy().tolist()
    return ppl


if __name__ == '__main__':
    print(cal_ppl_bygpt2())
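For a single sentence there is a shorter route, since GPT2LMHeadModel already returns the mean shifted cross-entropy when labels are passed. A minimal sketch, assuming a transformers version whose outputs expose .loss and reusing the model/tokenizer loaded above (gpt2_sentence_ppl is just an illustrative name):

def gpt2_sentence_ppl(model, tokenizer, sentence):
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, labels=inputs["input_ids"])
    # outputs.loss is the mean cross-entropy over the shifted tokens
    return torch.exp(outputs.loss).item()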