Bert文本分类实践(一):实现一个简单的分类模型
写在前⾯
文本分类是nlp中一个非常重要的任务,也是非常适合入坑nlp的第一个完整项目。虽然文本分类看似简单,但里面的门道好多好多,作者水平有限,只能将平时用到的方法和trick在此做个记录和分享,希望大家看过都能有所收获,享受编程的乐趣。
第⼀部分
模型
Bert模型是Google在2018年10月发布的语言表示模型,一经问世在NLP领域横扫了11项任务的最优结果,可谓风头一时无二。有关Bert中transformer的模型细节,推荐阅读相关资料,在此不做赘述。
图⼀:bert分类模型结构
Bert文本分类模型常见做法为将bert最后一层输出的第一个token位置(CLS位置)当作句子的表示,后接全连接层进行分类。模型很简单,我们直接看代码!
第二部分
pytorch代码实现
# -*- coding:utf-8 -*-
# BERT text classification baseline model
# model: bert
# date: 2021.10.10 10:01
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import transformers
from transformers import AutoModel, AutoTokenizer
# Summed training loss of each epoch; filled by the training loop and plotted
# at the end of the script.
train_curve = []
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyper-parameters. The backbone is the basic Chinese BERT checkpoint.
batch_size = 2
epoches = 100
model = "bert-base-chinese"  # Hugging Face checkpoint id
hidden_size = 768  # input width of the linear classification head
n_class = 2
maxlen = 8  # every sentence is padded / truncated to this many tokens

# Toy training data.
ntences = ["我喜欢打篮球", "这个相机很好看", "今天玩的特别开心", "我不喜欢你", "太糟糕了", "真是件令人伤心的事情"]
labels = [1, 1, 1, 0, 0, 0]  # 1 = positive, 0 = negative
# word_list = ' '.join(ntences).split()
# word_list = list(t(word_list))
# word_dict = {w: i for i, w in enumerate(word_list)}
# num_dict = {i: w for w, i in word_dict.items()}
# vocab_size = len(word_list)
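# Convert the data into BERT's input format:
# input_ids: vocabulary ids of the tokens
# attention_mask: same length as input_ids; 1 at real-token positions, 0 at padding positions
# token_type_ids: 0 for the first sentence, 1 for the second sentence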
class MyDatat(Data.Dataset):
    """Dataset that tokenizes one sentence per item into BERT inputs.

    Args:
        ntences: Sequence of raw sentence strings.
        labels: Sequence of class labels aligned with ``ntences``; only read
            when ``with_labels`` is true.
        with_labels: When true, ``__getitem__`` also returns the label.
    """

    def __init__(self, ntences, labels=None, with_labels=True,):
        # `model` is the module-level checkpoint name.
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.with_labels = with_labels
        self.ntences = ntences
        self.labels = labels

    def __len__(self):
        # Use this instance's own sentences, not the module-level list, so a
        # second dataset (e.g. the test set) reports its own size.
        return len(self.ntences)

    def __getitem__(self, index):
        """Return ``(token_ids, attn_masks, token_type_ids[, label])`` for one sentence."""
        nt = self.ntences[index]

        # Tokenize the sentence into token ids, attention mask and token type
        # ids, padded / truncated to the module-level `maxlen`.
        encoded_pair = self.tokenizer(nt,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=maxlen,
                                      return_tensors='pt')  # Return torch.Tensor objects

        # return_tensors='pt' adds a leading batch dimension of size 1; drop
        # it so the DataLoader can stack the items itself.
        token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)  # 0 for padding, 1 otherwise
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # 0 for 1st sentence, 1 for 2nd

        if self.with_labels:  # True if the dataset has labels
            label = self.labels[index]
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids
# Training batches. num_workers=0 loads data in the main process: this script
# has no `if __name__ == "__main__":` guard, which worker processes require on
# platforms that start them with "spawn".
train = Data.DataLoader(dataset=MyDatat(ntences, labels), batch_size=batch_size, shuffle=True, num_workers=0)
# model
class BertClassify(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classifier."""

    def __init__(self):
        super(BertClassify, self).__init__()
        # `model`, `hidden_size` and `n_class` are module-level settings.
        self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True)
        self.linear = nn.Linear(hidden_size, n_class)  # classification head on the pooled vector
        self.dropout = nn.Dropout(0.5)

    def forward(self, X):
        """Return class logits for a batch.

        Args:
            X: Indexable holding ``input_ids``, ``attention_mask`` and
                ``token_type_ids`` at positions 0, 1 and 2.

        Returns:
            The output of the linear head applied to the pooled BERT output.
        """
        input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]
        # With return_dict=True the encoder returns an output object with
        # named fields.
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        # Classify from the pooled output.
        # outputs.pooler_output: [bs, hidden_size]
        logits = self.linear(self.dropout(outputs.pooler_output))
        return logits
bc = BertClassify().to(device)
optimizer = optim.Adam(bc.parameters(), lr=1e-3, weight_decay=1e-2)
loss_fn = nn.CrossEntropyLoss()

# train
# from_pretrained() hands back the encoder in evaluation mode; switch the whole
# model to training mode before fine-tuning.
bc.train()
sum_loss = 0
total_step = len(train)
for epoch in range(epoches):
    for i, batch in enumerate(train):
        optimizer.zero_grad()
        # Move every tensor of the batch to the compute device.
        batch = tuple(p.to(device) for p in batch)
        pred = bc([batch[0], batch[1], batch[2]])
        loss = loss_fn(pred, batch[3])
        sum_loss += loss.item()

        loss.backward()
        optimizer.step()
        # Log every step, but only in every 10th epoch.
        if epoch % 10 == 0:
            print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch+1, epoches, i+1, total_step, loss.item()))
    # Record the summed loss of this epoch and reset the accumulator.
    train_curve.append(sum_loss)
    sum_loss = 0
# test
bc.eval()
with torch.no_grad():
    test_text = ['我不喜欢打篮球']
    test = MyDatat(test_text, labels=None, with_labels=False)
    x = test[0]
    # Add a batch dimension of size 1 and move each tensor to the device.
    x = tuple(p.unsqueeze(0).to(device) for p in x)
    pred = bc([x[0], x[1], x[2]])
    # Index of the highest-scoring class, kept as shape [1, 1].
    pred = pred.argmax(dim=1, keepdim=True)
    if pred[0][0] == 0:
        print('消极')
    else:
        print('积极')

pd.DataFrame(train_curve).plot()  # loss curve
测试单条样本结果:
代码链接:
jupyter版本:
py版本:
喜欢的话,给萌新的github仓库一颗小星星哦……^ _^