Training a BERT model as part of a deep learning network: a usage guide for the keras_bert library
1 Introduction
BERT can be used in two ways. The first is to feed a token sequence directly into the BERT model and take its feature representations as output: BERT exposes twelve layers of feature vectors, and as the layer index grows the representations shift from capturing word-level meaning toward capturing sentence-level semantics. Used this way, BERT behaves like a static word2vec model and serves purely as a feature extractor. For how to obtain the pretrained BERT model and use it in this first way, see the previous post.
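As a quick, concrete illustration of this first usage mode, the sketch below loads the pretrained checkpoint with keras_bert and reads off the last-layer feature vector of every token. It is only a minimal sketch: the file paths are assumptions that mirror the chinese_L-12_H-768_A-12 setup used later in this post, and the sample sentence is made up.

# Minimal sketch of usage mode one: BERT as a static feature extractor.
# The paths below are assumptions matching the chinese_L-12_H-768_A-12 release used later in this post.
import codecs
import numpy as np
from keras_bert import load_trained_model_from_checkpoint, Tokenizer

config_path = '../bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../bert/chinese_L-12_H-768_A-12/vocab.txt'

token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token_dict[line.strip()] = len(token_dict)

model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)
tokenizer = Tokenizer(token_dict)

indices, segments = tokenizer.encode(first=u'语言模型')                 # token ids and segment ids
features = model.predict([np.array([indices]), np.array([segments])])  # last encoder layer
print(features.shape)  # (1, number_of_tokens, 768): one 768-dimensional vector per token

No loss or labels are involved here, so the pretrained weights stay fixed; the rest of this post is about the second mode, where they do not.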
The second, and more common, way is to make the BERT model part of a deep learning network and continue training it, so that it adapts to the task at hand. Here BERT effectively supplies a well-chosen set of initial weights for part of the network, giving the optimizer a good starting point for minimizing the loss and helping it find a good solution in the feasible region faster and more accurately.
This post focuses on how to implement the second approach with the keras_bert library, and on the problems you may run into along the way.
2 A keras_bert usage example
pip install keras_bert
The official usage example given for keras_bert looks like this:
#! -*- coding:utf-8 -*-
import json
import numpy as np
import pandas as pd
from random import choice
from keras_bert import load_trained_model_from_checkpoint, Tokenizer
import re, os
import codecs
maxlen = 100
config_path = '../bert/chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '../bert/chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '../bert/chinese_L-12_H-768_A-12/vocab.txt'
token_dict = {}
with codecs.open(dict_path, 'r', 'utf8') as reader:
    for line in reader:
        token = line.strip()
        token_dict[token] = len(token_dict)

class OurTokenizer(Tokenizer):
    def _tokenize(self, text):
        R = []
        for c in text:
            if c in self._token_dict:
                R.append(c)
            elif self._is_space(c):
                R.append('[unused1]')  # map whitespace to the untrained [unused1] token
            else:
                R.append('[UNK]')      # every remaining character becomes [UNK]
        return R

tokenizer = OurTokenizer(token_dict)

neg = pd.read_excel('neg.xls', header=None)
pos = pd.read_excel('pos.xls', header=None)

data = []
for d in neg[0]:
    data.append((d, 0))
for d in pos[0]:
    data.append((d, 1))

# split into training and validation sets at a 9:1 ratio
random_order = list(range(len(data)))
np.random.shuffle(random_order)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0]

def seq_padding(X, padding=0):
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])

class data_generator:
    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            idxs = list(range(len(self.data)))
            np.random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = tokenizer.encode(first=text)  # token ids and segment ids
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    [X1, X2, Y] = [], [], []

from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam

bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])
x = Lambda(lambda x: x[:, 0])(x)  # take the output vector at the [CLS] position
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy']
)
model.summary()

train_D = data_generator(train_data)
valid_D = data_generator(valid_data)

model.fit_generator(
    train_D.__iter__(),
    steps_per_epoch=len(train_D),
    epochs=5,
    validation_data=valid_D.__iter__(),
    validation_steps=len(valid_D)
)
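After fit_generator finishes, the fine-tuned classifier can be queried like any ordinary Keras model. The following is a minimal prediction sketch that reuses the tokenizer, maxlen, and model defined above; the example sentence is an invented placeholder.

# Minimal prediction sketch; 'tokenizer', 'maxlen' and 'model' come from the training script above.
text = u'这家餐厅的菜味道很不错'                         # made-up example sentence
x1, x2 = tokenizer.encode(first=text[:maxlen])           # token ids and segment ids
prob = model.predict([np.array([x1]), np.array([x2])])[0][0]
label = 1 if prob > 0.5 else 0                           # threshold the sigmoid output of the Dense(1) head
print('positive probability: %.4f, label: %d' % (prob, label))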
The core model-construction part of the training script is the following:
bert_model = load_trained_model_from_checkpoint(config_path, checkpoint_path, seq_len=None)

for l in bert_model.layers:
    l.trainable = True

x1_in = Input(shape=(None,))
x2_in = Input(shape=(None,))

x = bert_model([x1_in, x2_in])
x = Lambda(lambda x: x[:, 0])(x)
p = Dense(1, activation='sigmoid')(x)

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-5),  # use a sufficiently small learning rate
    metrics=['accuracy']
)
model.summary()
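One detail worth calling out in this snippet is the loop that sets l.trainable = True on every layer of bert_model: it is exactly what turns BERT into a trainable part of the network, and it is also why such a small learning rate (1e-5) is used. When GPU memory or training time is tight, a common variant is to freeze BERT and train only the classification head. A minimal sketch of that variant follows, reusing the tensors defined above; the larger learning rate is an assumed value, not something from the original example.

# Variant sketch: keep the pretrained BERT weights frozen and train only the Dense head.
for l in bert_model.layers:
    l.trainable = False                # freeze every BERT layer

model = Model([x1_in, x2_in], p)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-3),              # assumed value: a larger rate is acceptable when only the head is trained
    metrics=['accuracy']
)

Freezing trades some accuracy for a far smaller number of trainable parameters and a faster, lighter training run.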
Returning to the snippet: we first load bert_model, whose two arguments, config_path and checkpoint_path, are obvious once the pretrained BERT model has been downloaded.
Next, notice that bert_model requires two inputs, x1_in and x2_in. Where do these inputs come from? They are in fact exactly the inputs required by the first way of using BERT, i.e. the first two values returned by the text2input function in the code below (this code is an example of the first usage method):
# -*- coding: UTF-8 -*-
# Author: 囚生
# Utility functions for calling the BERT model
import os
import tensorflow as tf
from bert import modeling, tokenization

def text2input(text, tokenizer,       # takes the text and a tokenizer
               maxlen=100,            # maximum number of tokens
               return_tensor=True,    # return tensors; otherwise return lists
               ):                     # convert a piece of text into BERT inputs
    tokens = tokenizer.tokenize(text)              # tokenize the text
    if len(tokens) > maxlen - 2:                   # keep at most maxlen-2 tokens, leaving room for the [CLS] and [SEP] markers
        tokens = tokens[:maxlen - 2]
    tokens_bert = ["[CLS]"]                        # token list, starting with the sentence-start marker
    token_type_ids = [0]                           # segment-id list, starting with the id for [CLS]
    for token in tokens:                           # append each token and its segment id
        tokens_bert.append(token)
        token_type_ids.append(0)                   # segment ids (0, 1, 2, ...) mark which sentence a token belongs to; this function handles a single sentence, so they are all 0
    tokens_bert.append("[SEP]")                    # sentence separator
    token_type_ids.append(0)                       # segment id for the separator
    input_ids = tokenizer.convert_tokens_to_ids(tokens_bert)  # map tokens to vocabulary ids
    input_mask = [1] * len(input_ids)              # attention mask
    while len(input_ids) < maxlen:                 # zero-pad up to maxlen
        input_ids.append(0)
        input_mask.append(0)
        token_type_ids.append(0)
    if return_tensor:                              # optionally convert the lists to tensors
        input_ids = tf.convert_to_tensor([input_ids], dtype=tf.int32, name="input_ids")
        input_mask = tf.convert_to_tensor([input_mask], dtype=tf.int32, name="input_mask")
        token_type_ids = tf.convert_to_tensor([token_type_ids], dtype=tf.int32, name="token_type_ids")
    return input_ids, input_mask, token_type_ids   # the three inputs BERT expects

def load_model(input_ids, input_mask, token_type_ids, cpath, mpath,
               ):                                  # load the model
    config = modeling.BertConfig.from_json_file(cpath)   # load the configuration file
    config_session = tf.ConfigProto()              # session configuration
    config_session.gpu_options.allow_growth = True # allocate GPU memory on demand
    with tf.Session(config=config_session).as_default() as session:
        model = modeling.BertModel(                # build the model
            config=config,                         # BERT configuration
            is_training=True,                      # training mode
            input_ids=input_ids,                   # input: token ids
            input_mask=input_mask,                 # input: attention mask
            token_type_ids=token_type_ids,         # input: segment ids
            use_one_hot_embeddings=False,          # do not use one-hot embeddings
        )
        saver = tf.train.Saver()                   # saver used to restore the checkpoint
        session.run(tf.global_variables_initializer())  # initialize first, then restore; otherwise the restored BERT weights would be re-initialized
        saver.restore(session, mpath)              # restore the pretrained weights from the ckpt file
        sequence_output = model.get_sequence_output()   # per-token output: shape (batch_size, sequence_length, embedding_size)
        pooled_output = model.get_pooled_output()       # per-sequence output: shape (batch_size, embedding_size)
        layers = model.all_encoder_layers               # outputs of all layers: shape (batch_size, sequence_length, embedding_size)
        embedding_output = model.get_embedding_output()
        embedding_table = model.get_embedding_table()
    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        sequence_output = session.run(sequence_output)
        pooled_output = session.run(pooled_output)