【深度学习】sentencepiece工具之BPE训练使用:为什么要使用BPE,BPE是什么
BPE:迭代的将字符串⾥出现频率最⾼的⼦串进⾏合并
训练过程
使⽤教程
# -*- coding: utf-8 -*-
#!/usr/bin/python3
import errno
import logging
import os
import re

import sentencepiece as spm

logging.basicConfig(level=logging.INFO)
def prepro(hp):
    """Prepare the IWSLT 2016 de-en corpus for NMT training.

    Steps: check the raw files exist, strip markup from train/eval/test,
    write the cleaned text to ``iwslt2016/prepro``, train a *joint* BPE
    model over the concatenated training sides with sentencepiece, then
    segment every split into ``iwslt2016/segmented``.

    Args:
        hp: parsed hyperparameters; only ``hp.vocab_size`` is read here.

    Raises:
        FileNotFoundError: if any expected raw file is missing.
        AssertionError: if a source/target pair has mismatched line counts.
    """
    print("# Check if raw files exist")
    # NOTE(review): several paths were truncated in the pasted source;
    # restored from the standard IWSLT2016 de-en layout — confirm.
    train1 = "iwslt2016/de-en/train.tags.de-en.de"
    train2 = "iwslt2016/de-en/train.tags.de-en.en"
    eval1 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.de.xml"
    eval2 = "iwslt2016/de-en/IWSLT16.TED.tst2013.de-en.en.xml"
    test1 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.de.xml"
    test2 = "iwslt2016/de-en/IWSLT16.TED.tst2014.de-en.en.xml"
    for f in (train1, train2, eval1, eval2, test1, test2):
        if not os.path.isfile(f):
            raise FileNotFoundError(errno.ENOENT, os.strerror(errno.ENOENT), f)

    print("# Preprocessing")

    def _prepro_train(path):
        # Training files are plain text with interleaved "<...>" meta
        # lines; keep only the real sentences.
        with open(path, mode='r', encoding="utf-8") as fin:
            return [line.strip() for line in fin.read().split("\n")
                    if not line.startswith("<")]

    def _prepro_xml(path):
        # Eval/test files are XML; keep only the "<seg id=...>" lines
        # and strip all tags, leaving bare sentences.
        with open(path, mode='r', encoding="utf-8") as fin:
            return [re.sub("<[^>]+>", "", line).strip()
                    for line in fin.read().split("\n")
                    if line.startswith("<seg id")]

    # train
    prepro_train1, prepro_train2 = _prepro_train(train1), _prepro_train(train2)
    assert len(prepro_train1) == len(prepro_train2), "Check if train source and target files match."

    # eval
    prepro_eval1, prepro_eval2 = _prepro_xml(eval1), _prepro_xml(eval2)
    assert len(prepro_eval1) == len(prepro_eval2), "Check if eval source and target files match."

    # test
    prepro_test1, prepro_test2 = _prepro_xml(test1), _prepro_xml(test2)
    assert len(prepro_test1) == len(prepro_test2), "Check if test source and target files match."

    print("Let's see how preprocessed data look like")
    print("prepro_train1:", prepro_train1[0])
    print("prepro_train2:", prepro_train2[0])
    print("prepro_eval1:", prepro_eval1[0])
    print("prepro_eval2:", prepro_eval2[0])
    print("prepro_test1:", prepro_test1[0])
    print("prepro_test2:", prepro_test2[0])

    print("# write preprocessed files to disk")
    os.makedirs("iwslt2016/prepro", exist_ok=True)

    def _write(sents, fname):
        # One sentence per line.
        with open(fname, mode='w', encoding="utf-8") as fout:
            fout.write("\n".join(sents))

    _write(prepro_train1, "iwslt2016/prepro/train.de")
    _write(prepro_train2, "iwslt2016/prepro/train.en")
    # The concatenated file feeds the *joint* (shared-vocab) BPE training.
    _write(prepro_train1 + prepro_train2, "iwslt2016/prepro/train")
    _write(prepro_eval1, "iwslt2016/prepro/eval.de")
    _write(prepro_eval2, "iwslt2016/prepro/eval.en")
    _write(prepro_test1, "iwslt2016/prepro/test.de")
    _write(prepro_test2, "iwslt2016/prepro/test.en")

    print("# Train a joint BPE model with sentencepiece")
    os.makedirs("iwslt2016/segmented", exist_ok=True)
    # pad/unk/bos/eos ids are pinned to 0..3 so downstream code can rely
    # on them; model files are written as iwslt2016/segmented/bpe.{model,vocab}.
    train = '--input=iwslt2016/prepro/train --pad_id=0 --unk_id=1 \
             --bos_id=2 --eos_id=3 \
             --model_prefix=iwslt2016/segmented/bpe --vocab_size={} \
             --model_type=bpe'.format(hp.vocab_size)
    spm.SentencePieceTrainer.Train(train)

    print("# Load trained bpe model")
    sp = spm.SentencePieceProcessor()
    sp.Load("iwslt2016/segmented/bpe.model")

    print("# Segment")

    def _segment_and_write(sents, fname):
        # Encode each sentence into BPE pieces, space-joined, one per line.
        with open(fname, mode="w", encoding="utf-8") as fout:
            for sent in sents:
                pieces = sp.EncodeAsPieces(sent)
                fout.write(" ".join(pieces) + "\n")

    _segment_and_write(prepro_train1, "iwslt2016/segmented/train.de.bpe")
    _segment_and_write(prepro_train2, "iwslt2016/segmented/train.en.bpe")
    _segment_and_write(prepro_eval1, "iwslt2016/segmented/eval.de.bpe")
    _segment_and_write(prepro_eval2, "iwslt2016/segmented/eval.en.bpe")
    _segment_and_write(prepro_test1, "iwslt2016/segmented/test.de.bpe")
    _segment_and_write(prepro_test2, "iwslt2016/segmented/test.en.bpe")

    print("Let's see how segmented data look like")
    print("train1:", open("iwslt2016/segmented/train.de.bpe", mode='r', encoding="utf-8").readline())
    print("train2:", open("iwslt2016/segmented/train.en.bpe", mode='r', encoding="utf-8").readline())
    print("eval1:", open("iwslt2016/segmented/eval.de.bpe", mode='r', encoding="utf-8").readline())
    print("eval2:", open("iwslt2016/segmented/eval.en.bpe", mode='r', encoding="utf-8").readline())
    print("test1:", open("iwslt2016/segmented/test.de.bpe", mode='r', encoding="utf-8").readline())
if __name__ == '__main__':
    # NOTE(review): Hparams is not defined in this chunk — presumably
    # `from hparams import Hparams` elsewhere in the project; confirm.
    hparams = Hparams()
    # The pasted source garbled these attribute names ("parr"/"par_args");
    # restored to the argparse convention: a parser exposing parse_args().
    parser = hparams.parser
    hp = parser.parse_args()
    prepro(hp)
    print("Done")