动手实践bert+BiLstm+crf

更新时间:2023-06-11 19:51:28 阅读: 评论:0

动⼿实践bert+BiLstm+crf
⽹上⼤部分都是使⽤ChinaPeoplesDailyNerCorpus语料做的,真正应⽤到⾃已的语料和需求中还是有⼀些坑,这⾥整理记录⼀下
⾸先明确语料需要处理成什么格式,贴图理解⼀下
这里面需要搞清楚几点:我们的语料最小粒度是字级别的,然后每句话结束会有一个空行(当年踩过的坑)。后面的标记简单科普一下,专业人士直接跳过:大写 O 表示非实体,B-ORG 表示机构名开头第一个字,I-ORG 表示机构名中间的字;有些语料可能还会有结束标记(E-),这里只使用了开头和中间。当然你可能还需要识别人名(B-PER, I-PER)、地名(B-LOC, I-LOC),同理。
import re
# txt2ner_train_data turn label str into ner trainable data
# s :labeled str  eg.'我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华⼤学#SCHOOL*]'
# save_path: ner_trainable_txt name
def str2ner_train_data(s, save_path):
    """Convert a labeled string into char-per-line NER training data.

    Input format: plain text with entities wrapped as ``[@entity#TYPE*]``,
    e.g. '我来到[@1999年#YEAR*]的[@上海#LOC*]'.
    Output: one "char TAG" pair per line written to *save_path*,
    using the BIOES-style scheme O / S- / B- / I- / E-.

    :param s: labeled string
    :param save_path: output text file path (overwritten)
    """
    ner_data = []
    # start offsets of every '[@' marker and end offsets of every '*]' marker
    begin = [m.start() for m in re.finditer(r'\[\@', s)]
    end = [m.end() for m in re.finditer(r'\*\]', s)]
    assert len(begin) == len(end)
    i = 0  # cursor into s
    j = 0  # index of the next annotation span
    while i < len(s):
        if i not in begin:
            # plain character outside any annotation -> non-entity tag 'O'
            ner_data.append([s[i], 'O'])
            i += 1
        else:
            # slice out "entity#TYPE" between '[@' and '*]'
            ann = s[i + 2:end[j] - 2]
            # rsplit with maxsplit=1 so a '#' inside the entity text survives
            entity, ner = ann.rsplit('#', 1)
            if len(entity) == 1:
                ner_data.append([entity, 'S-' + ner])
            elif len(entity) == 2:
                ner_data.append([entity[0], 'B-' + ner])
                ner_data.append([entity[1], 'E-' + ner])
            else:
                ner_data.append([entity[0], 'B-' + ner])
                for n in range(1, len(entity) - 1):
                    ner_data.append([entity[n], 'I-' + ner])
                ner_data.append([entity[-1], 'E-' + ner])
            # jump the cursor past the closing '*]'
            i = end[j]
            j += 1
    with open(save_path, 'w', encoding='utf-8') as f:
        for char, tag in ner_data:
            f.write(char + ' ' + str(tag))
            f.write('\n')
# txt2ner_train_data turn label str into ner trainable data
# file_path :labeled multi lines' txt  eg.'我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华⼤学#SCHOOL*]' # save_path: ner_trainable_txt name
def txt2ner_train_data(file_path, save_path):
    """Read a multi-line labeled txt file and convert it to NER training data.

    Each line may contain ``[@entity#TYPE*]`` annotations; newlines and
    spaces are stripped, the lines are concatenated into one string and
    passed to :func:`str2ner_train_data`.

    :param file_path: labeled input txt file
    :param save_path: output training-data file path
    """
    with open(file_path, 'r', encoding='utf-8') as fr:
        lines = fr.readlines()
    s = ''
    for line in lines:
        line = line.replace('\n', '')
        line = line.replace(' ', '')
        s = s + line
    str2ner_train_data(s, save_path)
if __name__ == '__main__':
    # demo: convert one labeled string, then a labeled annotation file
    s = '我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华⼤学#SCHOOL*]'
    save_path = 's.txt'
    str2ner_train_data(s, save_path)
    file_path = 'D:\\codes\\python_codes\\SUTDAnnotator-master\\demotext\\ann'
    txt2ner_train_data(file_path, 's1.txt')
通过观察可以发现,我们需要将语料处理成下⾯这种格式
''我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华⼤学#SCHOOL*]''
抽象一下就是实体部分需要是[@实体部分#实体类别*]
那么接下来只需要将⼀段话或者⼀篇⽂章中的实体部分处理上述格式,任务很明确,直接上代码分析
# coding=utf-8
from config import *
from pymysql import *
def label_data():
    """Pull deal/company names from the database and annotate them.

    For each row of dw_deals_qmp, wrap occurrences of the deal name as
    ``[@...#PRO*]`` and of the company name as ``[@...#COMP*]`` inside the
    introduction text, then append the annotated line to the output file.
    Relies on HOST/DATABASE/USER/PASSWORD from config and pymysql's connect.
    """
    conn = connect(host=HOST, port=3306, database=DATABASE, user=USER,
                   password=PASSWORD, charset='utf8')
    cs1 = conn.cursor()
    sql1 = 'SELECT deal_name, company_name, introduce from dw_deals_qmp ORDER BY id limit 100'
    # the query must be executed before fetchall() returns any rows
    cs1.execute(sql1)
    pnlist = []
    alldata = cs1.fetchall()
    for s in alldata:
        pnlist.append([s[0], s[1], s[2]])
    cs1.close()
    conn.close()
    for s_li in pnlist:
        deal_name = s_li[0]
        company_name = s_li[1]
        introduce = s_li[2]
        new_intro = ''
        # annotate deal_name as a project (PRO)
        if deal_name in introduce:
            new_intro = introduce.replace(deal_name, '[@{}#PRO*]'.format(deal_name))
        # annotate company_name as a company (COMP)
        if deal_name != company_name:
            if company_name in introduce:
                # NOTE(review): the receiver of this replace was garbled in the
                # original; replacing on new_intro (when already set) keeps the
                # PRO annotation from above -- confirm against the source blog.
                base = new_intro if new_intro else introduce
                new_intro = base.replace(company_name, '[@{}#COMP*]'.format(company_name))
        else:
            if company_name in introduce:
                # deal and company share the same text; tag it as COMP
                new_intro = introduce.replace(company_name, '[@{}#COMP*]'.format(company_name))
        # append the annotated line to the ann file
        # NOTE(review): path ends with '/' -- a file name appears to have been
        # lost in transcription; supply one before running.
        with open('/Users/Devintern/Documents/pachong/Ner/', 'a') as f:
            f.write(new_intro + '\r\n')
def custom_corpus():
    """Annotate a hand-written example instead of querying the database.

    Wraps occurrences of the hard-coded deal name and company name inside
    the introduction text as ``[@...#ORG*]`` and appends the result to the
    annotation file.
    """
    deal_name = '汇盈环保'
    company_name = '江西汇盈环保科技有限公司'
    introduce = '⾦圆股份(000546)(000546.SZ)公布,公司控股⼦公司江西新⾦叶实业有限公司(“新⾦叶”)于2019年05⽉20⽇在浙江省兰溪市与上饶县星灿环保科技研发中⼼(有限合伙)及陈奇峰签订股权收购协议,以1.415亿元的价格收购江西汇盈环保科技有限公司(“江西汇盈”)100%股权。江西汇盈为资源化综合利⽤企业,地处江西省上饶市铅⼭县,已于2019年4⽉18⽇取得危险废物经营许可证(赣环危废临证字(2019)07号),核准经营规模为13.2667万吨/年。此次通过控股⼦公司新⾦叶收购江西汇盈100%股权,新增13.2667万吨/年的危险废物处置产能。江西汇盈投产后将与新⾦叶实现优势互补、协同发展,提⾼公司在资源化综合利⽤领域的竞争⼒及盈利能⼒,⼒争成为江西省资源化综合利⽤业务区域龙头,进⼀步深化落实公司环保发展战略,符合公司整体发展战略规划'
    new_intro = ''
    # annotate deal_name
    if deal_name in introduce:
        new_intro = introduce.replace(deal_name, '[@{}#ORG*]'.format(deal_name))
    # annotate company_name (original had this condition duplicated twice)
    if deal_name != company_name:
        if company_name in introduce:
            # NOTE(review): receiver was garbled in the original; replacing on
            # new_intro (when already set) keeps the first annotation -- confirm.
            base = new_intro if new_intro else introduce
            new_intro = base.replace(company_name, '[@{}#ORG*]'.format(company_name))
    else:
        if company_name in introduce:
            new_intro = introduce.replace(company_name, '[@{}#ORG*]'.format(company_name))
    # append to the annotation file
    # NOTE(review): path ends with '/' -- a file name appears to have been
    # lost in transcription; supply one before running.
    with open('/Users/Devintern/Documents/pachong/Ner/', 'a') as f:
        f.write(new_intro + '\r\n')
def main():
    """Entry point: build the annotated corpus from the database."""
    # process database rows
    label_data()
    # alternative: process the hand-written example instead
    # custom_corpus()
if __name__ == '__main__':
    main()
简单解释⼀下,因为这⾥我的需求只是需要标记出项⽬名(PRO)和公司名(COMP),别的不关注,⼀个
项⽬名和公司名对应⼀句话,所以只需要处理出这句话中的项⽬名和公司名就可以,需求很简单,上述有两个⽅法,⼀个是读取数据库数据然后处理,另⼀个⽅法是直接输⼊语料处理,选择其⼀就可以。简单看⼀下处理后的效果
接下来就直接使⽤上述参考的⽅法,这⾥有⼀些改动,我将最新的代码贴上来
import re
# txt2ner_train_data turn label str into ner trainable data
# s :labeled str  eg.'我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华⼤学#SCHOOL*]'
# save_path: ner_trainable_txt name
def str2ner_train_data(s, save_path):
    """Convert a labeled string into char-per-line NER training data (BIO).

    Input format: plain text with entities wrapped as ``[@entity#TYPE*]``.
    Differences from the earlier version: only B-/I- tags are emitted (no
    S-/E-), output is appended rather than overwritten, and a blank line is
    inserted after sentence-ending punctuation (。?!) so each sentence
    forms its own training sample.

    :param s: labeled string
    :param save_path: output text file path (appended to)
    """
    ner_data = []
    # start offsets of every '[@' marker and end offsets of every '*]' marker
    begin = [m.start() for m in re.finditer(r'\[\@', s)]
    end = [m.end() for m in re.finditer(r'\*\]', s)]
    assert len(begin) == len(end)
    i = 0  # cursor into s
    j = 0  # index of the next annotation span
    while i < len(s):
        if i not in begin:
            ner_data.append([s[i], 'O'])
            i += 1
        else:
            # slice out "entity#TYPE" between '[@' and '*]'
            ann = s[i + 2:end[j] - 2]
            # rsplit with maxsplit=1 so a '#' inside the entity text survives
            entity, ner = ann.rsplit('#', 1)
            # BIO scheme: first char B-, every remaining char I-
            ner_data.append([entity[0], 'B-' + ner])
            for n in range(1, len(entity)):
                ner_data.append([entity[n], 'I-' + ner])
            i = end[j]
            j += 1
    with open(save_path, 'a', encoding='utf-8') as f:
        for each in ner_data:
            f.write(each[0] + ' ' + str(each[1]))
            if each[0] in ('。', '?', '!'):
                # blank line separates sentences in the training file
                f.write('\n')
                f.write('\n')
            else:
                f.write('\n')
三齐儿童网# txt2ner_train_data turn label str into ner trainable data
# file_path :labeled multi lines' txt  eg.'我来到[@1999年#YEAR*]的[@上海#LOC*]的[@东华⼤学#SCHOOL*]' # save_path: ner_trainable_txt name
def txt2ner_train_data(file_path, save_path):
    """Read a multi-line labeled txt file and convert it to NER training data.

    Each line may contain ``[@entity#TYPE*]`` annotations; newlines and
    spaces are stripped, the lines are concatenated into one string and
    passed to :func:`str2ner_train_data`.

    :param file_path: labeled input txt file
    :param save_path: output training-data file path
    """
    with open(file_path, 'r', encoding='utf-8') as fr:
        lines = fr.readlines()
    s = ''
    for line in lines:
        line = line.replace('\n', '')
        line = line.replace(' ', '')
        s = s + line
    str2ner_train_data(s, save_path)
if __name__ == '__main__':
    # NOTE(review): save_path looks like a directory and the input path ends
    # with '/' -- file names appear lost in transcription; fill in real paths.
    save_path = '/Users/Devintern/Documents/pachong/'
    file_path = '/Users/Devintern/Documents/pachong/Ner/'
    txt2ner_train_data(file_path, save_path)

本文发布于:2023-06-11 19:51:28,感谢您对本站的认可!

本文链接:https://www.wtabcd.cn/fanwen/fan/90/141680.html

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。

标签:处理   江西   公司   需要   汇盈   综合   资源化
相关文章
留言与评论(共有 0 条评论)
   
验证码:
Copyright ©2019-2022 Comsenz Inc.Powered by © 专利检索| 网站地图