Getting started with HanLP (standard tokenization, NLP tokenization, index tokenization, N-shortest-path segmentation, CRF segmentation, high-speed dictionary tokenization, and custom dictionaries)
Straight to the code:
# -*- coding: utf-8 -*-
from pyhanlp import *

# Chinese word segmentation with the default HanLP segmenter
print(HanLP.segment('你好,欢迎在Python中调用HanLP的API'))
print("-" * 70)

print("=" * 30 + " Standard tokenization " + "=" * 30)
StandardTokenizer = JClass('com.hankcs.hanlp.tokenizer.StandardTokenizer')
print(StandardTokenizer.segment('你好,欢迎在Python中调用HanLP的API'))
print("-" * 70)

# NLP tokenization: NLPTokenizer performs full named-entity recognition and part-of-speech tagging
print("=" * 30 + " NLP tokenization " + "=" * 30)
NLPTokenizer = JClass('com.hankcs.hanlp.tokenizer.NLPTokenizer')
print(NLPTokenizer.segment('中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程'))
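# The comment above notes that NLPTokenizer also runs NER and POS tagging; a minimal sketch
# to make the tags visible, assuming the standard HanLP 1.x Term fields `word` and `nature`:
for term in NLPTokenizer.segment('中国科学院计算技术研究所的宗成庆教授正在教授自然语言处理课程'):
    print(term.word, term.nature)  # each token together with its part-of-speech / entity tag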
print("-" * 70)

# Index tokenization keeps the character offset of every term, which is useful for building search indexes
print("=" * 30 + " Index tokenization " + "=" * 30)
IndexTokenizer = JClass('com.hankcs.hanlp.tokenizer.IndexTokenizer')
termList = IndexTokenizer.segment("主副食品")
for term in termList:
    print(str(term) + " [" + str(term.offset) + ":" + str(term.offset + len(term.word)) + "]")
print("-" * 70)

print("=" * 30 + " N-shortest-path segmentation " + "=" * 30)
# CRFSegment = JClass('com.hankcs.hanlp.seg.CRF.CRFSegment')
# segment = CRFSegment()
# testCase = "今天,刘志军案的关键人物,山西女商人丁书苗在市二中院出庭受审。"
# print(segment.seg("你看过穆赫兰道吗"))
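# The section header promises N-shortest-path segmentation, but the sample code above is
# commented out. A minimal sketch, assuming the NShortSegment class shipped with HanLP 1.x
# at com.hankcs.hanlp.seg.NShort.NShortSegment:
NShortSegment = JClass('com.hankcs.hanlp.seg.NShort.NShortSegment')
nshort_segment = NShortSegment().enableCustomDictionary(False) \
    .enablePlaceRecognize(True).enableOrganizationRecognize(True)
print(nshort_segment.seg("今天,刘志军案的关键人物,山西女商人丁书苗在市二中院出庭受审。"))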
print("-" * 70)

print("=" * 30 + " CRF segmentation " + "=" * 30)
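# No sample code was given for CRF segmentation. A minimal sketch, assuming HanLP >= 1.6,
# which provides com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer (the CRF model files from the
# HanLP data package must be installed for this to run):
CRFLexicalAnalyzer = JClass('com.hankcs.hanlp.model.crf.CRFLexicalAnalyzer')
analyzer = CRFLexicalAnalyzer()
print(analyzer.analyze("商品和服务"))  # segmentation plus POS tags from the CRF model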
print("-" * 70)

print("=" * 30 + " High-speed dictionary tokenization " + "=" * 30)
SpeedTokenizer = JClass('com.hankcs.hanlp.tokenizer.SpeedTokenizer')
print(SpeedTokenizer.segment('江西鄱阳湖干枯,中国最大淡水湖变成大草原'))
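# "High-speed" here means pure dictionary longest matching with no model lookups; a rough,
# illustrative benchmark (the numbers depend entirely on the machine, and 10000 iterations
# is an arbitrary count chosen for this sketch):
import time
text = '江西鄱阳湖干枯,中国最大淡水湖变成大草原'
pressure = 10000
start = time.time()
for _ in range(pressure):
    SpeedTokenizer.segment(text)
cost_time = time.time() - start
print("SpeedTokenizer: %.2f characters per second" % (len(text) * pressure / cost_time))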
print("-" * 70)

# Custom-dictionary segmentation: register new words at runtime so the segmenter keeps them whole
print("=" * 30 + " Custom-dictionary segmentation " + "=" * 30)
CustomDictionary = JClass('com.hankcs.hanlp.dictionary.CustomDictionary')
CustomDictionary.add('攻城狮')
CustomDictionary.add('单身狗')
HanLP = JClass('com.hankcs.hanlp.HanLP')
print(HanLP.segment('攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰'))
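# A custom word can also carry a POS tag and frequency; a hedged example using the
# two-argument insert overload from HanLP 1.x ("nz 1024" = proper-noun tag, frequency 1024):
CustomDictionary.insert('白富美', 'nz 1024')
print(HanLP.segment('攻城狮逆袭单身狗,迎娶白富美,走上人生巅峰'))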
print("-" * 70)