Python 实现自然语言处理之词干提取和词性还原
词干提取
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

# The three assignments below present alternative stemmers. Each one rebinds
# `stemmer`, so only the last (Snowball) is used by the stem() call.
# Porter stemmer (relatively lenient)
stemmer = pt.PorterStemmer()
# Lancaster stemmer (relatively strict)
stemmer = lc.LancasterStemmer()
# Snowball stemmer (in between the two)
stemmer = sb.SnowballStemmer('english')
r = stemmer.stem('playing')  # extract the stem
词性还原
与词干提取作用类似。词干提取出的词干信息不利于人工二次处理(人读不懂),而词性还原(lemmatization,也常译作“词形还原”)可以把名词复数等形式恢复为单数形式,更有利于人工二次处理。
import nltk.stem as ns

# Lemmatizer
lemmatizer = ns.WordNetLemmatizer()
# NOTE: `word` is not defined in this snippet; it stands for the token to
# lemmatize and must be supplied by the surrounding code.
n_lemm = lemmatizer.lemmatize(word, pos='n')  # lemmatize as a noun
v_lemm = lemmatizer.lemmatize(word, pos='v')  # lemmatize as a verb
案例:词干提取
"""
Stemmer comparison example.

Runs the Porter, Lancaster and Snowball stemmers over the same word list and
prints one row per word: the word followed by the three stems.
"""
import nltk.stem.porter as pt
import nltk.stem.lancaster as lc
import nltk.stem.snowball as sb

words = ['table', 'probably', 'wolves',
         'playing', 'is', 'the', 'beaches',
         'grouded', 'dreamt', 'envision']

pt_stemmer = pt.PorterStemmer()
lc_stemmer = lc.LancasterStemmer()
sb_stemmer = sb.SnowballStemmer('english')

for word in words:
    pt_stem = pt_stemmer.stem(word)
    lc_stem = lc_stemmer.stem(word)
    sb_stem = sb_stemmer.stem(word)
    # Each column is right-aligned in a field of width 8.
    print('%8s %8s %8s %8s' % (word, pt_stem, lc_stem, sb_stem))
提取的结果:
table tabl tabl tabl
probably probabl prob probabl
wolves wolv wolv wolv
playing play play play
is is is is
the the the the
beaches beach beach beach
grouded groud groud groud
dreamt dreamt dreamt dreamt
envision envis envid envis
案例:词性还原
"""
词性还原
"""
import nltk.stem as ns
import nltk

# Fetch the WordNet corpus that WordNetLemmatizer looks words up in.
nltk.download('wordnet')

words = ['table', 'probably', 'wolves',
         'playing', 'is', 'the', 'beaches',
         'grouded', 'dreamt', 'envision']

lemmatizer = ns.WordNetLemmatizer()

for word in words:
    # Lemmatize each word twice: once as a noun, once as a verb.
    n_lemm = lemmatizer.lemmatize(word, pos='n')
    v_lemm = lemmatizer.lemmatize(word, pos='v')
    # Each column is right-aligned in a field of width 8.
    print('%8s %8s %8s' % (word, n_lemm, v_lemm))
如下是词性还原的结果:
table table table
probably probably probably
wolves wolf wolves
playing playing play
is is be
the the the
beaches beach beach
grouded grouded grouded
dreamt dreamt dream
envision envision envision