NLP(⼀)语料库和WordNet 访问语料库
NLTK数据库的安装:
NLTK语料库列表:
内部访问(以Reuters corpus为例):
import nltk
pus import reuters
# 下载路透社语料库
nltk.download('reuters')
# 查看语料库的内容
files = reuters.fileids()
print(files)
# 访问其中⼀个⽂件的内容
words14826 = reuters.words(['test/14826'])
print(words14826[:20])
# 输出主题(⼀共90个)
reutersGenres = reuters.categories()
print(reutersGenres)
# 访问⼀个主题,⼀句话⼀⾏输出
for w in reuters.words(categories=['tea']):
print(w + ' ',end='')
if w is '.':
print()
下载外部语料库并访问(以影评数据集为例)
下载数据集:
本例下载了1000积极和1000消极的影评
pus import CategorizedPlaintextCorpusReader
# 读取语料库
reader = CategorizedPlaintextCorpusReader(r'D:\PyCharm 5.0.3\WorkSpace\2.NLP\语料库\1.movie_review_data_1000\txt_ntoken',r'.*\.txt',cat_pattern=r'(\w+)/*') print(reader.categories())
print(reader.fileids())
# 语料库分成两类
posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')
# 从posFiles或negFiles随机选择⼀个⽂件
from random import randint
fileP = posFiles[randint(0,len(posFiles)-1)]
fileN = negFiles[randint(0,len(negFiles)-1)]
# 逐句打印随机的选择⽂件
for w in reader.words(fileP):
print(w + ' ',end='')
if w is '.':
print()
for w in reader.words(fileN):
print(w + ' ',end='')
if w is '.':
print()
CategorizedPlaintextCorpusReader类通过参数的设置,从内部将样本加载到合适的位置
语料库中的词频计算和计数分布分析
以布朗语料库为例:布朗⼤学 500个⽂本 15个类
import nltk
pus import brown
nltk.download('brown')
# 查看brown中的类别
print(brown.categories())
# 挑选出三种类别,并获取其中的疑问词
genres = ['fiction','humor','romance']
whwords = ['what','which','how','why','when','where','who']
# 迭代器分别分析3种类
for i in range(0,len(genres)):
genre = genres[i]
print()
print("Analysing '"+ genre + "' wh words")
genre_text = brown.words(categories = genre)
print(genre_text)
# 返回输⼊单词对象的wh类及对应的频率
fdist = nltk.FreqDist(genre_text)
for wh in whwords:
print(wh + ':',fdist[wh],end=' ')
print()
输出:
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction'] Analysing 'fiction' wh words
['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]
what: 128 which: 123 how: 54 why: 18 when: 133 where: 76 who: 103
Analysing 'humor' wh words
['It', 'was', 'among', 'the', 'that', 'Hinkle', ...]
what: 36 which: 62 how: 18 why: 9 when: 52 where: 15 who: 48
Analysing 'romance' wh words
['They', 'neither', 'liked', 'nor', 'disliked', 'the', ...]
what: 121 which: 104 how: 60 why: 34 when: 126 where: 54 who: 89
⽹络⽂本和聊天⽂本的词频分布
import nltk
pus import webtext
# nltk.download('webtext')
print(webtext.fileids())
# 选择⼀个数据⽂件,并计算频率分布,获得FreqDist的对象fdist
fileid = '' # 个⼈⼴告
wbt_words = webtext.words(fileid)
fdist = nltk.FreqDist(wbt_words)
# 获取⾼频单词及其计数
print('最多出现的词 "' , fdist.max() , '" :' , fdist[fdist.max()])
# 获取所有单词的计数
print(fdist.N())
# 找出最常见的10个词
st_common(10))
# 将单词和频率制成表格
print(fdist.tabulate(5))
# 将单词和频率制成分布图
fdist.plot(cumulative=True) # 计数显⽰,cumulative=percents为百分⽐显⽰
输出:
['', '', '', '', '', '']
最多出现的词 " , " : 539
4867
[(',', 539), ('.', 353), ('/', 110), ('for', 99), ('and', 74), ('to', 74), ('lady', 68), ('-', 66), ('eks', 60), ('a', 52)]
, . / for and
539 353 110 99 74
None
累计计数分布图:
使⽤WordNet获取⼀个词的不同含义
# import nltk
# nltk.download('wordnet')
pus import wordnet as wn
chair = 'chair'
# 输出chair的各种含义
chair_synts = wn.synts(chair)
print('Chair的意思:',chair_synts,'\n\n')
# 迭代输出含义,含义的定义,同义词条,例句
for synt in chair_synts:
print(synt,': ')
print('Definition: ',synt.definition())
print('Lemmas/Synonymous words: ',synt.lemma_names())
print('Example: ',amples(),'\n')
输出:
Chair的意思: [Synt('01'), Synt('01'), Synt('04'), Synt('01'), Synt('05'), Synt('chair.v.01'), Synt('moderate.v.01')] Synt('01') :
Definition: a at for one person, with a support for the back
Lemmas/Synonymous words: ['chair']
Example: ['he put his coat over the back of the chair and sat down']
Synt('01') :
Definition: the position of professor
Lemmas/Synonymous words: ['professorship', 'chair']
Example: ['he was awarded an endowed chair in economics']
Synt('04') :
Definition: the officer who presides at the meetings of an organization
Lemmas/Synonymous words: ['president', 'chairman', 'chairwoman', 'chair', 'chairperson']
Example: ['address your remarks to the chairperson']
Synt('01') :
Definition: an instrument of execution by electrocution; rembles an ordinary at for one person
Lemmas/Synonymous words: ['electric_chair', 'chair', 'death_chair', 'hot_at']
Example: ['the murderer was ntenced to die in the chair']
Synt('05') :
Definition: a particular at in an orchestra
Lemmas/Synonymous words: ['chair']
Example: ['he is cond chair violin']
Synt('chair.v.01') :
Definition: act or preside as chair, as of an academic department in a university
Lemmas/Synonymous words: ['chair', 'chairman']
Example: ['She chaired the department for many years']
Synt('moderate.v.01') :
Definition: preside over
Lemmas/Synonymous words: ['moderate', 'chair', 'lead']
Example: ['John moderated the discussion']
上位词和下位词
下位词更具体,上位词更⼀般(泛化)
以01和01为例:
pus import wordnet as wn
woman = wn.synt('01')
bed = wn.synt('01')
# 返回据有直系关系的同义词集,上位词!
print(woman.hypernyms())
woman_paths = woman.hypernym_paths()
# 打印从根节点到01的所有路径
for idx,path in enumerate(woman_paths):
print('\n\nHypernym Path :',idx+1)
for synt in path:
print(synt.name(),',',end='')
# 更具体的术语,下位词!
types_of_bed = bed.hyponyms()
print('\n\nTypes of beds(Hyponyms): ',types_of_bed)
# 打印出更有意义的lemma(词条)
print('\n',sorted(t(lemma.name() for synt in types_of_bed for lemma in synt.lemmas())))
输出:
[Synt('01'), Synt('02')]
Hypernym Path : 1
<01 ,01 ,01 ,01 ,01 ,01 ,
Hypernym Path : 2
<01 ,01 ,01 ,02 ,01 ,01 ,01 ,01 ,01 ,
Hypernym Path : 3
<01 ,01 ,01 ,01 ,02 ,01 ,
Hypernym Path : 4
<01 ,01 ,01 ,02 ,01 ,01 ,01 ,02 ,01 ,
Types of beds(Hyponyms): [Synt('03'), Synt('01'), Synt('03'), Synt('01'), Synt('03'), Synt('03'), Synt('02'), Synt('01'), Synt('01'), Syn ['Murphy_bed', 'berth', 'built-in_bed', 'built_in_bed', 'bunk', 'bunk_bed', 'camp_bed', 'cot', 'couch', 'deathbed', 'double_bed', 'four-poster', 'hammock', 'marriage_bed', 'plank-bed', 'platform_bed', 'sack', 'sickbed', 'single_bed', 'sleigh_bed', 'truckle', 'truckle
基于WordNet计算某种词性的多义性
以名词n为例:
pus import wordnet as wn
type = 'n' #动词v,副词r,形容词a
# 返回WordNet中所有type类型的同义词集
sysnets = wn.all_synts(type)
# 将所有词条合并成⼀个⼤list
lemmas = []
for sysnet in sysnets:
for lemma in sysnet.lemmas():
lemmas.append(lemma.name())
# 删除重复词条,list=>t
lemmas = t(lemmas)
# 计算每个词条type类型的含义数并加到⼀起
count = 0
for lemma in lemmas:
count = count + len(wn.synts(lemma,type)) # lemma在type类型下的所有含义
# 打印所有数值
print('%s总词条数: '%(type),len(lemmas))
print('%s总含义数: '%(type),count)
print('%s平均多义性: '%(type),count/len(lemmas))
输出:
n总词条数: 119034
n总含义数: 152763
n平均多义性: 1.2833560159282222