Movie Review Sentiment Analysis with word2vec
Import the required libraries
# bs4 nltk gensim
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
# nltk.download()
# Test that the nltk tokenizers data is installed
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
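A quick sanity check of the loaded tokenizer (a sketch; the sample sentence is an arbitrary illustration, not from the dataset):

# Punkt splits raw text into sentences
print(tokenizer.tokenize("I loved this movie. The acting was great!"))
# ['I loved this movie.', 'The acting was great!']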
Load the training data with pandas
df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df.head()
# sentiment: whether the reviewer liked the movie or not; review: the review text
Number of reviews: 25000

   id      sentiment  review
0  5814_8  1          With all this stuff going down at the ...
1  2381_9  1          "The Classic War of the Worlds" by ...
2  7759_3  0          The film starts with a manager (Nicholas Bell)...
3  3630_4  0          It must be assumed that those who ...
4  9495_8  1          Superbly trashy and wondrously ...
Preprocess the review data. The main steps (previewed in the sketch below, then walked through cell by cell) are:
1. Strip the HTML tags
2. Remove punctuation
3. Split into words/tokens
4. Remove stopwords
5. Rejoin into a new sentence
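Taken together, the five steps amount to only a few lines. A preview sketch (this uses nltk's English stopword list; the notebook itself later uses a custom stopword file):

raw = df['review'][0]                                   # a raw review from the dataframe
text = BeautifulSoup(raw, 'html.parser').get_text()     # 1. strip HTML tags
text = re.sub(r'[^a-zA-Z]', ' ', text)                  # 2. remove punctuation
tokens = text.lower().split()                           # 3. split into words/tokens
tokens = [w for w in tokens if w not in stopwords.words('english')]  # 4. drop stopwords
cleaned = ' '.join(tokens)                              # 5. rejoin into a new sentence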
df['review'][1000]
"I watched this movie really late last night and usually if it's late then I'm pretty forgiving of movies. Although I tried, I just could not stand this movie at all, it kept getting wor and wor as the movie went on. Although I know it's suppo to be # 去掉HTML标签的数据
example = BeautifulSoup(df['review'][1000], 'html.parr').get_text()
example
"I watched this movie really late last night and usually if it's late then I'm pretty forgiving of movies. Although I tried, I just could not stand this movie at all, it kept getting wor and wor as the movie went on. Although I know it's suppo to be # 去掉标点符号
example_letters = re.sub(r'[^a-zA-Z]', ' ', example)
example_letters
'I watched this movie really late last night and usually if it s late then I m pretty forgiving of movies Although I tried I just could not stand this movie at all it kept getting worse and worse as the movie went on Although I know it s supposed to be ...'

words = example_letters.lower().split()
words
['i',
'watched',
'this',
'movie',
'really',
'late',
'last',
'night',
'and',
'usually',
'if',
'it',
's',
'late',
'then',
'i',
'm',
'pretty',
'forgiving',
'of',
'movies',
'although',
'i',
'tried',
'i',
'just',
'could',
'not',
'stand',
'this',
'movie',
'at',
'all',
'it',
'kept',
'getting',
'worse',
'and',
'worse',
'as',
'the',
'movie',
'went',
'on',
'although',
'i',
'know',
'it',
's',
'supposed',
'to',
'be',
'a',
'comedy',
'but',
'i',
'didn',
't',
'find',
'it',
'very',
'funny',
'it',
'was',
'also',
'an',
'especially',
'unrealistic',
'and',
'jaded',
'portrayal',
'of',
'rural',
'life',
'in',
'case',
'this',
'is',
'what',
'any',
'of',
'you',
'think',
'country',
'life',
'is',
'like',
'it',
's',
'definitely',
'not',
'i',
'do',
'have',
'to',
'agree',
'that',
'some',
'of',
'the',
'guy',
'cast',
'members',
'were',
'cute',
'but',
'the',
'french',
'guy',
'was',
'really',
'fake',
'i',
'do',
'have',
'to',
'agree',
'that',
'it',
'tried',
'to',
'have',
'a',
'good',
'lesson',
'in',
'the',
'story',
'but',
'overall',
'my',
'recommendation',
'is',
'that',
'no',
'one',
'over',
'watch',
'it',
'it',
's',
'just',
'too',
'annoying']
# Download the stopwords and other corpora if needed
# nltk.download()

# Remove stopwords
stopwords = {}.fromkeys([line.rstrip() for line in open('../')])
words_nostop = [w for w in words if w not in stopwords]
words_nostop
['watched',
'movie',
'late',
'night',
'late',
'pretty',
'forgiving',
'movies',
'stand',
'movie',
'worse',
'worse',
'movie',
'supposed',
'comedy',
'didn',
'funny',
'unrealistic',
'jaded',
'portrayal',
'rural',
'life',
'country',
'life',
'agree',
'guy',
'cast',
'cute',
'french',
'guy',
'fake',
'agree',
'lesson',
'story',
'recommendation',
'watch',
'annoying']
eng_stopwords = set(stopwords)
def clean_text(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]
    return ' '.join(words)
df['review'][1000]
"I watched this movie really late last night and usually if it's late then I'm pretty forgiving of movies. Although I tried, I just could not stand this movie at all, it kept getting wor and wor as the movie went on. Although I know it's suppo to be clean_text(df['review'][1000])
'watched movie late night late pretty forgiving movies stand movie worse worse movie supposed comedy didn funny unrealistic jaded portrayal rural life country life agree guy cast cute french guy fake agree lesson story recommendation watch annoying'

Add the cleaned data to the dataframe
df['clean_review'] = df.review.apply(clean_text)  # clean_review holds the cleaned text
df.head()
   id      sentiment  review                                             clean_review
0  5814_8  1          With all this stuff going down at the ...         stuff moment mj ve started listening ...
1  2381_9  1          "The Classic War of the Worlds" by ...            classic war worlds timothy hines entertaining ...
2  7759_3  0          The film starts with a manager (Nicholas Bell)... film starts manager nicholas bell ...
3  3630_4  0          It must be assumed that those who ...             assumed praised film filmed opera didn ...
4  9495_8  1          Superbly trashy and wondrously ...                superbly trashy wondrously ...
Extract bag-of-words features (with sklearn's CountVectorizer)
vectorizer = CountVectorizer(max_features = 5000)  # keep the 5,000 most frequent words, giving 5,000-dimensional vectors
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()  # convert the text into bag-of-words feature vectors
train_data_features.shape
(25000, 5000)
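As a quick check, we can peek at the vocabulary the vectorizer learned (a sketch using get_feature_names(), the API of the sklearn generation this notebook runs on):

vocab = vectorizer.get_feature_names()  # the 5,000 kept words, alphabetically sorted
print(len(vocab))   # 5000
print(vocab[:10])   # first few feature words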
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment, test_size = 0.2, random_state = 0)

C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Confusion matrix

This function can be reused as a template.
import matplotlib.pyplot as plt
import itertools
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
Train a classifier

# Use logistic regression for this baseline classification task; no word2vec yet.
LR_model = LogisticRegression()
LR_model = LR_model.fit(X_train, y_train)
y_pred = LR_model.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0] + cnf_matrix[1,1]))
# Accuracy
print("accuracy metric in the testing dataset: ", (cnf_matrix[1,1] + cnf_matrix[0,0])/(cnf_matrix[0,0] + cnf_matrix[1,1]+cnf_matrix[1,0]+cnf_matrix[0,1]))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.853...
accuracy metric in the testing dataset: 0.8454
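RandomForestClassifier was imported above but never used. As a hedged sketch, the same bag-of-words features can be fed to a random forest for comparison (n_estimators=100 is an illustrative choice, not from the original notebook):

forest = RandomForestClassifier(n_estimators=100)  # 100 trees
forest = forest.fit(X_train, y_train)
print(forest.score(X_test, y_test))  # mean accuracy on the held-out 20%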
Using word2vec
df = pd.read_csv('../data/unlabeledTrainData.tsv', sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df.head()
Number of reviews: 50000

   id       review
0  9999_0   Watching Time Chasers, it obvious that it was ...
1  45057_0  I saw this film about 20 years ago ...
2  15561_0  Minor Spoilers<br /><br />In New York, ...
3  7161_0   I went to see this film with a great deal ...
4  43971_0  Yes, I agree with everyone on this site ...
df['clean_review'] = df.review.apply(clean_text)
df.head()
   id       review                                             clean_review
0  9999_0   Watching Time Chasers, it obvious that it ...     watching time chasers obvious bunch ...
1  45057_0  I saw this film about 20 years ago ...            film ago remember nasty bad true ...
2  15561_0  Minor Spoilers<br /><br />In New York, ...         minor spoilersin york joan barnard ...
3  7161_0   I went to see this film with a great deal ...     film deal excitement school director ...
4  43971_0  Yes, I agree with everyone on this site ...        agree site movie bad call movie insult movies ...
review_part = df['clean_review']
review_part.shape
(50000,)
import warnings
warnings.filterwarnings("ignore")
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # load the English sentence splitter; word2vec models words in context, so reviews are first split into sentences

def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences
sentences = sum(review_part.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(review_part), len(sentences)))

50000 reviews -> 50000 sentences

(Each review yields exactly one "sentence" here because clean_review has already had its punctuation stripped, so Punkt finds no sentence boundaries.)
sentences[0]

'watching time chasers obvious bunch friends sitting day film school hey pool money bad movie bad movie dull story bad script lame acting poor cinematography bottom barrel stock music corners cut prevented film release life'

sentences_list = []
for line in sentences:
    sentences_list.append(nltk.word_tokenize(line))
The main Word2Vec parameters (a sketch combining them follows this list):
sentences: the training corpus; can be a list (of tokenized sentences).
sg: selects the training algorithm; the default 0 uses CBOW, while sg=1 uses skip-gram.
size: the dimensionality of the word vectors, default 100. Larger sizes need more training data but give better results; recommended values run from tens to a few hundred. # values up to about 300 work well
window: the maximum distance between the current word and the predicted word within a sentence; the sliding context window.
alpha: the learning rate.
seed: seeds the random number generator used to initialize the word vectors.
min_count: truncates the vocabulary; words occurring fewer than min_count times are discarded. Default 5.
max_vocab_size: a RAM limit while building the vocabulary. If the number of unique words exceeds it, the least frequent ones are pruned. Roughly 1 GB of RAM per 10 million word types; None means no limit. (Few corpora have 10 million unique words, so None is fine.)
workers: the number of parallel training threads.
hs: if 1, hierarchical softmax (a Huffman tree) is used; if 0 (the default), negative sampling is used.
negative: if > 0, negative sampling is used, and this sets how many noise words are drawn.
iter: the number of training epochs, default 5.
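For reference, a hedged sketch of how these parameters map onto a Word2Vec call. The skip-gram/negative-sampling settings are illustrative only; the notebook's actual training call below sticks with the CBOW defaults:

from gensim.models.word2vec import Word2Vec

# Illustrative only: skip-gram (sg=1) with negative sampling (hs=0, negative=10),
# 300-dimensional vectors, a 10-word window, 5 epochs (gensim < 4.0 keyword names).
demo_model = Word2Vec(sentences_list, sg=1, size=300, window=10,
                      min_count=40, hs=0, negative=10, iter=5, workers=4)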
# Set the word2vec training parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count threshold
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size

model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)
Modeling with gensim
from gensim.models.word2vec import Word2Vec
model = Word2Vec(ntences_list, workers=num_workers, \
size=num_features, min_count = min_word_count, \
window = context)
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model.save(os.path.join('..', 'models', model_name))  # persist the trained model
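Later, the saved model can be restored without retraining (a sketch, assuming the same path used in model.save above):

loaded_model = Word2Vec.load(os.path.join('..', 'models', model_name))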
print(model.doesnt_match(['man','woman','child','kitchen']))  # similarity check: returns the word least related to the others
#print(model.doesnt_match('france england germany berlin'.split()))
kitchen
model.most_similar("boy")

[('girl', 0.7018299698829651),
('astro', 0.6647905707359314),
('teenage', 0.6317306160926819),
('frat', 0.60948246717453),
('dad', 0.601148...),
('yr', 0.6010577082633972),
('teenager', 0.5974895358085632),
('brat', 0.5941195487976074),
('joshua', 0.5832049250602722),
('father', 0.5825375914573669)]
model.most_similar("bad")

[('worse', 0.7071679830551147),
('horrible', 0.7065873742103577),
('terrible', 0.6872220635414124),
('sucks', 0.6666240692138672),
('crappy', 0.6634873747825623),
('lousy', 0.6494461297988892),
('horrendous', 0.6371070742607117),
('atrocious', 0.62550288438797),
('suck', 0.6224384307861328),
('awful', 0.619296669960022)]
df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
df.head()
   id      sentiment  review
0  5814_8  1          With all this stuff going down at the ...
1  2381_9  1          "The Classic War of the Worlds" by ...
2  7759_3  0          The film starts with a manager (Nicholas Bell)...
3  3630_4  0          It must be assumed that those who ...
4  9495_8  1          Superbly trashy and wondrously ...
from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words
def to_review_vector(review):
    global word_vec

    review = clean_text(review, remove_stopwords=True)
    #print(review)
    #words = nltk.word_tokenize(review)

    word_vec = np.zeros((1, 300))
    for word in review:
        #word_vec = np.zeros((1,300))
        if word in model:
            word_vec += np.array([model[word]])
    #print(word_vec.mean(axis=0))
    return pd.Series(word_vec.mean(axis=0))
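Note that word_vec has shape (1, 300), so mean(axis=0) merely squeezes the sum to a length-300 Series: each review vector is the sum of its word vectors. A hedged variant that averages instead, which is less sensitive to review length (to_review_vector_mean is a name introduced here for illustration):

def to_review_vector_mean(review):
    words = clean_text(review, remove_stopwords=True)
    vecs = [model[w] for w in words if w in model]  # vectors for in-vocabulary words
    if not vecs:
        return pd.Series(np.zeros(300))  # no known words: fall back to a zero vector
    return pd.Series(np.mean(vecs, axis=0))  # average instead of sum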
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()
[5 rows × 300 columns of review-vector components; numeric output omitted]
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment, test_size = 0.2, random_state = 0)
LR_model = LogisticRegression()
LR_model = LR_model.fit(X_train, y_train)
y_pred = LR_model.predict(X_test)
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Recall metric in the testing datat: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
print("accuracy metric in the testing datat: ", (cnf_matrix[1,1]+cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[1,1]+cnf_matrix[1,0]+cnf_matrix[0,1])) # Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
, classes=class_names
, title='Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.87969004894
accuracy metric in the testing dataset: 0.865
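The word2vec features lift recall from about 0.853 to 0.880 and accuracy from 0.8454 to 0.865 over the bag-of-words baseline. As a final hedged sketch, the trained pipeline can score a brand-new review (the review text here is an arbitrary example):

new_review = "This movie was surprisingly touching and well acted."
new_features = to_review_vector(new_review).values.reshape(1, -1)  # 1 x 300
print(LR_model.predict(new_features))        # 1 = positive, 0 = negative
print(LR_model.predict_proba(new_features))  # class probabilities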