Movie Review Sentiment Analysis with word2vec
Import the required libraries
# bs4 nltk gensim
import os
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import stopwords
# nltk.download()
# Test that the nltk tokenizers data is installed
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
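A quick sanity check of the loaded tokenizer (a sketch; the sample sentence is an arbitrary illustration, not from the dataset):

# Punkt splits raw text into sentences
print(tokenizer.tokenize("I loved this movie. The acting was great!"))
# ['I loved this movie.', 'The acting was great!']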
Load the training data with pandas
df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df.head()
# sentiment: whether the reviewer liked the movie or not; review: the review text
Number of reviews: 25000

   id      sentiment  review
0  5814_8  1          With all this stuff going down at the ...
1  2381_9  1          "The Classic War of the Worlds" by ...
2  7759_3  0          The film starts with a manager (Nicholas Bell)...
3  3630_4  0          It must be assumed that those who ...
4  9495_8  1          Superbly trashy and wondrously ...
Preprocess the review data. The main steps (previewed in the sketch below, then walked through cell by cell) are:
1. Strip the HTML tags
2. Remove punctuation
3. Split into words/tokens
4. Remove stopwords
5. Rejoin into a new sentence
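Taken together, the five steps amount to only a few lines. A preview sketch (this uses nltk's English stopword list; the notebook itself later uses a custom stopword file):

raw = df['review'][0]                                   # a raw review from the dataframe
text = BeautifulSoup(raw, 'html.parser').get_text()     # 1. strip HTML tags
text = re.sub(r'[^a-zA-Z]', ' ', text)                  # 2. remove punctuation
tokens = text.lower().split()                           # 3. split into words/tokens
tokens = [w for w in tokens if w not in stopwords.words('english')]  # 4. drop stopwords
cleaned = ' '.join(tokens)                              # 5. rejoin into a new sentence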
df['review'][1000]
"I watched this movie really late last night and usually if it's late then I'm pretty forgiving of movies. Although I tried, I just could not stand this movie at all, it kept getting wor and wor as the movie went on. Although I know it's suppo to be # 去掉HTML标签的数据
example = BeautifulSoup(df['review'][1000], 'html.parr').get_text()
example
"I watched this movie really late last night and usually if it's late then I'm pretty forgiving of movies. Although I tried, I just could not stand this movie at all, it kept getting wor and wor as the movie went on. Although I know it's suppo to be # 去掉标点符号
example_letters = re.sub(r'[^a-zA-Z]', ' ', example)
example_letters
'I watched this movie really late last night and usually if it s late then I m pretty forgiving of movies Although I tried I just could not stand this movie at all it kept getting worse and worse as the movie went on Although I know it s supposed to be ...'

words = example_letters.lower().split()
words
['i',
'watched',
'this',
'movie',
'really',
'late',
'last',
'night',
'and',
'usually',
'if',
'it',
's',
'late',
'then',
'i',
'm',
'pretty',
'forgiving',
'of',
'movies',
'although',
'i',
'tried',
'i',
'just',
'could',
'not',
'stand',
'this',
'movie',
'at',
'all',
'it',
'kept',
'getting',
'worse',
'and',
'worse',
'as',
'the',
'movie',
'went',
'on',
'although',
'i',
'know',
'it',
's',
'supposed',
'to',
'be',
'a',
'comedy',
'but',
'i',
'didn',
't',
'find',
'it',
'very',
'funny',
'it',
'was',
'also',
'an',
'especially',
'unrealistic',
'and',
'jaded',
'portrayal',
'of',
'rural',
'life',
'in',
'case',
'this',
'is',
'what',
'any',
'of',
'you',
'think',
'country',
'life',
'is',
'like',
'it',
's',
'definitely',
'not',
'i',
'do',
'have',
'to',
'agree',
'that',
'some',
'of',
'the',
'guy',
'cast',
'members',
'were',
'cute',
'but',
'the',
'french',
'guy',
'was',
'really',
'fake',
'i',
'do',
'have',
'to',
'agree',
'that',
'it',
'tried',
'to',
'have',
'a',
'good',
'lesson',
'in',
'the',
'story',
'but',
'overall',
'my',
'recommendation',
'is',
'that',
'no',
'one',
'over',
'watch',
'it',
'it',
's',
'just',
'too',
'annoying']
# Download the stopwords and other corpora if needed
# nltk.download()

# Remove stopwords
stopwords = {}.fromkeys([line.rstrip() for line in open('../')])
words_nostop = [w for w in words if w not in stopwords]
words_nostop
['watched',
'movie',
'late',
'night',
'late',
'pretty',
'forgiving',
'movies',
'stand',
'movie',
'worse',
'worse',
'movie',
'supposed',
'comedy',
'didn',
'funny',
'unrealistic',
'jaded',
'portrayal',
'rural',
'life',
'country',
'life',
'agree',
'guy',
'cast',
'cute',
'french',
'guy',
'fake',
'agree',
'lesson',
'story',
'recommendation',
'watch',
'annoying']
eng_stopwords = set(stopwords)
def clean_text(text):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    words = [w for w in words if w not in eng_stopwords]
    return ' '.join(words)
df['review'][1000]
"I watched this movie really late last night and usually if it's late then I'm pretty forgiving of movies. Although I tried, I just could not stand this movie at all, it kept getting wor and wor as the movie went on. Although I know it's suppo to be clean_text(df['review'][1000])
'watched movie late night late pretty forgiving movies stand movie worse worse movie supposed comedy didn funny unrealistic jaded portrayal rural life country life agree guy cast cute french guy fake agree lesson story recommendation watch annoying'

Add the cleaned data to the dataframe
df['clean_review'] = df.review.apply(clean_text)  # clean_review holds the cleaned text
df.head()
   id      sentiment  review                                             clean_review
0  5814_8  1          With all this stuff going down at the ...         stuff moment mj ve started listening ...
1  2381_9  1          "The Classic War of the Worlds" by ...            classic war worlds timothy hines entertaining ...
2  7759_3  0          The film starts with a manager (Nicholas Bell)... film starts manager nicholas bell ...
3  3630_4  0          It must be assumed that those who ...             assumed praised film filmed opera didn ...
4  9495_8  1          Superbly trashy and wondrously ...                superbly trashy wondrously ...
Extract bag-of-words features (with sklearn's CountVectorizer)
vectorizer = CountVectorizer(max_features = 5000)  # keep the 5,000 most frequent words, giving 5,000-dimensional vectors
train_data_features = vectorizer.fit_transform(df.clean_review).toarray()  # convert the text into bag-of-words feature vectors
train_data_features.shape
(25000, 5000)
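As a quick check, we can peek at the vocabulary the vectorizer learned (a sketch using get_feature_names(), the API of the sklearn generation this notebook runs on):

vocab = vectorizer.get_feature_names()  # the 5,000 kept words, alphabetically sorted
print(len(vocab))   # 5000
print(vocab[:10])   # first few feature words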
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment, test_size = 0.2, random_state = 0)

C:\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
Confusion matrix

This function can be reused as a template.
import matplotlib.pyplot as plt
import itertools
def plot_confusion_matrix(cm, classes,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
Train a classifier

# Use logistic regression for this baseline classification task; no word2vec yet.
LR_model = LogisticRegression()
LR_model = LR_model.fit(X_train, y_train)
y_pred = LR_model.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)

print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0] + cnf_matrix[1,1]))
# Accuracy
print("accuracy metric in the testing dataset: ", (cnf_matrix[1,1] + cnf_matrix[0,0])/(cnf_matrix[0,0] + cnf_matrix[1,1]+cnf_matrix[1,0]+cnf_matrix[0,1]))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.853...
accuracy metric in the testing dataset: 0.8454
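RandomForestClassifier was imported above but never used. As a hedged sketch, the same bag-of-words features can be fed to a random forest for comparison (n_estimators=100 is an illustrative choice, not from the original notebook):

forest = RandomForestClassifier(n_estimators=100)  # 100 trees
forest = forest.fit(X_train, y_train)
print(forest.score(X_test, y_test))  # mean accuracy on the held-out 20%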
Using word2vec
df = pd.read_csv('../data/unlabeledTrainData.tsv', sep='\t', escapechar='\\')
print('Number of reviews: {}'.format(len(df)))
df.head()
Number of reviews: 50000

   id       review
0  9999_0   Watching Time Chasers, it obvious that it was ...
1  45057_0  I saw this film about 20 years ago ...
2  15561_0  Minor Spoilers<br /><br />In New York, ...
3  7161_0   I went to see this film with a great deal ...
4  43971_0  Yes, I agree with everyone on this site ...
df['clean_review'] = df.review.apply(clean_text)
df.head()
   id       review                                             clean_review
0  9999_0   Watching Time Chasers, it obvious that it ...     watching time chasers obvious bunch ...
1  45057_0  I saw this film about 20 years ago ...            film ago remember nasty bad true ...
2  15561_0  Minor Spoilers<br /><br />In New York, ...         minor spoilersin york joan barnard ...
3  7161_0   I went to see this film with a great deal ...     film deal excitement school director ...
4  43971_0  Yes, I agree with everyone on this site ...        agree site movie bad call movie insult movies ...
review_part = df['clean_review']
review_part.shape
(50000,)
import warnings
warnings.filterwarnings("ignore")
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')  # load the English sentence splitter; word2vec models words in context, so reviews are first split into sentences

def split_sentences(review):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = [clean_text(s) for s in raw_sentences if s]
    return sentences
sentences = sum(review_part.apply(split_sentences), [])
print('{} reviews -> {} sentences'.format(len(review_part), len(sentences)))

50000 reviews -> 50000 sentences

(Each review yields exactly one "sentence" here because clean_review has already had its punctuation stripped, so Punkt finds no sentence boundaries.)
sentences[0]

'watching time chasers obvious bunch friends sitting day film school hey pool money bad movie bad movie dull story bad script lame acting poor cinematography bottom barrel stock music corners cut prevented film release life'

sentences_list = []
for line in sentences:
    sentences_list.append(nltk.word_tokenize(line))
The main Word2Vec parameters (a sketch combining them follows this list):
sentences: the training corpus; can be a list (of tokenized sentences).
sg: selects the training algorithm; the default 0 uses CBOW, while sg=1 uses skip-gram.
size: the dimensionality of the word vectors, default 100. Larger sizes need more training data but give better results; recommended values run from tens to a few hundred. # values up to about 300 work well
window: the maximum distance between the current word and the predicted word within a sentence; the sliding context window.
alpha: the learning rate.
seed: seeds the random number generator used to initialize the word vectors.
min_count: truncates the vocabulary; words occurring fewer than min_count times are discarded. Default 5.
max_vocab_size: a RAM limit while building the vocabulary. If the number of unique words exceeds it, the least frequent ones are pruned. Roughly 1 GB of RAM per 10 million word types; None means no limit. (Few corpora have 10 million unique words, so None is fine.)
workers: the number of parallel training threads.
hs: if 1, hierarchical softmax (a Huffman tree) is used; if 0 (the default), negative sampling is used.
negative: if > 0, negative sampling is used, and this sets how many noise words are drawn.
iter: the number of training epochs, default 5.
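For reference, a hedged sketch of how these parameters map onto a Word2Vec call. The skip-gram/negative-sampling settings are illustrative only; the notebook's actual training call below sticks with the CBOW defaults:

from gensim.models.word2vec import Word2Vec

# Illustrative only: skip-gram (sg=1) with negative sampling (hs=0, negative=10),
# 300-dimensional vectors, a 10-word window, 5 epochs (gensim < 4.0 keyword names).
demo_model = Word2Vec(sentences_list, sg=1, size=300, window=10,
                      min_count=40, hs=0, negative=10, iter=5, workers=4)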
# Set the word2vec training parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count threshold
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size

model_name = '{}features_{}minwords_{}context.model'.format(num_features, min_word_count, context)
Modeling with gensim
from gensim.models.word2vec import Word2Vec
model = Word2Vec(ntences_list, workers=num_workers, \
size=num_features, min_count = min_word_count, \
window = context)
# If you don't plan to train the model any further, calling
# init_sims will make the model much more memory-efficient.
model.init_sims(replace=True)

# It can be helpful to create a meaningful model name and
# save the model for later use. You can load it later using Word2Vec.load()
model.save(os.path.join('..', 'models', model_name))  # persist the trained model
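Later, the saved model can be restored without retraining (a sketch, assuming the same path used in model.save above):

loaded_model = Word2Vec.load(os.path.join('..', 'models', model_name))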
print(model.doesnt_match(['man','woman','child','kitchen']))  # similarity check: returns the word least related to the others
#print(model.doesnt_match('france england germany berlin'.split()))
kitchen
model.most_similar("boy")

[('girl', 0.7018299698829651),
('astro', 0.6647905707359314),
('teenage', 0.6317306160926819),
('frat', 0.60948246717453),
('dad', 0.601148...),
('yr', 0.6010577082633972),
('teenager', 0.5974895358085632),
('brat', 0.5941195487976074),
('joshua', 0.5832049250602722),
('father', 0.5825375914573669)]
model.most_similar("bad")

[('worse', 0.7071679830551147),
('horrible', 0.7065873742103577),
('terrible', 0.6872220635414124),
('sucks', 0.6666240692138672),
('crappy', 0.6634873747825623),
('lousy', 0.6494461297988892),
('horrendous', 0.6371070742607117),
('atrocious', 0.62550288438797),
('suck', 0.6224384307861328),
('awful', 0.619296669960022)]
df = pd.read_csv('../data/labeledTrainData.tsv', sep='\t', escapechar='\\')
df.head()
   id      sentiment  review
0  5814_8  1          With all this stuff going down at the ...
1  2381_9  1          "The Classic War of the Worlds" by ...
2  7759_3  0          The film starts with a manager (Nicholas Bell)...
3  3630_4  0          It must be assumed that those who ...
4  9495_8  1          Superbly trashy and wondrously ...
from nltk.corpus import stopwords
eng_stopwords = set(stopwords.words('english'))

def clean_text(text, remove_stopwords=False):
    text = BeautifulSoup(text, 'html.parser').get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    words = text.lower().split()
    if remove_stopwords:
        words = [w for w in words if w not in eng_stopwords]
    return words
def to_review_vector(review):
    global word_vec

    review = clean_text(review, remove_stopwords=True)
    #print(review)
    #words = nltk.word_tokenize(review)

    word_vec = np.zeros((1, 300))
    for word in review:
        #word_vec = np.zeros((1,300))
        if word in model:
            word_vec += np.array([model[word]])
    #print(word_vec.mean(axis=0))
    return pd.Series(word_vec.mean(axis=0))
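Note that word_vec has shape (1, 300), so mean(axis=0) merely squeezes the sum to a length-300 Series: each review vector is the sum of its word vectors. A hedged variant that averages instead, which is less sensitive to review length (to_review_vector_mean is a name introduced here for illustration):

def to_review_vector_mean(review):
    words = clean_text(review, remove_stopwords=True)
    vecs = [model[w] for w in words if w in model]  # vectors for in-vocabulary words
    if not vecs:
        return pd.Series(np.zeros(300))  # no known words: fall back to a zero vector
    return pd.Series(np.mean(vecs, axis=0))  # average instead of sum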
train_data_features = df.review.apply(to_review_vector)
train_data_features.head()
[5 rows × 300 columns of review-vector components; numeric output omitted]
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(train_data_features, df.sentiment, test_size = 0.2, random_state = 0)
LR_model = LogisticRegression()
LR_model = LR_model.fit(X_train, y_train)
y_pred = LR_model.predict(X_test)
cnf_matrix = confusion_matrix(y_test,y_pred)
print("Recall metric in the testing datat: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
print("accuracy metric in the testing datat: ", (cnf_matrix[1,1]+cnf_matrix[0,0])/(cnf_matrix[0,0]+cnf_matrix[1,1]+cnf_matrix[1,0]+cnf_matrix[0,1])) # Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
, classes=class_names
, title='Confusion matrix')
plt.show()
Recall metric in the testing dataset: 0.87969004894
accuracy metric in the testing dataset: 0.865
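The word2vec features lift recall from about 0.853 to 0.880 and accuracy from 0.8454 to 0.865 over the bag-of-words baseline. As a final hedged sketch, the trained pipeline can score a brand-new review (the review text here is an arbitrary example):

new_review = "This movie was surprisingly touching and well acted."
new_features = to_review_vector(new_review).values.reshape(1, -1)  # 1 x 300
print(LR_model.predict(new_features))        # 1 = positive, 0 = negative
print(LR_model.predict_proba(new_features))  # class probabilities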