Deep Learning Algorithms -- Implementing a Multilayer RNN in Python for Sentiment Analysis of IMDb Movie Reviews (source code + detailed comments)
Sentiment analysis is concerned with analyzing the opinions expressed in a sentence or a text document. Here we implement a multilayer RNN with a many-to-one architecture and apply it to sentiment analysis.
The input and output data fall into one of three categories:
· Many-to-one: the input data is a sequence, but the output is a fixed-size vector rather than a sequence. For example, in sentiment analysis the input is text and the output is a class label (see the shape sketch after this list).
· One-to-many: the input data is in a standard format rather than a sequence, while the output is a sequence. An example is image captioning, where the input is an image and the output is an English phrase.
· Many-to-many: both the input and output arrays are sequences. This category can be subdivided further by whether the input and output are synchronized. An example of a synchronized many-to-many task is video classification, where every frame of the video is labeled. An example of a delayed many-to-many task is translating one language into another: a complete English sentence must be read and processed by the machine before it is translated into German.
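To make the many-to-one case concrete before diving into the implementation, here is a minimal shape sketch (the sizes below are hypothetical and chosen only for illustration):

import numpy as np
batch_size, seq_len = 4, 200                     # hypothetical sizes
x = np.zeros((batch_size, seq_len), dtype=int)   # "many": one token id per time step
y = np.zeros(batch_size, dtype=int)              # "one": a single label per sequence
print(x.shape, y.shape)                          # (4, 200) (4,)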
import pandas as pd
import pyprind
from string import punctuation
import re
import numpy as np
import os
from collections import Counter
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # run the TF1.x-style graph code below on TensorFlow 2
# import tensorflow as tf
df = pd.read_csv(r'xxx\movie_data.csv',  # 'xxx' is a path placeholder
                 encoding='utf-8')
print(df.head(3))
# Preprocess the text and count the occurrences of each word
counts = Counter()
pbar = pyprind.ProgBar(len(df['review']), title='Counting words occurrences')
for i, review in enumerate(df['review']):
    # join() concatenates the elements of a sequence into a new string.
    # Put spaces around punctuation so the words can be split out later.
    text = ''.join([c if c not in punctuation else ' '+c+' ' for c in review]).lower()
    df.loc[i, 'review'] = text
    pbar.update()
    # The Counter object counts how often each unique word occurs in the text.
    counts.update(text.split())
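# A quick check of the transform above on a hypothetical string (not from the
# dataset): 'Great, fun!' becomes ' great ,  fun ! ', so counts.update() sees
# the tokens ['great', ',', 'fun', '!'] with punctuation counted as words.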
## Create a mapping:
## Map each unique word to an integer
# Sort the unique words by frequency (counts.get returns how often a word
# occurs), most frequent first, then build the mapping
word_counts = sorted(counts, key=counts.get, reverse=True)
print(word_counts[:5])
word_to_int = {word: ii for ii, word in enumerate(word_counts, 1)}
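# enumerate(word_counts, 1) starts numbering at 1, so the integer 0 stays free
# for padding. Given the sorted counts above, the mapping starts
# (hypothetically) as {'the': 1, '.': 2, ',': 3, ...}.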
# Convert each review text into a list of integers
mapped_reviews = []
pbar = pyprind.ProgBar(len(df['review']), title='Map reviews to ints')
for review in df['review']:
    mapped_reviews.append([word_to_int[word] for word in review.split()])
    pbar.update()
# Create sequences of the same length to make the input data compatible with the RNN architecture
sequence_length = 200  ## sequence length (or T in our formulas)
sequences = np.zeros((len(mapped_reviews), sequence_length), dtype=int)
for i, row in enumerate(mapped_reviews):
    review_arr = np.array(row)
    sequences[i, -len(row):] = review_arr[-sequence_length:]
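# Padding sketch with hypothetical ids and sequence_length = 5:
# [7, 3] is left-padded to [0, 0, 0, 7, 3], while a review longer than 5
# words keeps only its last 5 word ids.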
# The dataset is already shuffled, so we can simply use the first half for
# training and the second half for testing (positional slicing keeps X and y
# aligned at exactly 25000 rows each)
X_train = sequences[:25000, :]
y_train = df['sentiment'].values[:25000]
X_test = sequences[25000:, :]
y_test = df['sentiment'].values[25000:]
# Mini-batches
np.random.seed(123)  # for reproducibility
## Function to generate minibatches:
def create_batch_generator(x, y=None, batch_size=64):
    # Python has three division-related operators: /, // and %.
    # "/" is true division: 3/2 = 1.5
    # "//" is floor division: 3//2 = 1
    # "%" is the modulo operator, i.e. the remainder: 4%2 = 0, 5%2 = 1
    n_batches = len(x)//batch_size
    x = x[:n_batches*batch_size]
    if y is not None:
        y = y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        if y is not None:
            yield x[ii:ii+batch_size], y[ii:ii+batch_size]
        else:
            yield x[ii:ii+batch_size]
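# Usage sketch (hypothetical sizes): with len(x) = 25000 and batch_size = 100,
# the generator yields 250 batches, each batch_x of shape (100, 200) and each
# batch_y of shape (100,); leftover rows that do not fill a batch are dropped.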
# Build the RNN model
class SentimentRNN(object):
    def __init__(self, n_words, seq_len=200,
                 lstm_size=256, num_layers=1, batch_size=64,
                 learning_rate=0.0001, embed_size=200):
        self.n_words = n_words
        self.seq_len = seq_len
        self.lstm_size = lstm_size  ## number of hidden units
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.embed_size = embed_size
        self.g = tf.Graph()
        with self.g.as_default():
            tf.set_random_seed(123)
            self.build()
            self.saver = tf.train.Saver()
            self.init_op = tf.global_variables_initializer()
    def build(self):
        ## Define the placeholders
        tf_x = tf.placeholder(tf.int32,
                              shape=(self.batch_size, self.seq_len),
                              name='tf_x')
        tf_y = tf.placeholder(tf.float32,
                              shape=(self.batch_size),
                              name='tf_y')
        tf_keepprob = tf.placeholder(tf.float32,
                                     name='tf_keepprob')
        ## Create the embedding layer
        # Add an embedding layer and build the embedded representation embed_x.
        # Create a matrix of size [n_words x embed_size] as a tensor variable
        # called embedding, and initialize its elements with random floats in
        # the interval [-1, 1].
        embedding = tf.Variable(
            tf.random_uniform(
                (self.n_words, self.embed_size),
                minval=-1, maxval=1),
            name='embedding')
        # tf.nn.embedding_lookup finds, for each element of tf_x, the
        # associated row of the embedding matrix
        embed_x = tf.nn.embedding_lookup(
            embedding, tf_x,
            name='embeded_x')
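        # Shape sketch for the lookup above: tf_x is (batch_size, seq_len),
        # i.e. (100, 200) for the model trained below; looking each id up in
        # the (n_words, embed_size) matrix yields embed_x of shape
        # (100, 200, 256) when embed_size=256.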
        ## Define the LSTM cells and stack them together
        # First define the multilayer RNN cells:
        # BasicLSTMCell creates an RNN cell,
        # DropoutWrapper applies dropout to the RNN cell, and
        # MultiRNNCell stacks the wrapped cells into a multilayer RNN.
        # Each cell here is one LSTM network.
        cells = tf.nn.rnn_cell.MultiRNNCell(
            [tf.nn.rnn_cell.DropoutWrapper(
                tf.nn.rnn_cell.BasicLSTMCell(self.lstm_size),
                output_keep_prob=tf_keepprob)
             for i in range(self.num_layers)])
        ## Define the initial state of the RNN cells:
        # With batches of 100 sentences and 128 LSTM hidden units, each
        # sentence gets 128 state values.
        self.initial_state = cells.zero_state(
            self.batch_size, tf.float32)
        # << initial state >> (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>,
        # h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)
        print(' << initial state >> ', self.initial_state)
        # Create the RNN from the cells and their initial state.
        # tf.nn.dynamic_rnn combines all the components: it ties together the
        # embedded data, the RNN cells and their initial state, and builds the
        # pipeline according to the unrolled architecture of the LSTM cells.
        # After processing a mini-batch, tf.nn.dynamic_rnn returns the final
        # state, which is then used as the initial state for the next
        # mini-batch, so the current state keeps being updated throughout
        # the whole run.
        lstm_outputs, self.final_state = tf.nn.dynamic_rnn(
            cells, embed_x,
            initial_state=self.initial_state)
        ## Note: lstm_outputs shape:
        ## [batch_size, max_time, cells.output_size]
        # Reference:
        # /p/79021e23d683?utm_campaign=maleskine&utm_content=note&utm_medium=seo_notes&utm_source=recommendation
        # << lstm_output >> Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)
        print('\n << lstm_output >> ', lstm_outputs)
        # << final state >> (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>,
        # h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)
        print('\n << final state >> ', self.final_state)
        ## Apply a FC layer on top of the RNN output:
        logits = tf.layers.dense(
            inputs=lstm_outputs[:, -1],  # output of the last time step: (100, 128)
            units=1, activation=None,
            name='logits')
        # tf.squeeze drops the size-1 dimension: (100, 1) -> (100,)
        logits = tf.squeeze(logits, name='logits_squeezed')
        # << logits >> Tensor("logits_squeezed:0", shape=(100,), dtype=float32)
        print('\n << logits >> ', logits)
        # The sigmoid function squashes the output into the range 0 to 1
        y_proba = tf.nn.sigmoid(logits, name='probabilities')
        # tf.cast(): converts x to the given dtype
        # tf.round(): rounds to the nearest integer
        predictions = {
            'probabilities': y_proba,
            'labels': tf.cast(tf.round(y_proba), tf.int32,
                              name='labels')
        }
        # << predictions >> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>,
        # 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}
        print('\n << predictions >> ', predictions)
        ## Define the cost function
        # tf.nn.sigmoid_cross_entropy_with_logits(): the more accurate the
        # prediction, the smaller its value
        # tf.reduce_mean averages over the vector
        cost = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(
                labels=tf_y, logits=logits),
            name='cost')
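        # Per element, this loss is the numerically stable form of the sigmoid
        # cross-entropy:
        #   max(logits, 0) - logits*labels + log(1 + exp(-abs(logits)))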
        ## Define the optimizer
        # tf.train.AdamOptimizer() implements the Adam optimization algorithm,
        # a powerful gradient-based method that adds a squared-gradient
        # (second-moment) correction and is well suited to non-convex
        # optimization problems in machine learning.
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        train_op = optimizer.minimize(cost, name='train_op')
    def train(self, X_train, y_train, num_epochs):
        with tf.Session(graph=self.g) as sess:
            sess.run(self.init_op)
            iteration = 1
            for epoch in range(num_epochs):
                state = sess.run(self.initial_state)
                for batch_x, batch_y in create_batch_generator(
                        X_train, y_train, self.batch_size):
                    feed = {'tf_x:0': batch_x,
                            'tf_y:0': batch_y,
                            'tf_keepprob:0': 0.5,
                            self.initial_state: state}
                    loss, _, state = sess.run(
                        ['cost:0', 'train_op',
                         self.final_state],
                        feed_dict=feed)
                    if iteration % 20 == 0:
                        print("Epoch: %d/%d Iteration: %d "
                              "| Train loss: %.5f" % (
                                  epoch + 1, num_epochs,
                                  iteration, loss))
                    iteration += 1
                if (epoch + 1) % 10 == 0:
                    self.saver.save(sess,
                                    "model/sentiment-%d.ckpt" % epoch)
    # As in the train method, the current state has to be updated continuously
    def predict(self, X_data, return_proba=False):
        preds = []
        with tf.Session(graph=self.g) as sess:
            # tf.train.latest_checkpoint() finds the most recent checkpoint
            self.saver.restore(
                sess, tf.train.latest_checkpoint('model/'))
            test_state = sess.run(self.initial_state)
            for ii, batch_x in enumerate(
                    create_batch_generator(
                        X_data, None, batch_size=self.batch_size), 1):
                feed = {'tf_x:0': batch_x,
                        'tf_keepprob:0': 1.0,
                        self.initial_state: test_state}
                if return_proba:
                    pred, test_state = sess.run(
                        ['probabilities:0', self.final_state],
                        feed_dict=feed)
                else:
                    pred, test_state = sess.run(
                        ['labels:0', self.final_state],
                        feed_dict=feed)
                preds.append(pred)
        return np.concatenate(preds)
## Train:
# Set the parameter n_words to the number of unique words + 1 (the +1 accounts
# for the 0 used to pad sequences shorter than 200)
n_words = max(list(word_to_int.values())) + 1
rnn = SentimentRNN(n_words=n_words,
                   seq_len=sequence_length,
                   embed_size=256,
                   lstm_size=128,  # lstm_size sets the number of hidden units in each RNN layer
                   num_layers=1,   # num_layers=1 uses a single-layer RNN
                   batch_size=100,
                   learning_rate=0.001)
# Train the model for 40 epochs
rnn.train(X_train, y_train, num_epochs=40)
## Test:
preds = rnn.predict(X_test)
y_true = y_test[:len(preds)]
print('Test Acc.: %.3f' % (
    np.sum(preds == y_true) / len(y_true)))
## Get probabilities:
proba = rnn.predict(X_test, return_proba=True)
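To score a brand-new review with the trained model, it has to pass through the same preprocessing pipeline. The following is a minimal sketch; my_review is a made-up input, unseen words are simply dropped, and the single padded row is tiled to fill the fixed batch size that the tf_x placeholder expects:

my_review = "This movie was surprisingly good!"  # hypothetical input
text = ''.join([c if c not in punctuation else ' '+c+' ' for c in my_review]).lower()
ids = [word_to_int[w] for w in text.split() if w in word_to_int]  # drop unseen words
padded = np.zeros((1, sequence_length), dtype=int)
padded[0, -len(ids):] = ids[-sequence_length:]
batch = np.tile(padded, (rnn.batch_size, 1))  # tf_x expects a full batch of 100
print(rnn.predict(batch, return_proba=True)[0])  # P(positive) for my_review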
This takes a while to run, because the IMDb movie review dataset is large.
Output:
                                              review  sentiment
0  In 1974, the teenager Martha Moxley (Maggie Gr...          1
1  OK... so... I really like Kris Kristofferson a...          0
2  ***SPOILER*** Do not read this, if you think a...          0
Counting words occurrences
0% [>>>>>>] 100% | ETA: 00:00:00
Total time elapsed: 00:01:35
['the', '.', ',', 'and', 'a']
Map reviews to ints
0% [>>>>>>] 100% | ETA: 00:00:00
Total time elapsed: 00:00:02
<< initial state >> (LSTMStateTuple(c=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'MultiRNNCellZeroState/DropoutWrapperZeroState/BasicLSTMCellZeroState/zeros_1:0' shape=(100, 128) dtype=float32>),)
<< lstm_output >> Tensor("rnn/transpose_1:0", shape=(100, 200, 128), dtype=float32)
<< final state >> (LSTMStateTuple(c=<tf.Tensor 'rnn/while/Exit_3:0' shape=(100, 128) dtype=float32>, h=<tf.Tensor 'rnn/while/Exit_4:0' shape=(100, 128) dtype=float32>),)
<< logits >> Tensor("logits_squeezed:0", shape=(100,), dtype=float32)
<< predictions >> {'probabilities': <tf.Tensor 'probabilities:0' shape=(100,) dtype=float32>, 'labels': <tf.Tensor 'labels:0' shape=(100,) dtype=int32>}