Tensorflow实例：实现基于LSTM的语言模型

更新时间:2023-07-04 18:38:40 阅读：评论：0

Tensorflow实例：实现基于LSTM的语言模型

RNN

人每次思考时不会从头开始，而是保留之前思考的一些结果为现在的决策提供支持。例如我们对话时，我们会根据上下文的信息理解一句话的含义，而不是对每一句话从头进行分析。传统的神经网络不能实现这个功能，这可能是其一大缺陷。例如卷积神经网络虽然可以对图像进行分类，但是可能无法对视频中每一帧图像发生的事情进行关联分析，我们无法利用前一帧图像的信息，而循环神经网络则可以解决这个问题。

如上图所示，x是RNN的输入，s是RNN的一个节点，而o是输出。我们对这个RNN输入数据x，然后通过网络计算并得到输出结果o，再将某些信息（state，状态）传入到网络的输入。我们将o与label进行比较可以得到误差，有了这个误差之后，就能使用梯度下降（Gradient Descent）和Back-Propagation Through Time（BPTT）方法对网络进行训练，BPTT与训练前馈神经网络的传统BP方法类似，也是使用反向传播求梯度并更新网络参数权重。另外，还有一种方法叫Real-Time Recurrent Learning(RTRL)，它可以正向求解梯度，不过其计算复杂度比较高。

RNN展开后，类似于有一系列输入x和一系列输出o的串联的普通神经网络，上一层的神经网络会传递信息给下一层。这种串联的结构天然就非常适合时间序列数据的处理和分析。需要注意的是，展开后的每一层级的神经网络，其参数都是相同的，我们并不需要训练成百上千层神经网络的参数，只需要训练一层RNN的参数。这就是它结构巧妙的地方，这里共享参数的思想和卷积网络中权值共享的方式也很类似。

LSTM

对于某些简单的问题，可能只需要最后输入的少量时序信息即可解决。但是对某些复杂问题，可能需要更早的一些信息，甚至是时间序列开头的信息，但间隔太远的输入信息，RNN是难以记忆的，因此长程依赖（Long-term Dependencies）是传统RNN的致命伤。

LSTM天生就是为了解决长程依赖而设计的，不需要特别复杂地调试超参数，默认就可以记住长期的信息。

LSTM的内部结构相比RNN更复杂，其中包含了4层神经网络，其中小圈圈是point-wise的操作，比如向量加法、点乘等，而小矩形则代表一层可学习参数的神经网络。

LSTM单元上面的那条直线代表了LSTM的状态state，它会贯穿所有串联在一起的LSTM单元，从第一个LSTM单元一直流向最后一个LSTM单元，其中只有少量的线性干预和改变。

状态state在这条隧道中传递时，LSTM单元可以对其添加或删除信息，这些对信息流的修改操作由LSTM中的Gates控制。

这些Gates中包含了一个Sigmoid层和一个向量点乘的操作，这个Sigmoid层的输出是0-1之间的值，它直接控制了信息传递的比例。

每个LSTM单元中包含了3个这样的Gates，用来维护和控制单元的状态信息。凭借对状态信息的存储和修改，LSTM单元就可以实现长程记忆。

Tensorflow实现LSTM

下面我们就使用LSTM来实现一个语言模型，给定上文的语境，即历史出现的单词，语言模型可以预测下一个单词出现的概率，使用的数据集是PTB（Penn Treebank）。

#%%

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

import time

import numpy as np

import tensorflow as tf

import reader

# flags = tf.flags
# logging = tf.logging
# flags.DEFINE_string("save_path", None,
#                     "Model output directory.")
# flags.DEFINE_bool("use_fp16", False,
#                   "Train using 16-bit floats instead of 32bit floats")
# FLAGS = flags.FLAGS
#
# def data_type():
#     return tf.float16 if FLAGS.use_fp16 else tf.float32

class PTBInput(object):
    """Input pipeline description for one PTB data split.

    Stores the batching parameters taken from ``config`` together with the
    ``(input_data, targets)`` pair returned by ``reader.ptb_producer``.
    """

    def __init__(self, config, data, name=None):
        """Create the batched input for ``data``.

        Args:
            config: object providing ``batch_size`` and ``num_steps``.
            data: sized sequence holding the raw data for this split;
                presumably a flat list of word ids (confirm against
                ``reader``).
            name: optional name forwarded to ``reader.ptb_producer``.
        """
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        # Number of (batch_size x num_steps) windows available per epoch; the
        # "- 1" leaves room for the targets, which the reader presumably
        # shifts by one position (confirm against ``reader.ptb_producer``).
        self.epoch_size = ((len(data) // batch_size) - 1) // num_steps
        self.input_data, self.targets = reader.ptb_producer(
            data, batch_size, num_steps, name=name)

class PTBModel(object):
    """LSTM language model graph for the PTB data.

    The constructor embeds the input word ids, feeds them through a stack of
    ``config.num_layers`` LSTM cells unrolled for ``num_steps`` time steps,
    projects the outputs onto the vocabulary and computes the sequence loss.
    When ``is_training`` is true it additionally creates gradient-clipped
    SGD training ops and an assignable learning rate.
    """

    def __init__(self, is_training, config, input_):
        """Build the model graph.

        Args:
            is_training: when true, dropout (if ``config.keep_prob < 1``) and
                the training ops are added to the graph.
            config: object providing ``hidden_size``, ``vocab_size``,
                ``keep_prob``, ``num_layers`` and ``max_grad_norm``.
            input_: object providing ``batch_size``, ``num_steps``,
                ``input_data`` and ``targets`` (e.g. a ``PTBInput``).
        """
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need to
        # be different than reported in the paper.
        def lstm_cell():
            return tf.contrib.rnn.BasicLSTMCell(
                size, forget_bias=0.0, state_is_tuple=True)

        attn_cell = lstm_cell
        if is_training and config.keep_prob < 1:
            # While training with dropout enabled, wrap each layer so its
            # output is dropped with probability (1 - keep_prob).
            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=config.keep_prob)

        cell = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable(
                "embedding", [vocab_size, size], dtype=tf.float32)
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Simplified version of models/tutorials/rnn/rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #
        # The alternative version of the code below is:
        #
        # inputs = tf.unstack(inputs, num=num_steps, axis=1)
        # outputs, state = tf.contrib.rnn.static_rnn(
        #     cell, inputs, initial_state=self._initial_state)
        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0:
                    # Every unrolled step shares the same cell weights.
                    tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        # Flatten the per-step outputs to rows of width ``size`` so a single
        # matmul produces the logits for every position.
        output = tf.reshape(tf.concat(outputs, 1), [-1, size])
        softmax_w = tf.get_variable(
            "softmax_w", [size, vocab_size], dtype=tf.float32)
        softmax_b = tf.get_variable(
            "softmax_b", [vocab_size], dtype=tf.float32)
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits],
            [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=tf.float32)])
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._final_state = state

        if not is_training:
            # Evaluation graphs stop here: no learning rate or train op.
            return

        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self._lr)
        self._train_op = optimizer.apply_gradients(
            zip(grads, tvars),
            global_step=tf.contrib.framework.get_or_create_global_step())

        self._new_lr = tf.placeholder(
            tf.float32, shape=[], name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

    def assign_lr(self, session, lr_value):
        """Set the learning rate variable to ``lr_value`` using ``session``."""
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    @property
    def input(self):
        """The input object this model was built from."""
        return self._input

    @property
    def initial_state(self):
        """Zero state of the stacked LSTM cell."""
        return self._initial_state

    @property
    def cost(self):
        """Summed sequence loss divided by the batch size."""
        return self._cost

    @property
    def final_state(self):
        """Cell state after the last unrolled time step."""
        return self._final_state

    @property
    def lr(self):
        """Learning rate variable; only set when built with is_training."""
        return self._lr

    @property
    def train_op(self):
        """Training op; only set when built with is_training."""
        return self._train_op

class SmallConfig(object):
    """Small config."""

    # NOTE(review): init_scale, learning_rate, max_epoch, max_max_epoch and
    # lr_decay are not read by the code visible in this file; presumably the
    # training driver consumes them -- confirm against the caller.
    init_scale = 0.1
    learning_rate = 1.0
    max_grad_norm = 5  # global-norm threshold used for gradient clipping
    num_layers = 2  # number of stacked LSTM cells
    num_steps = 20  # unrolled time steps per batch
    hidden_size = 200  # LSTM cell and embedding width
    max_epoch = 4
    max_max_epoch = 13
    keep_prob = 1.0  # values below 1 enable dropout while training
    lr_decay = 0.5
    batch_size = 20
    vocab_size = 10000

class MediumConfig(object):
    """Medium config."""

    # NOTE(review): init_scale, learning_rate, max_epoch, max_max_epoch and
    # lr_decay are not read by the code visible in this file; presumably the
    # training driver consumes them -- confirm against the caller.
    init_scale = 0.05
    learning_rate = 1.0
    max_grad_norm = 5  # global-norm threshold used for gradient clipping
    num_layers = 2  # number of stacked LSTM cells
    num_steps = 35  # unrolled time steps per batch
    hidden_size = 650  # LSTM cell and embedding width
    max_epoch = 6
    max_max_epoch = 39
    keep_prob = 0.5  # values below 1 enable dropout while training
    lr_decay = 0.8
    batch_size = 20
    vocab_size = 10000

class LargeConfig(object):
    """Large config."""

    # NOTE(review): init_scale, learning_rate, max_epoch, max_max_epoch and
    # lr_decay are not read by the code visible in this file; presumably the
    # training driver consumes them -- confirm against the caller.
    init_scale = 0.04
    learning_rate = 1.0
    max_grad_norm = 10  # global-norm threshold used for gradient clipping
    num_layers = 2  # number of stacked LSTM cells
    num_steps = 35  # unrolled time steps per batch
    hidden_size = 1500  # LSTM cell and embedding width
    max_epoch = 14
    max_max_epoch = 55
    keep_prob = 0.35  # values below 1 enable dropout while training
    lr_decay = 1 / 1.15
    batch_size = 20
    vocab_size = 10000

class TestConfig(object):
    """Tiny config, for testing."""

    # NOTE(review): init_scale, learning_rate, max_epoch, max_max_epoch and
    # lr_decay are not read by the code visible in this file; presumably the
    # training driver consumes them -- confirm against the caller.
    init_scale = 0.1
    learning_rate = 1.0
    max_grad_norm = 1  # global-norm threshold used for gradient clipping
    num_layers = 1  # number of stacked LSTM cells
    num_steps = 2  # unrolled time steps per batch
    hidden_size = 2  # LSTM cell and embedding width
    max_epoch = 1
    max_max_epoch = 1
    keep_prob = 1.0  # values below 1 enable dropout while training
    lr_decay = 0.5
    batch_size = 20
    vocab_size = 10000

def run_epoch(session, model, eval_op=None, verbose=False):
    """Run the model over one epoch of its input and return the perplexity.

    The recurrent state produced by each step is fed back in as the initial
    state of the next step, so state is carried across consecutive batches.

    Args:
        session: object whose ``run(fetches, feed_dict)`` evaluates the
            graph (a ``tf.Session`` in practice).
        model: model exposing ``initial_state``, ``cost``, ``final_state``
            and ``input`` (with ``epoch_size``, ``num_steps`` and
            ``batch_size``).
        eval_op: optional extra op fetched on every step, e.g. the model's
            training op.
        verbose: when true, print progress, running perplexity and
            words-per-second roughly every tenth of the epoch.

    Returns:
        ``exp(costs / iters)``, where ``costs`` is the sum of the fetched
        per-step costs and ``iters`` the number of time steps processed.
    """
    start_time = time.time()
    costs = 0.0
    iters = 0
    state = session.run(model.initial_state)

    fetches = {
        "cost": model.cost,
        "final_state": model.final_state,
    }
    if eval_op is not None:
        fetches["eval_op"] = eval_op

    epoch_size = model.input.epoch_size
    # Zero for epochs shorter than 10 steps; progress printing is skipped
    # in that case rather than taking a modulo by zero.
    report_every = epoch_size // 10

    for step in range(epoch_size):
        # Seed every layer's (c, h) placeholders with the previous state.
        feed_dict = {}
        for i, (c, h) in enumerate(model.initial_state):
            feed_dict[c] = state[i].c
            feed_dict[h] = state[i].h

        vals = session.run(fetches, feed_dict)
        cost = vals["cost"]
        state = vals["final_state"]

        costs += cost
        iters += model.input.num_steps

        if verbose and report_every and step % report_every == 10:
            print("%.3f perplexity: %.3f speed: %.0f wps" %
                  (step * 1.0 / epoch_size, np.exp(costs / iters),
                   iters * model.input.batch_size /
                   (time.time() - start_time)))

    return np.exp(costs / iters)

本文发布于:2023-07-04 18:38:40，感谢您对本站的认可！

本文链接：https://www.wtabcd.cn/fanwen/fan/78/1078403.html

上一篇：随机微分方程多步法

下一篇：高一英语译林版3教案：Uint3 BacktothePastperiod3含解析

标签：信息神经参数单元需要分析梯度

留言与评论（共有 0 条评论）