Deep LSTM siamese network for text similarity源码分析 - 服务器托管|北京服务器租用|机房托管租用|IDC托管租用|机房机柜带宽租用-价格及费用咨询

上文讲解了基于SimaseLSTM来计算两个句子或单词之间相关性的原理和个人的一些经验这里来分析一下源码是怎么实现的：

Github实现地址：https://github.com/dhwajraj/deep-siamese-text-similarity

具体实现的是文章SiameseRecurrent Architectures for Learning Sentence Similarity所提出的网络结构，实现代码主要分为模型定义siamese_network.py，数据处理input_helpers.py，模型训练train.py三个部分。

第一部分是输入训练样本处理函数input_helpers.py：

class InputHelper(object):
     #读取输入的文件，文件分为三行，行与行之间用t分隔开，前两行为需要计算相似度的两个句子，后两行为类别标记
     def getTsvData(self, filepath):
         print("Loading training data from "+filepath)
         x1=[]
         x2=[]
         y=[]
         # positive samples from file
         for line in open(filepath):
             l=line.strip().split("t")
             if len(l) 0.5:
                x1.append(l[0].lower())
                x2.append(l[1].lower())
             else:
                x1.append(l[1].lower())
                x2.append(l[0].lower())
   #默认原始数据为正样本
             y.append(1)#np.array([0,1]))
         # generate random negative samples
         combined = np.asarray(x1+x2)
         shuffle_indices = np.random.permutation(np.arange(len(combined)))
         combined_shuff = combined[shuffle_indices]
#对两个句子进行拼接，做为负样本
         for i in xrange(len(combined)):
             x1.append(combined[i])
             x2.append(combined_shuff[i])
             y.append(0) #np.array([1,0]))
         return np.asarray(x1),np.asarray(x2),np.asarray(y)




     def getTsvTestData(self, filepath):
#获得测试数据
         print("Loading testing/labelled data from "+filepath)
         x1=[]
         x2=[]
         y=[]
         # positive samples from file
         for line in open(filepath):
             l=line.strip().split("t")
             if len(l)

第二部分是模型的定义siamese_network.py：

class SiameseLSTM(object):
     """
     A LSTM based deep Siamese network for text similarity.
     Uses an character embedding layer, followed by a biLSTM and Energy Loss layer.
     """

#定义了一个双向的lstm结构，虽然tensorflow提供了双向rnn的函数tf.nn.bidirectional_rnn()，本人觉得这个并不好使用，因为不方便修改网络的结构

#实际训练时用起来也不方便
     def BiRNN(self, x, dropout, scope, embedding_size, sequence_length):
         n_input=embedding_size
         n_steps=sequence_length
         n_hidden=n_steps
         n_layers=3
         # Prepare data shape to match `bidirectional_rnn` function requirements
         # Current data input shape: (batch_size, n_steps, n_input) (?, seq_len, embedding_size)
         # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
         # Permuting batch_size and n_steps
         x = tf.transpose(x, [1, 0, 2])
         # Reshape to (n_steps*batch_size, n_input)
         x = tf.reshape(x, [-1, n_input])
         # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
         x = tf.split(0, n_steps, x)
         print(x)
         # Define lstm cells with tensorflow
         # Forward direction cell
#定义正向LSTM，也可以替换为GRU，本人实践发现训练速度更快
         with tf.name_scope("fw"+scope),tf.variable_scope("fw"+scope):
             print(tf.get_variable_scope().name)
             fw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
             lstm_fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell,output_keep_prob=dropout)
             lstm_fw_cell_m=tf.nn.rnn_cell.MultiRNNCell([lstm_fw_cell]*n_layers, state_is_tuple=True)
         # Backward direction cell
#定义反向LSTM
         with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope):
             print(tf.get_variable_scope().name)
             bw_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0, state_is_tuple=True)
             lstm_bw_cell = tf.nn.rnn_cell.DropoutWrapper(bw_cell,output_keep_prob=dropout)
             lstm_bw_cell_m = tf.nn.rnn_cell.MultiRNNCell([lstm_bw_cell]*n_layers, state_is_tuple=True)
         # Get lstm cell output
         #try:
#把正负向LSTM包到一起
         with tf.name_scope("bw"+scope),tf.variable_scope("bw"+scope):
             outputs, _, _ = tf.nn.bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x, dtype=tf.float32)
             #         except Exception: # Old TensorFlow version only returns outputs not states
             #             outputs = tf.nn.bidirectional_rnn(lstm_fw_cell_m, lstm_bw_cell_m, x,
             #                                             dtype=tf.float32)

#将正向输出和负向输出拼接到一起，并取出最后时刻得输出，这里这样做并不合适，因为当句子的长度比maxSenLen短时，实际不足的部分是

#补0的，相当于引入噪声，这里最好求平均，tf.reduce_sum(outputs,0)做为输出，并且为提升效果，还可以串接一个全连接层
return outputs[-1]
     
     def contrastive_loss(self, y,d,batch_size):
#论文中的cos目标函数
         tmp= y *tf.square(d)
         #tmp= tf.mul(y,tf.square(d))
         tmp2 = (1-y) *tf.square(tf.maximum((1 - d),0))
         return tf.reduce_sum(tmp +tmp2)/batch_size/2
     
     def __init__(
       self, sequence_length, vocab_size, embedding_size, hidden_units, l2_reg_lambda, batch_size):


       # Placeholders for input, output and dropout
       self.input_x1 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x1")
       self.input_x2 = tf.placeholder(tf.int32, [None, sequence_length], name="input_x2")
       self.input_y = tf.placeholder(tf.float32, [None], name="input_y")
       self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")


       # Keeping track of l2 regularization loss (optional)
       l2_loss = tf.constant(0.0, name="l2_loss")
           
       # 把两个输入转换为句子向量
       with tf.name_scope("embedding"):
           self.W = tf.Variable(
               tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
               trainable=True,name="W")
           self.embedded_chars1 = tf.nn.embedding_lookup(self.W, self.input_x1)
           #self.embedded_chars_expanded1 = tf.expand_dims(self.embedded_chars1, -1)
           self.embedded_chars2 = tf.nn.embedding_lookup(self.W, self.input_x2)
           #self.embedded_chars_expanded2 = tf.expand_dims(self.embedded_chars2, -1)
 #构建两个双向LSTM并获得输出，
       # Create a convolution + maxpool layer for each filter size
       with tf.name_scope("output"):
#两个双向LSTM处于不同的variable_scope下，所以是两套参数
         self.out1=self.BiRNN(self.embedded_chars1, self.dropout_keep_prob, "side1", embedding_size, sequence_length)
         self.out2=self.BiRNN(self.embedded_chars2, self.dropout_keep_prob, "side2", embedding_size, sequence_length)
         #计算两个堆叠的双LSTM的输出特征的余弦相似度
self.distance = tf.sqrt(tf.reduce_sum(tf.square(tf.sub(self.out1,self.out2)),1,keep_dims=True))
         self.distance = tf.div(self.distance, tf.add(tf.sqrt(tf.reduce_sum(tf.square(self.out1),1,keep_dims=True)),tf.sqrt(tf.reduce_sum(tf.square(self.out2),1,keep_dims=True))))
         self.distance = tf.reshape(self.distance, [-1], name="distance")
       with tf.name_scope("loss"):
 #与target对比，构造损失函数
           self.loss = self.contrastive_loss(self.input_y,self.distance, batch_size) 
       with tf.name_scope("accuracy"):
 #这部分实际没用，当采用softmax做为损失函数时可以使用
           correct_predictions = tf.equal(self.distance, self.input_y)
           self.accuracy=tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")

最后一部分，模型训练模型训练train.py：

#! /usr/bin/env python


 import tensorflow as tf
 import numpy as np
 import re
 import os
 import time
 import datetime
 import gc
 from input_helpers import InputHelper
 from siamese_network import SiameseLSTM
 from tensorflow.contrib import learn
 import gzip
 from random import random
 # Parameters
 # ==================================================
 #超参数的定义
 tf.flags.DEFINE_integer("embedding_dim", 100, "Dimensionality of character embedding (default: 300)")
 tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability (default: 0.5)")
 tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularizaion lambda (default: 0.0)")
 tf.flags.DEFINE_string("training_files", "person_match.train2", "training file (default: None)")
 tf.flags.DEFINE_integer("hidden_units", 50, "Number of hidden units in softmax regression layer (default:50)")


 # Training parameters
 tf.flags.DEFINE_integer("batch_size", 64, "Batch Size (default: 64)")
 tf.flags.DEFINE_integer("num_epochs", 300, "Number of training epochs (default: 200)")
 tf.flags.DEFINE_integer("evaluate_every", 1000, "Evaluate model on dev set after this many steps (default: 100)")
 tf.flags.DEFINE_integer("checkpoint_every", 1000, "Save model after this many steps (default: 100)")
 # Misc Parameters
 tf.flags.DEFINE_boolean("allow_soft_placement", True, "Allow device soft device placement")
 tf.flags.DEFINE_boolean("log_device_placement", False, "Log placement of ops on devices")


 #打印输出各个超参数的值
 FLAGS = tf.flags.FLAGS
 FLAGS._parse_flags()
 print("nParameters:")
 for attr, value in sorted(FLAGS.__flags.items()):
     print("{}={}".format(attr.upper(), value))
 print("")


 if FLAGS.training_files==None:
     print "Input Files List is empty. use --training_files argument."
     exit()
  
 #设置训练样本最大长度，> 30个词的句子被截断，小于30则补0
 max_document_length=30
 inpH = InputHelper()
 #读取训练样本，词表
 train_set, dev_set, vocab_processor,sum_no_of_batches = inpH.getDataSets(FLAGS.training_files,max_document_length, 10, FLAGS.batch_size)


 # Training
 # ==================================================
 print("starting graph def")
 #配置并启动图
 with tf.Graph().as_default():
     session_conf = tf.ConfigProto(
       allow_soft_placement=FLAGS.allow_soft_placement,
       log_device_placement=FLAGS.log_device_placement)
     sess = tf.Session(config=session_conf)
     print("started session")
     with sess.as_default():
#初始化simasemodel
         siameseModel = SiameseLSTM(
             sequence_length=max_document_length,
             vocab_size=len(vocab_processor.vocabulary_),
             embedding_size=FLAGS.embedding_dim,
             hidden_units=FLAGS.hidden_units,
             l2_reg_lambda=FLAGS.l2_reg_lambda,
             batch_size=FLAGS.batch_size)


         # Define Training procedure
#创建全局step并创建adam优化器
         global_step = tf.Variable(0, name="global_step", trainable=False)
         optimizer = tf.train.AdamOptimizer(1e-3)
         print("initialized siameseModel object")
     #计算梯度并得到对应的variable
     grads_and_vars=optimizer.compute_gradients(siameseModel.loss)
#传入梯度和变量，初始化一个training op
     tr_op_set = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
     print("defined training_ops")
     # Keep track of gradient values and sparsity (optional)
#获取梯度相关信息给tensorbord
     grad_summaries = []
     for g, v in grads_and_vars:
         if g is not None:
             grad_hist_summary = tf.histogram_summary("{}/grad/hist".format(v.name), g)
             sparsity_summary = tf.scalar_summary("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
             grad_summaries.append(grad_hist_summary)
             grad_summaries.append(sparsity_summary)
     grad_summaries_merged = tf.merge_summary(grad_summaries)
     print("defined gradient summaries")
     # Output directory for models and summaries
#定义输出路径
     timestamp = str(int(time.time()))
     out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
     print("Writing to {}n".format(out_dir))


     # Checkpoint directory. Tensorflow assumes this directory already exists so we need to create it
     checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
     checkpoint_prefix = os.path.join(checkpoint_dir, "model")
     if not os.path.exists(checkpoint_dir):
         os.makedirs(checkpoint_dir)
     saver = tf.train.Saver(tf.all_variables(), max_to_keep=100)


     # Write vocabulary
     vocab_processor.save(os.path.join(checkpoint_dir, "vocab"))


     # Initialize all variables
     sess.run(tf.initialize_all_variables())
     
     print("init all variables")
     graph_def = tf.get_default_graph().as_graph_def()
     graphpb_txt = str(graph_def)
     with open(os.path.join(checkpoint_dir, "graphpb.txt"), 'w') as f:
         f.write(graphpb_txt)




     def train_step(x1_batch, x2_batch, y_batch):
         """
         A single training step
         """
#随机打乱两个输入句子之间的位置关系
         if random()>0.5:
             feed_dict = {
                              siameseModel.input_x1: x1_batch,
                              siameseModel.input_x2: x2_batch,
                              siameseModel.input_y: y_batch,
                              siameseModel.dropout_keep_prob: FLAGS.dropout_keep_prob,
             }
         else:
             feed_dict = {
                              siameseModel.input_x1: x2_batch,
                              siameseModel.input_x2: x1_batch,
                              siameseModel.input_y: y_batch,
                              siameseModel.dropout_keep_prob: FLAGS.dropout_keep_prob,
             }
         _, step, loss, accuracy, dist = sess.run([tr_op_set, global_step, siameseModel.loss, siameseModel.accuracy, siameseModel.distance],  feed_dict)
         time_str = datetime.datetime.now().isoformat()
#获取最后的输出值 >= 0.5为0，小于0.5为1
         d = np.copy(dist)
         d[d>=0.5]=999.0
         d[d1.0]=0
         accuracy = np.mean(y_batch==d)
         print("TRAIN {}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
         print y_batch, dist, d


     def dev_step(x1_batch, x2_batch, y_batch):
         """
         A single training step
#同训练
         """ 
         if random()>0.5:
             feed_dict = {
                              siameseModel.input_x1: x1_batch,
                              siameseModel.input_x2: x2_batch,
                              siameseModel.input_y: y_batch,
                              siameseModel.dropout_keep_prob: FLAGS.dropout_keep_prob,
             }
         else:
             feed_dict = {
                              siameseModel.input_x1: x2_batch,
                              siameseModel.input_x2: x1_batch,
                              siameseModel.input_y: y_batch,
                              siameseModel.dropout_keep_prob: FLAGS.dropout_keep_prob,
             }
         step, loss, accuracy, dist = sess.run([global_step, siameseModel.loss, siameseModel.accuracy, siameseModel.distance],  feed_dict)
         time_str = datetime.datetime.now().isoformat()
         d = np.copy(dist)
         d[d>=0.5]=999.0
         d[d1.0]=0
         accuracy = np.mean(y_batch==d)
         print("DEV {}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
         print y_batch, dist, d
         return accuracy


     # 产生训练batch，进入主要循环
     batches=inpH.batch_iter(
                 list(zip(train_set[0], train_set[1], train_set[2])), FLAGS.batch_size, FLAGS.num_epochs)


     ptr=0
     max_validation_acc=0.0
     for nn in xrange(sum_no_of_batches*FLAGS.num_epochs):
         batch = batches.next()
         if len(batch)= max_validation_acc:
                 max_validation_acc = sum_acc
                 saver.save(sess, checkpoint_prefix, global_step=current_step)
#保存模型
                 tf.train.write_graph(sess.graph.as_graph_def(), checkpoint_prefix, "graph"+str(nn)+".pb", as_text=False)
                 print("Saved model {} with sum_accuracy={} checkpoint to {}n".format(nn, max_validation_acc, checkpoint_prefix))

服务器托管，北京服务器托管，服务器租用 http://www.fwqtg.net
机房租用，北京机房租用，IDC机房托管， http://www.fwqtg.net

相关推荐: Jan 2023-Prioritizing Samples in Reinforcement Learning with Reducible Loss

1 Introduction 本文建议根据样本的可学习性进行抽样，而不是从经验回放中随机抽样。如果有可能减少代理对该样本的损失，则认为该样本是可学习的。我们将可以减少样本损失的数量称为其可减少损失(ReLo)。这与Schaul等人[2016]的vanill…

服务器托管，北京服务器托管，服务器租用，机房机柜带宽租用