When processing words in text, a traditional one-hot encoding produces a long vector with a single 1 and zeros everywhere else. Pushing such sparse vectors through a network wastes a great deal of computation. Word embeddings solve this: each word is mapped to an integer index, and the weight matrix is treated as a lookup table (as shown in the figure on the left), so the sparse vector never has to be multiplied by the matrix at all.
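To see why the lookup is equivalent to the multiply, here is a minimal sketch (illustrative only; `vocab_size`, `embed_size`, and `word_idx` are made-up values, not from the original post):

```python
import numpy as np

vocab_size, embed_size = 5, 3                      # hypothetical sizes
weights = np.random.rand(vocab_size, embed_size)   # the "lookup table"

word_idx = 2                                       # integer id of some word
one_hot = np.zeros(vocab_size)
one_hot[word_idx] = 1

via_matmul = one_hot @ weights                     # dense multiply: mostly wasted work
via_lookup = weights[word_idx]                     # just pick out row word_idx

assert np.allclose(via_matmul, via_lookup)
```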
This post first shows how to use embeddings in an RNN for sentiment analysis, with the network structure shown in the figure on the right. It then introduces Word2Vec, a particular embedding model that turns words into vectors carrying semantic meaning.
Sentiment Analysis
Download link for the data; in reviews.txt all uppercase letters have already been converted to lowercase.
- Data preprocessing
```python
### Read the data
with open('sentiment/reviews.txt', 'r') as f:
    reviews = f.read()   # reviews
with open('sentiment/labels.txt', 'r') as f:
    labels = f.read()    # sentiment labels

### Remove punctuation
from string import punctuation
all_text = ''.join([c for c in reviews if c not in punctuation])

### Split into reviews and words
reviews = all_text.split('\n')
all_text = ' '.join(reviews)
words = all_text.split()

### Encode words as integers
from collections import Counter
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)                # sort words from most to least frequent
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}       # encode words starting from 1
reviews_ints = []
for each in reviews:
    reviews_ints.append([vocab_to_int[word] for word in each.split()])

### Encode labels as integers
import numpy as np
labels = labels.split('\n')
labels = np.array([1 if each == 'positive' else 0 for each in labels])

### Drop reviews of length 0
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
labels = np.array([labels[ii] for ii in non_zero_idx])

### Keep the first 200 words of each review; left-pad shorter reviews with zeros
seq_len = 200
features = np.zeros((len(reviews_ints), seq_len), dtype=int)
for i, row in enumerate(reviews_ints):
    features[i, -len(row):] = np.array(row)[:seq_len]

### Split into training, validation and test sets
split_frac = 0.8
split_idx = int(len(features)*split_frac)
train_x, val_x = features[:split_idx], features[split_idx:]
train_y, val_y = labels[:split_idx], labels[split_idx:]
test_idx = int(len(val_x)*0.5)
val_x, test_x = val_x[:test_idx], val_x[test_idx:]
val_y, test_y = val_y[:test_idx], val_y[test_idx:]
```
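As a quick sanity check (illustrative only, assuming the variables defined above), you can confirm that every review now occupies exactly `seq_len` columns and that the data is split roughly 80% / 10% / 10%:

```python
print(features.shape)                          # (number of reviews, seq_len)
print(features[0, :10])                        # leading zeros if the first review is shorter than seq_len
print(len(train_x), len(val_x), len(test_x))   # roughly an 80% / 10% / 10% split
```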
- Building the network
```python
### Hyperparameters
lstm_size = 256
lstm_layers = 1
batch_size = 500
learning_rate = 0.001
embed_size = 300   # size of the embedding vectors (number of units in the embedding layer)

### Input
import tensorflow as tf

graph = tf.Graph()   # create the graph object
with graph.as_default():
    inputs_ = tf.placeholder(tf.int32, [None, None], name='inputs')
    labels_ = tf.placeholder(tf.int32, [None, None], name='labels')
    keep_prob = tf.placeholder(tf.float32, name='keep_prob')

### Embedding layer
n_words = len(vocab_to_int) + 1   # adding 1 because we use 0 for padding; the dictionary starts at 1
with graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_words, embed_size), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs_)   # 3D tensor (batch_size, seq_len, embed_size)

### LSTM layer
with graph.as_default():
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
    cell = tf.contrib.rnn.MultiRNNCell([drop] * lstm_layers)
    initial_state = cell.zero_state(batch_size, tf.float32)   # getting an initial state

### Output layer
with graph.as_default():
    outputs, final_state = tf.nn.dynamic_rnn(cell, embed, initial_state=initial_state)
    # fully connected layer on the last time step's output, shape (batch_size, lstm_size)
    logits = tf.contrib.layers.fully_connected(outputs[:, -1], 1, activation_fn=None)
    predictions = tf.nn.sigmoid(logits)

### Loss function and optimizer
### Two options for the loss function:
###   cost = tf.losses.mean_squared_error(labels_, predictions)
###   cost = cross-entropy (used below)
with graph.as_default():
    loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(labels_, tf.float32), logits=logits)
    cost = tf.reduce_mean(loss)   # cross-entropy
    optimizer = tf.train.AdamOptimizer(learning_rate).minimize(cost)

### Validation and test accuracy
with graph.as_default():
    correct_pred = tf.equal(tf.cast(tf.round(predictions), tf.int32), labels_)
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
```
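One caveat: with `lstm_layers = 1` the `[drop] * lstm_layers` construction works, but newer TensorFlow 1.x releases refuse to stack the *same* cell object more than once. If you want to experiment with a deeper stack, a common workaround (a sketch, not part of the original code; it would replace the `MultiRNNCell` lines above) is to build a fresh cell per layer:

```python
def build_cell(lstm_size, keep_prob):
    # one independent LSTM cell (with its own weights), wrapped in dropout
    lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
    return tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)

with graph.as_default():
    cell = tf.contrib.rnn.MultiRNNCell(
        [build_cell(lstm_size, keep_prob) for _ in range(lstm_layers)])
    initial_state = cell.zero_state(batch_size, tf.float32)
```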
- Training the network
```python
def get_batches(x, y, batch_size=100):
    n_batches = len(x)//batch_size
    x, y = x[:n_batches*batch_size], y[:n_batches*batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii+batch_size], y[ii:ii+batch_size]

### Train and validate
epochs = 20
best_validation_acc = 0.0   # best validation accuracy seen so far

with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    sess.run(tf.global_variables_initializer())
    iteration = 1
    for e in range(epochs):
        state = sess.run(initial_state)
        for x, y in get_batches(train_x, train_y, batch_size):
            feed = {inputs_: x,
                    labels_: y[:, None],
                    keep_prob: 0.5,
                    initial_state: state}
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            if iteration % 5 == 0:
                print("Epoch: {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Train loss: {:.3f}".format(loss))

            if iteration % 25 == 0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for xv, yv in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: xv,
                            labels_: yv[:, None],
                            keep_prob: 1,
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                validation_acc = np.mean(val_acc)
                print("Val acc: {:.3f}".format(validation_acc))
                if validation_acc > best_validation_acc:
                    best_validation_acc = validation_acc   # update the best-known validation accuracy
                    saver.save(sess, "checkpoints/sentiment_best_validation.ckpt")

            iteration += 1
    saver.save(sess, "checkpoints/sentiment_last_iteration.ckpt")
```
- Testing the network
```python
test_acc = []
with tf.Session(graph=graph) as sess:
    # should also check sentiment_last_iteration.ckpt
    saver.restore(sess, 'checkpoints/sentiment_best_validation.ckpt')
    test_state = sess.run(cell.zero_state(batch_size, tf.float32))
    for xt, yt in get_batches(test_x, test_y, batch_size):
        feed = {inputs_: xt,
                labels_: yt[:, None],
                keep_prob: 1,
                initial_state: test_state}
        batch_acc, test_state = sess.run([accuracy, final_state], feed_dict=feed)
        test_acc.append(batch_acc)
    print("Test accuracy: {:.3f}".format(np.mean(test_acc)))
```
Word2Vec
Word2Vec is an algorithm that generates word vectors from context: words that often appear in similar contexts end up with similar vectors (as shown in the figure below). The training data used here is a cleaned-up collection of Wikipedia articles prepared by Matt Mahoney.
Word2Vec comes in two main architectures, CBOW (Continuous Bag-Of-Words) and Skip-gram, sketched in the figure below. CBOW predicts the center word from its surrounding words and uses the quality of that prediction to adjust the vectors of the surrounding words; Skip-gram predicts the surrounding words from the center word and uses those predictions to adjust the vector of the center word.
This post focuses on Skip-gram; the network structure used is shown in the figure below.
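To make the difference concrete, the following toy sketch (the sentence and window size are made up for illustration) lists the (input, target) training pairs each architecture would generate for a window of 1:

```python
sentence = "the quick brown fox jumps".split()
window = 1

# Skip-gram: one pair per (center word, context word)
skipgram_pairs = [(sentence[i], sentence[j])
                  for i in range(len(sentence))
                  for j in range(max(i - window, 0), min(i + window + 1, len(sentence)))
                  if j != i]
# e.g. ('quick', 'the'), ('quick', 'brown'), ('brown', 'quick'), ('brown', 'fox'), ...

# CBOW: one pair per (all context words, center word)
cbow_pairs = [([sentence[j]
                for j in range(max(i - window, 0), min(i + window + 1, len(sentence)))
                if j != i],
               sentence[i])
              for i in range(len(sentence))]
# e.g. (['the', 'brown'], 'quick'), (['quick', 'fox'], 'brown'), ...
```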
- Data preprocessing
```python
import numpy as np
import random
from collections import Counter

def process(text):
    text = text.lower()
    ### Replace punctuation with tokens so we can use them in our model
    text = text.replace('.', ' <PERIOD> ')
    text = text.replace(',', ' <COMMA> ')
    text = text.replace('"', ' <QUOTATION_MARK> ')
    text = text.replace(';', ' <SEMICOLON> ')
    text = text.replace('!', ' <EXCLAMATION_MARK> ')
    text = text.replace('?', ' <QUESTION_MARK> ')
    text = text.replace('(', ' <LEFT_PAREN> ')
    text = text.replace(')', ' <RIGHT_PAREN> ')
    text = text.replace('--', ' <HYPHENS> ')
    text = text.replace(':', ' <COLON> ')
    ### Remove all words with 5 or fewer occurrences
    words = text.split()
    word_counts = Counter(words)
    trimmed_words = [word for word in words if word_counts[word] > 5]
    return trimmed_words

### Read and process the data
with open('data/text8') as f:
    text = f.read()
words = process(text)

### Encode words as integers
word_counts = Counter(words)
sorted_vocab = sorted(word_counts, key=word_counts.get, reverse=True)
int_to_vocab = {ii: word for ii, word in enumerate(sorted_vocab)}
vocab_to_int = {word: ii for ii, word in int_to_vocab.items()}
int_words = [vocab_to_int[word] for word in words]

### Subsampling: randomly drop some very frequent words (e.g. "the")
### For each word in the training data, discard it with probability 1 - sqrt(threshold / word frequency)
threshold = 1e-5
word_counts = Counter(int_words)
total_count = len(int_words)
freqs = {word: count/total_count for word, count in word_counts.items()}
p_drop = {word: 1 - np.sqrt(threshold/freqs[word]) for word in word_counts}
train_words = [word for word in int_words if random.random() < (1 - p_drop[word])]
```
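As a rough numerical illustration of the subsampling rule (the frequency value below is hypothetical, not measured on text8):

```python
# A word that makes up 1% of the corpus, at threshold = 1e-5:
freq = 0.01
p_drop_example = 1 - np.sqrt(1e-5 / freq)   # ≈ 0.968, so roughly 97% of its occurrences are dropped
```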
- Getting batches
```python
### Find the words around the word we want to train on
def get_target(words, idx, window_size=5):
    ### Get a list of words in a window around an index
    R = np.random.randint(1, window_size+1)
    start = idx - R if (idx - R) > 0 else 0
    stop = idx + R
    target_words = set(words[start:idx] + words[idx+1:stop+1])
    return list(target_words)

def get_batches(words, batch_size, window_size=5):
    ### Create a generator of word batches as tuples (inputs, targets)
    ### inputs (and targets) are lists of integers
    n_batches = len(words)//batch_size
    words = words[:n_batches*batch_size]   # only full batches
    for idx in range(0, len(words), batch_size):
        x, y = [], []
        batch = words[idx:idx+batch_size]
        for ii in range(len(batch)):
            batch_x = batch[ii]
            batch_y = get_target(batch, ii, window_size)
            y.extend(batch_y)
            x.extend([batch_x]*len(batch_y))
        yield x, y
```
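A quick illustration of what the batch generator yields (the toy word list is made up, and the exact pairs vary because `get_target` samples a random window radius):

```python
toy_words = list(range(12))   # pretend these are 12 word ids
x, y = next(get_batches(toy_words, batch_size=12, window_size=2))
print(list(zip(x, y)))
# e.g. [(0, 1), (0, 2), (1, 0), (1, 2), (2, 1), (2, 3), ...]
# each center word id is repeated once per context word it is paired with
```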
- Building the network; the structure is shown in the figure below
```python
import tensorflow as tf

train_graph = tf.Graph()

### Input
with train_graph.as_default():
    inputs = tf.placeholder(tf.int32, [None], name='inputs')
    labels = tf.placeholder(tf.int32, [None, None], name='labels')

### Embedding
n_vocab = len(int_to_vocab)
n_embedding = 200   # number of embedding features
with train_graph.as_default():
    embedding = tf.Variable(tf.random_uniform((n_vocab, n_embedding), -1, 1))
    embed = tf.nn.embedding_lookup(embedding, inputs)

### Negative sampling
### The size of the vocabulary means the skip-gram network has a tremendous number of weights,
### and every training sample would update all of them slightly, which makes training very inefficient.
### With negative sampling we instead randomly select just a small number of "negative" words
### (for which we want the network to output a 0) to update the weights for,
### while still updating the weights for the "positive" word (for which we want the network to output a 1).
### The probability of selecting a word as a negative sample is related to its frequency:
### more frequent words are more likely to be selected as negative samples.
n_sampled = 100   # number of negative labels to sample
with train_graph.as_default():
    ### negative sampling is for training only
    ### note the shape of softmax_w is (n_vocab, n_embedding)
    ### if we want to calculate the full softmax loss, use:
    ###     logits = tf.matmul(embed, tf.transpose(softmax_w))
    ###     logits = tf.nn.bias_add(logits, softmax_b)
    softmax_w = tf.Variable(tf.truncated_normal((n_vocab, n_embedding), stddev=0.1))
    softmax_b = tf.Variable(tf.zeros(n_vocab))
    # calculate the loss using negative sampling
    loss = tf.nn.sampled_softmax_loss(softmax_w, softmax_b, labels, embed, n_sampled, n_vocab)
    cost = tf.reduce_mean(loss)
    optimizer = tf.train.AdamOptimizer().minimize(cost)

### Normalize each word's vector
with train_graph.as_default():
    norm = tf.sqrt(tf.reduce_sum(tf.square(embedding), 1, keep_dims=True))
    normalized_embedding = embedding / norm
```
- Training and validating the network
```python
epochs = 10
batch_size = 1000
window_size = 10

with train_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=train_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())
    for e in range(1, epochs+1):
        batches = get_batches(train_words, batch_size, window_size)
        for x, y in batches:
            feed = {inputs: x,
                    labels: np.array(y)[:, None]}   # labels should be a 2D tensor of shape (len(y), 1)
            train_loss, _ = sess.run([cost, optimizer], feed_dict=feed)
            loss += train_loss

            if iteration % 100 == 0:
                print("Epoch {}/{}".format(e, epochs),
                      "Iteration: {}".format(iteration),
                      "Avg. Training loss: {:.4f}".format(loss/100))
                loss = 0
            iteration += 1
    save_path = saver.save(sess, "checkpoints/text8.ckpt")
    embed_mat = sess.run(normalized_embedding)

### Use t-SNE to visualize the word vectors
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

viz_words = 500
tsne = TSNE()
embed_tsne = tsne.fit_transform(embed_mat[:viz_words, :])
fig, ax = plt.subplots(figsize=(14, 14))
for idx in range(viz_words):
    plt.scatter(*embed_tsne[idx, :], color='steelblue')
    plt.annotate(int_to_vocab[idx], (embed_tsne[idx, 0], embed_tsne[idx, 1]), alpha=0.7)
```
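Since `embed_mat` holds unit-length vectors, you can also check the claim that words appearing in similar contexts get similar vectors by looking up nearest neighbours with cosine similarity. A minimal sketch, assuming the variables above (the query word is arbitrary):

```python
def nearest(word, embed_mat, vocab_to_int, int_to_vocab, k=5):
    # cosine similarity reduces to a dot product because the rows are normalized
    vec = embed_mat[vocab_to_int[word]]
    sims = embed_mat @ vec
    closest = np.argsort(-sims)[1:k+1]   # skip the word itself
    return [int_to_vocab[i] for i in closest]

print(nearest('king', embed_mat, vocab_to_int, int_to_vocab))
```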