Sentiment analysis code
This appendix provides the code for the sentiment analysis scenario that is presented in Chapter 5, “Working with data and creating models in IBM PowerAI” on page 121, and describes how the files are structured.
This appendix contains the following topic:
Sentiment analysis with TensorFlow
This section describes the sentiment analysis code that uses TensorFlow.
How the code is organized
The sentiment analysis code is organized into three Python (.py) files:
data_prep.py: This file prepares the data. You can run it independently to generate the output data set and dictionaries that are used in the later steps.
sentiment_neural_net.py: This file contains the functions that are responsible for model creation and training. Because of the way the code is prepared, you can run the training from the use_neural_net.py file.
use_neural_net.py: From this file, you can run the training and use the neural network by providing sentences to the get_sentiment function. If you do not want to run the training function, comment out the line that calls it.
The code is also available on GitHub.
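If you prefer to drive the whole workflow from one place, you can first run data_prep.py to create word_dict.pickle, data_details.pkl, and the hot-vector files, and then train from a small driver script. The following sketch shows one way to do it (the driver.py file name is illustrative and is not part of the original code, and it assumes that data_prep.py was already run):

# driver.py: minimal sketch; assumes data_prep.py has already been run
import tensorflow as tf
from sentiment_neural_net import training

x = tf.placeholder('float')   # input placeholder that is passed to the training function
training(x)                   # builds the graph, trains the model, and saves model.ckpt

After model.ckpt is saved, running use_neural_net.py with the training call left commented out restores the checkpoint and classifies the sample sentences through the get_sentiment function.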
Sentiment analysis code
This section presents the sentiment analysis code.
Example A-1 shows the data_prep.py preparation file.
Example A-1 The data_prep.py file
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
import random
import pandas as pd
import pickle
import os
from collections import Counter
 
lemm = WordNetLemmatizer()
 
 
def rand_list(lines, max_value):
    randlist = []
    for _ in range(lines):
        num = random.randint(0, max_value - 1)
        while num in randlist:
            num = random.randint(0, max_value - 1)
        randlist.append(num)

    return randlist
 
 
def shuffler(input_ds, output_ds):
    df_source = pd.read_csv(input_ds, '<SP>', error_bad_lines=False)
    df_shuffled = df_source.iloc[np.random.permutation(len(df_source))]
    df_shuffled.to_csv(output_ds, 'µ', index=False)
 
def smaller_dataset_gen(ds, newds, dsrows, num_lines=1000):
    count = 0
    with open(ds, 'r', 5000, 'latin-1') as raw_ds:
        with open(newds, 'w', 5000) as target_ds:
            selected_lines = rand_list(num_lines, dsrows)
            for line in raw_ds:
                if len(selected_lines) == 0:
                    break

                if count in selected_lines:
                    target_ds.write(line)
                    selected_lines.remove(count)
                count += 1

    print("New dataset created with {} lines".format(num_lines))
 
 
def clean_dataset(ds, ods):
    with open(ds, 'r', 30000, 'latin-1') as raw_ds:
        with open('tempds.csv', 'w', 20000) as cleaned_ds:
            for line in raw_ds:
                result = re.search('^"(\d)",.*,"(.*)"$', line)
                new_line = result.group(1) + '<SP>' + result.group(2) + '\n'
                cleaned_ds.write(new_line)

    shuffler('tempds.csv', ods)
    os.remove('tempds.csv')
    print("Dataset cleanup done")
 
 
def create_word_dict(source_ds):
    word_dict = []
    with open(source_ds, 'r', 30000, 'latin-1') as ds:
        for line in ds:
            text = line.split('µ')[1]
            words = word_tokenize(text.lower())
            lemm_words = [lemm.lemmatize(w) for w in words]
            word_dict += list(lemm_words)

    word_count = Counter(word_dict)

    cleaned_word_dict = [word for word in word_count if 1000 > word_count[word] > 60]
    dict_size = len(cleaned_word_dict)

    print("Word dictionary size: {}".format(dict_size))
    with open('word_dict.pickle', 'wb') as wd:
        pickle.dump(cleaned_word_dict, wd)

    print("Word dictionary generated and saved")
    return dict_size
 
 
def sentence_to_vector(word_dict_file, cleaned_ds, output_file):

    with open(cleaned_ds, 'r', 30000, 'latin-1') as ds:
        with open(word_dict_file, 'rb') as wd:
            word_dict = pickle.load(wd)
        num_lines = 0
        with open(output_file, 'wb') as hv:

            for line in ds:
                # print(line)
                hot_vector = np.zeros(len(word_dict))
                if line.count('µ') == 1:
                    sentiment, text = line.split('µ')
                    words = word_tokenize(text.lower())
                    lemm_words = [lemm.lemmatize(w) for w in words]
                    for word in lemm_words:
                        if word in word_dict:
                            hot_vector[word_dict.index(word)] += 1
                    hot_vector = list(hot_vector)

                    clean_sentiment = re.search('.*(\d).*', sentiment)

                    if int(clean_sentiment.group(1)) == 0:
                        sentiment = [1, 0]
                    else:
                        sentiment = [0, 1]

                    # print(hot_vector, sentiment)
                    num_lines += 1

                    pickle.dump([hot_vector, sentiment], hv)

    print('Hot vectors file generated with {} lines'.format(num_lines))
    return num_lines
 
 
 
clean_dataset('trainingandtestdata/testdata.manual.2009.06.14.csv', 'test.csv')

with open('data_details.pkl', 'wb') as details:
    dict_size = create_word_dict('small_train.csv')
    train_size = sentence_to_vector('word_dict.pickle', 'small_train.csv',
                                    'train_hot_vectors.pickle')
    test_size = sentence_to_vector('word_dict.pickle', 'test.csv',
                                   'test_hot_vectors.pickle')
    details_sizes = {'dict': dict_size, 'train': train_size, 'test': test_size}
    pickle.dump(details_sizes, details)
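Each call to sentence_to_vector appends one pickled [hot_vector, sentiment] pair per valid input line to the output file. If you want to spot-check the generated training data before moving on to the model, a short snippet along the following lines reads the first records back (this check is not part of the original code):

import pickle

with open('train_hot_vectors.pickle', 'rb') as hv:
    for _ in range(3):                 # inspect the first three records
        hot_vector, label = pickle.load(hv)
        print(sum(hot_vector), label)  # count of known words and the one-hot sentiment label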
Model and training
Example A-2 shows the sentiment_neural_net.py model and training file.
Example A-2 The sentiment_neural_net.py file
import tensorflow as tf
import pickle
 
x = tf.placeholder('float')
y = tf.placeholder('float')
 
batch_size = 1000
num_epochs = 1
 
 
def load_details():
    with open('data_details.pkl', 'rb') as details:
        det = pickle.load(details)
    return det
 
 
line_sizes = load_details()
 
 
# Creates the neural network model
def ff_neural_net(input_data):
    neurons_hl1 = 1500
    neurons_hl2 = 1500
    neurons_hl3 = 1500

    output_neurons = 2

    l1_weight = tf.Variable(tf.random_normal([line_sizes['dict'], neurons_hl1]),
                            name='w1')
    l1_bias = tf.Variable(tf.random_normal([neurons_hl1]), name='b1')

    l2_weight = tf.Variable(tf.random_normal([neurons_hl1, neurons_hl2]),
                            name='w2')
    l2_bias = tf.Variable(tf.random_normal([neurons_hl2]), name='b2')

    l3_weight = tf.Variable(tf.random_normal([neurons_hl2, neurons_hl3]),
                            name='w3')
    l3_bias = tf.Variable(tf.random_normal([neurons_hl3]), name='b3')

    output_weight = tf.Variable(tf.random_normal([neurons_hl3, output_neurons]),
                                name='wo')
    output_bias = tf.Variable(tf.random_normal([output_neurons]), name='bo')

    l1 = tf.add(tf.matmul(input_data, l1_weight), l1_bias)
    l1 = tf.nn.relu(l1)

    l2 = tf.add(tf.matmul(l1, l2_weight), l2_bias)
    l2 = tf.nn.relu(l2)

    l3 = tf.add(tf.matmul(l2, l3_weight), l3_bias)
    l3 = tf.nn.relu(l3)

    output = tf.matmul(l3, output_weight) + output_bias

    return output
 
 
def training(in_placeholder):
    nn_output = ff_neural_net(in_placeholder)
    saver = tf.train.Saver()
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=nn_output, labels=y))

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

    # Runs the training session over the previously defined graph
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(num_epochs):
            epoch_loss = 0
            buffer_train = []
            buffer_label = []
            with open('train_hot_vectors.pickle', 'rb') as train_hot_vec:
                for i in range(line_sizes['train']):
                    hot_vector_line = pickle.load(train_hot_vec)
                    buffer_train.append(hot_vector_line[0])
                    buffer_label.append(hot_vector_line[1])

                    if len(buffer_train) >= batch_size:
                        _, cost_iter = sess.run([optimizer, cost],
                                                feed_dict={in_placeholder: buffer_train,
                                                           y: buffer_label})
                        epoch_loss += cost_iter
                        buffer_train = []
                        buffer_label = []

            print('Epoch {} completed. Total loss: {}'.format(
                epoch + 1, epoch_loss))

        correct = tf.equal(tf.argmax(nn_output, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

        with open('test_hot_vectors.pickle', 'rb') as train_hot_vec:
            buffer_test = []
            buffer_test_label = []
            for i in range(line_sizes['test']):
                test_hot_vector_line = pickle.load(train_hot_vec)
                buffer_test.append(test_hot_vector_line[0])
                buffer_test_label.append(test_hot_vector_line[1])

            print('Accuracy using test dataset: {}'
                  .format(accuracy.eval({in_placeholder: buffer_test,
                                         y: buffer_test_label})))
        saver.save(sess, "model.ckpt")
Using the model
Example A-3 shows the use_neural_net.py file, which loads the saved model and classifies new sentences.
Example A-3 The use_neural_net.py file
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentiment_neural_net import ff_neural_net
from sentiment_neural_net import training
lemm = WordNetLemmatizer()
 
x = tf.placeholder('float')
 
 
def get_sentiment(input_data):
    tf.reset_default_graph()
    pl = tf.placeholder('float')
    nn_output = ff_neural_net(pl)
    saver = tf.train.Saver()
    with open('word_dict.pickle', 'rb') as f:
        word_dict = pickle.load(f)

    with tf.Session() as sess:
        # sess.run(tf.global_variables_initializer())
        # saver = tf.train.Saver()
        saver.restore(sess, "model.ckpt")
        words = word_tokenize(input_data.lower())
        lemm_words = [lemm.lemmatize(w) for w in words]
        hot_vector = np.zeros(len(word_dict))

        for word in lemm_words:
            if word.lower() in word_dict:
                index_value = word_dict.index(word.lower())
                hot_vector[index_value] += 1

        hot_vector = np.array(list(hot_vector))

        result = (sess.run(tf.argmax(nn_output.eval(
            feed_dict={pl: [hot_vector]}), 1)))
        if result[0] == 0:
            print('Negative:', input_data)
        elif result[0] == 1:
            print('Positive:', input_data)
 
 
# Uncomment the row below to train the model
# training(x)
 
 
get_sentiment('Lebron is a beast... nobody in the NBA comes even close')
get_sentiment("This was the best store i've ever seen.")
get_sentiment("Why do you hate the world")
get_sentiment("we always need to do good things to help each other")
 