Sentiment analysis code
This appendix provides the code for the sentiment analysis scenario that is presented in Chapter 5, “Working with data and creating models in IBM PowerAI” on page 121, and describes how the files are structured.
This appendix contains the following topic:
Sentiment analysis with TensorFlow
This section describes the sentiment analysis code that uses TensorFlow.
How the code is organized
The sentiment analysis code is organized into three Python (.py) files:
data_prep.py: This file prepares the data. You can run it independently to generate the output data set and dictionaries that are used in the later steps.
sentiment_neural_net.py: This file contains the functions that are responsible for model creation and training. Because of the way the code is prepared, you can run the training from the use_neural_net.py file.
use_neural_net.py: From this file, you can run the training and use the neural network by providing sentences to the get_sentiment function. If you do not want to run the training function, comment out the line that calls it.
The code is also available on GitHub.
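If you prefer to drive the whole workflow from one place, you can first run data_prep.py to create word_dict.pickle, data_details.pkl, and the hot-vector files, and then train from a small driver script. The following sketch shows one way to do it (the driver.py file name is illustrative and is not part of the original code, and it assumes that data_prep.py was already run):

# driver.py: minimal sketch; assumes data_prep.py has already been run
import tensorflow as tf
from sentiment_neural_net import training

x = tf.placeholder('float')   # input placeholder that is passed to the training function
training(x)                   # builds the graph, trains the model, and saves model.ckpt

After model.ckpt is saved, running use_neural_net.py with the training call left commented out restores the checkpoint and classifies the sample sentences through the get_sentiment function.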
Sentiment analysis code
This section presents the sentiment analysis code.
Example A-1 shows the data_prep.py preparation file.
Example A-1 The data_prep.py file
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import re
import random
import pandas as pd
import pickle
import os
from collections import Counter
 
lemm = WordNetLemmatizer()
 
 
def rand_list(lines, max_value):
    randlist = []
    for _ in range(lines):
        num = random.randint(0, max_value - 1)
        while num in randlist:
            num = random.randint(0, max_value - 1)
        randlist.append(num)

    return randlist
 
 
def shuffler(input_ds, output_ds):
    df_source = pd.read_csv(input_ds, '<SP>', error_bad_lines=False)
    df_shuffled = df_source.iloc[np.random.permutation(len(df_source))]
    df_shuffled.to_csv(output_ds, 'µ', index=False)
 
def smaller_dataset_gen(ds, newds, dsrows, num_lines=1000):
    count = 0
    with open(ds, 'r', 5000, 'latin-1') as raw_ds:
        with open(newds, 'w', 5000) as target_ds:
            selected_lines = rand_list(num_lines, dsrows)
            for line in raw_ds:
                if len(selected_lines) == 0:
                    break

                if count in selected_lines:
                    target_ds.write(line)
                    selected_lines.remove(count)
                count += 1

    print("New dataset created with {} lines".format(num_lines))
 
 
def clean_dataset(ds, ods):
    with open(ds, 'r', 30000, 'latin-1') as raw_ds:
        with open('tempds.csv', 'w', 20000) as cleaned_ds:
            for line in raw_ds:
                result = re.search('^"(\d)",.*,"(.*)"$', line)
                new_line = result.group(1) + '<SP>' + result.group(2) + '\n'
                cleaned_ds.write(new_line)

    shuffler('tempds.csv', ods)
    os.remove('tempds.csv')
    print("Dataset cleanup done")
 
 
def create_word_dict(source_ds):
    word_dict = []
    with open(source_ds, 'r', 30000, 'latin-1') as ds:
        for line in ds:
            text = line.split('µ')[1]
            words = word_tokenize(text.lower())
            lemm_words = [lemm.lemmatize(w) for w in words]
            word_dict += list(lemm_words)

    word_count = Counter(word_dict)

    cleaned_word_dict = [word for word in word_count if 1000 > word_count[word] > 60]
    dict_size = len(cleaned_word_dict)

    print("Word dictionary size: {}".format(dict_size))
    with open('word_dict.pickle', 'wb') as wd:
        pickle.dump(cleaned_word_dict, wd)

    print("Word dictionary generated and saved")
    return dict_size
 
 
def sentence_to_vector(word_dict_file, cleaned_ds, output_file):

    with open(cleaned_ds, 'r', 30000, 'latin-1') as ds:
        with open(word_dict_file, 'rb') as wd:
            word_dict = pickle.load(wd)
        num_lines = 0
        with open(output_file, 'wb') as hv:

            for line in ds:
                # print(line)
                hot_vector = np.zeros(len(word_dict))
                if line.count('µ') == 1:
                    sentiment, text = line.split('µ')
                    words = word_tokenize(text.lower())
                    lemm_words = [lemm.lemmatize(w) for w in words]
                    for word in lemm_words:
                        if word in word_dict:
                            hot_vector[word_dict.index(word)] += 1
                    hot_vector = list(hot_vector)

                    clean_sentiment = re.search('.*(\d).*', sentiment)

                    if int(clean_sentiment.group(1)) == 0:
                        sentiment = [1, 0]
                    else:
                        sentiment = [0, 1]

                    # print(hot_vector, sentiment)
                    num_lines += 1

                    pickle.dump([hot_vector, sentiment], hv)

    print('Hot vectors file generated with {} lines'.format(num_lines))
    return num_lines
 
 
 
clean_dataset('trainingandtestdata/testdata.manual.2009.06.14.csv', 'test.csv')

with open('data_details.pkl', 'wb') as details:
    dict_size = create_word_dict('small_train.csv')
    train_size = sentence_to_vector('word_dict.pickle', 'small_train.csv',
                                    'train_hot_vectors.pickle')
    test_size = sentence_to_vector('word_dict.pickle', 'test.csv',
                                   'test_hot_vectors.pickle')
    details_sizes = {'dict': dict_size, 'train': train_size, 'test': test_size}
    pickle.dump(details_sizes, details)
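Each call to sentence_to_vector appends one pickled [hot_vector, sentiment] pair per valid input line to the output file. If you want to spot-check the generated training data before moving on to the model, a short snippet along the following lines reads the first records back (this check is not part of the original code):

import pickle

with open('train_hot_vectors.pickle', 'rb') as hv:
    for _ in range(3):                 # inspect the first three records
        hot_vector, label = pickle.load(hv)
        print(sum(hot_vector), label)  # count of known words and the one-hot sentiment label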
Model and training
Example A-2 shows the sentiment_neural_net.py model and training file.
Example A-2 The sentiment_neural_net.py file
import tensorflow as tf
import pickle
 
x = tf.placeholder('float')
y = tf.placeholder('float')
 
batch_size = 1000
num_epochs = 1
 
 
def load_details():
    with open('data_details.pkl', 'rb') as details:
        det = pickle.load(details)
    return det
 
 
line_sizes = load_details()
 
 
# Creates the neural network model
def ff_neural_net(input_data):
    neurons_hl1 = 1500
    neurons_hl2 = 1500
    neurons_hl3 = 1500

    output_neurons = 2

    l1_weight = tf.Variable(tf.random_normal([line_sizes['dict'], neurons_hl1]),
                            name='w1')
    l1_bias = tf.Variable(tf.random_normal([neurons_hl1]), name='b1')

    l2_weight = tf.Variable(tf.random_normal([neurons_hl1, neurons_hl2]),
                            name='w2')
    l2_bias = tf.Variable(tf.random_normal([neurons_hl2]), name='b2')

    l3_weight = tf.Variable(tf.random_normal([neurons_hl2, neurons_hl3]),
                            name='w3')
    l3_bias = tf.Variable(tf.random_normal([neurons_hl3]), name='b3')

    output_weight = tf.Variable(tf.random_normal([neurons_hl3, output_neurons]),
                                name='wo')
    output_bias = tf.Variable(tf.random_normal([output_neurons]), name='bo')

    l1 = tf.add(tf.matmul(input_data, l1_weight), l1_bias)
    l1 = tf.nn.relu(l1)

    l2 = tf.add(tf.matmul(l1, l2_weight), l2_bias)
    l2 = tf.nn.relu(l2)

    l3 = tf.add(tf.matmul(l2, l3_weight), l3_bias)
    l3 = tf.nn.relu(l3)

    output = tf.matmul(l3, output_weight) + output_bias

    return output
 
 
def training(in_placeholder):
    nn_output = ff_neural_net(in_placeholder)
    saver = tf.train.Saver()
    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
        logits=nn_output, labels=y))

    optimizer = tf.train.AdamOptimizer(learning_rate=0.001).minimize(cost)

    # Runs the training session over the previously defined graph
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(num_epochs):
            epoch_loss = 0
            buffer_train = []
            buffer_label = []
            with open('train_hot_vectors.pickle', 'rb') as train_hot_vec:
                for i in range(line_sizes['train']):
                    hot_vector_line = pickle.load(train_hot_vec)
                    buffer_train.append(hot_vector_line[0])
                    buffer_label.append(hot_vector_line[1])

                    if len(buffer_train) >= batch_size:
                        _, cost_iter = sess.run([optimizer, cost],
                                                feed_dict={in_placeholder: buffer_train,
                                                           y: buffer_label})
                        epoch_loss += cost_iter
                        buffer_train = []
                        buffer_label = []

            print('Epoch {} completed. Total loss: {}'.format(
                epoch + 1, epoch_loss))

        correct = tf.equal(tf.argmax(nn_output, 1), tf.argmax(y, 1))
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))

        with open('test_hot_vectors.pickle', 'rb') as train_hot_vec:
            buffer_test = []
            buffer_test_label = []
            for i in range(line_sizes['test']):
                test_hot_vector_line = pickle.load(train_hot_vec)
                buffer_test.append(test_hot_vector_line[0])
                buffer_test_label.append(test_hot_vector_line[1])

            print('Accuracy using test dataset: {}'
                  .format(accuracy.eval({in_placeholder: buffer_test,
                                         y: buffer_test_label})))
        saver.save(sess, "model.ckpt")
Using the model
Example A-3 shows the use_neural_net.py file, which loads the saved model and classifies new sentences.
Example A-3 The use_neural_net.py file
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sentiment_neural_net import ff_neural_net
from sentiment_neural_net import training
lemm = WordNetLemmatizer()
 
x = tf.placeholder('float')
 
 
def get_sentiment(input_data):
    tf.reset_default_graph()
    pl = tf.placeholder('float')
    nn_output = ff_neural_net(pl)
    saver = tf.train.Saver()
    with open('word_dict.pickle', 'rb') as f:
        word_dict = pickle.load(f)

    with tf.Session() as sess:
        # sess.run(tf.global_variables_initializer())
        # saver = tf.train.Saver()
        saver.restore(sess, "model.ckpt")
        words = word_tokenize(input_data.lower())
        lemm_words = [lemm.lemmatize(w) for w in words]
        hot_vector = np.zeros(len(word_dict))

        for word in lemm_words:
            if word.lower() in word_dict:
                index_value = word_dict.index(word.lower())
                hot_vector[index_value] += 1

        hot_vector = np.array(list(hot_vector))

        result = (sess.run(tf.argmax(nn_output.eval(
            feed_dict={pl: [hot_vector]}), 1)))
        if result[0] == 0:
            print('Negative:', input_data)
        elif result[0] == 1:
            print('Positive:', input_data)
 
 
# Uncomment the row below to train the model
# training(x)
 
 
get_sentiment('Lebron is a beast... nobody in the NBA comes even close')
get_sentiment("This was the best store i've ever seen.")
get_sentiment("Why do you hate the world")
get_sentiment("we always need to do good things to help each other")
 