# Building a sentiment analysis model using NLTK
## Instructions

Build a sentiment analysis model using NLTK: collect pre-labeled tweets, clean and tokenize them, then train and validate a Naive Bayes classifier.
## Code

### Collect the training data
- Use NLTK’s Twitter samples for pre-labeled training data:
```python
from nltk.corpus import twitter_samples

# set nltk twitter samples as lists of strings
pos_sample_tweets = twitter_samples.strings('positive_tweets.json')
neg_sample_tweets = twitter_samples.strings('negative_tweets.json')
```
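If the corpora and tagger used on this page have not been downloaded yet, a one-time setup along the following lines is assumed (the resource names are NLTK's standard download identifiers; depending on the NLTK version, extra resources such as `omw-1.4` may also be needed):

```python
import nltk

# one-time downloads for the resources used on this page
nltk.download('twitter_samples')             # pre-labeled positive/negative tweets
nltk.download('stopwords')                   # English stop word list
nltk.download('wordnet')                     # dictionary used by the lemmatizer
nltk.download('averaged_perceptron_tagger')  # POS tagger behind pos_tag
```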
- Combine any Twitter samples collected and labeled separately:
```python
import csv

def import_csv(csv_filename):
    data = []

    # add a '.csv' extension if not included in csv_filename
    if not csv_filename.endswith('.csv'):
        csv_filename = csv_filename + '.csv'

    # read the csv data, collecting every cell as a string
    with open(csv_filename, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        for row in csv_reader:
            data.extend(row)

    return data

# get custom tweets as lists of strings
pos_custom_tweets = import_csv('positive_tweets.csv')
neg_custom_tweets = import_csv('negative_tweets.csv')

# combine nltk twitter samples and custom tweets
positive_tweets = pos_sample_tweets + pos_custom_tweets
negative_tweets = neg_sample_tweets + neg_custom_tweets
```
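The `import_csv` helper treats every CSV cell as one tweet, so a simple one-column file is enough. A hypothetical sketch of writing such a file (the file name matches the one assumed above, and the tweet strings are made up for illustration):

```python
import csv

# hypothetical one-column CSV of tweets labeled as positive
sample_positive_tweets = [
    "Loving the new release, great work!",
    "Customer support sorted my issue in minutes :)"
]

with open('positive_tweets.csv', 'w', newline='') as csv_file:
    csv_writer = csv.writer(csv_file)
    for tweet in sample_positive_tweets:
        csv_writer.writerow([tweet])
```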
### Process the training data
- Break tweets into words using NLTK’s Twitter tokenizer:
```python
from nltk.tokenize import casual_tokenize

# tokenize tweets with the Twitter-aware tokenizer
positive_tweet_tokens = [casual_tokenize(tweet) for tweet in positive_tweets]
negative_tweet_tokens = [casual_tokenize(tweet) for tweet in negative_tweets]
```
- Remove noise, including URL links, @ mentions, punctuation and special characters
- Clean tweets: reduce words to their base forms (lemmas) and remove stop words:
```python
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag
import re, string

def remove_noise(tweet_tokens, stop_words=()):
    '''
    Utility function to get cleaned alphanumeric word tokens.
    Removes URL hyperlinks, @ mentions, punctuation and special characters.
    Removes stop words and normalizes word forms.

    Arguments:
        tweet_tokens: Tokens to be cleaned.
        stop_words: Stop words as a list of strings.

    Returns:
        cleaned_tokens: Cleaned tokens.
    '''
    cleaned_tokens = []
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(tweet_tokens):
        # replace URL hyperlinks with an empty string
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)

        # replace @ mentions with an empty string
        token = re.sub(r'(@[A-Za-z0-9_]+)', '', token)

        # map the POS tag to a WordNet part of speech for the lemmatizer
        if tag.startswith('NN'):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        # normalize word forms using the lemmatizer
        token = lemmatizer.lemmatize(token, pos)

        # keep tokens that are not empty, punctuation or stop words
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())

    return cleaned_tokens

# set cleaned tokens lists
positive_cleaned_tokens_list = []
negative_cleaned_tokens_list = []
stop_words = stopwords.words('english')

# get cleaned positive tokens
for tokens in positive_tweet_tokens:
    positive_cleaned_tokens_list.append(remove_noise(tokens, stop_words))

# get cleaned negative tokens
for tokens in negative_tweet_tokens:
    negative_cleaned_tokens_list.append(remove_noise(tokens, stop_words))
```
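As a quick spot-check before moving on, comparing one raw token list with its cleaned counterpart shows what the cleaning step removes (a minimal sketch; the exact output depends on the tweet):

```python
# compare a raw tokenized tweet with its cleaned version
print(positive_tweet_tokens[0])
print(positive_cleaned_tokens_list[0])
```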
- Validate processed data before training the sentiment analysis model:
```python
from nltk import FreqDist

def get_all_words(cleaned_tokens_list):
    '''
    Utility function to iterate through a list of token lists.

    Arguments:
        cleaned_tokens_list: A list of token lists.

    Yields:
        token: Each token from the given lists.
    '''
    for tokens in cleaned_tokens_list:
        for token in tokens:
            yield token

# convert token lists into iterable word generators
all_pos_words = get_all_words(positive_cleaned_tokens_list)
all_neg_words = get_all_words(negative_cleaned_tokens_list)

# get frequency distribution of word lists
freq_dist_pos = FreqDist(all_pos_words)
freq_dist_neg = FreqDist(all_neg_words)

# print top 10 positive and negative words
print('Top 10 positive and negative words:')
print(freq_dist_pos.most_common(10))
print(freq_dist_neg.most_common(10))
```
### Train the model
- Split the data into a training dataset and a testing dataset:
```python
import random

def get_tweets_for_model(cleaned_tokens_list):
    '''
    Utility function to format tokens as a dictionary for a model.

    Arguments:
        cleaned_tokens_list: A list of token lists.

    Yields:
        A dictionary mapping each token to True.
    '''
    for tweet_tokens in cleaned_tokens_list:
        yield dict([token, True] for token in tweet_tokens)

# convert tokens to dictionaries for modelling
positive_tokens_for_model = get_tweets_for_model(positive_cleaned_tokens_list)
negative_tokens_for_model = get_tweets_for_model(negative_cleaned_tokens_list)

# assign a label to positive tokens
positive_dataset = [(tweet_dict, "Positive")
                    for tweet_dict in positive_tokens_for_model]

# assign a label to negative tokens
negative_dataset = [(tweet_dict, "Negative")
                    for tweet_dict in negative_tokens_for_model]

# combine and randomize the dataset before training
dataset = positive_dataset + negative_dataset
random.shuffle(dataset)

# split the data into a 70:30 train/test ratio
# (7,000 / 3,000 tweets when using only the 10,000 NLTK samples)
split_index = int(len(dataset) * 0.7)
train_data = dataset[:split_index]
test_data = dataset[split_index:]
```
- Build a Naïve Bayes Classifier using NLTK:
```python
from nltk import NaiveBayesClassifier

# train a Naive Bayes model
classifier = NaiveBayesClassifier.train(train_data)
```
- Validate model accuracy before using with new datasets:
```python
from nltk import classify

# print model accuracy on the held-out test data
print("Model accuracy is:", classify.accuracy(classifier, test_data))

# show_most_informative_features prints its own output
classifier.show_most_informative_features(10)
```
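With accuracy validated, the classifier can be tried on an unseen tweet. The sketch below reuses `casual_tokenize`, `remove_noise` and `stop_words` from the steps above; the sample tweet text is purely illustrative:

```python
from nltk.tokenize import casual_tokenize

# an illustrative, made-up tweet to classify
custom_tweet = "I ordered just once from TerribleCo, they screwed up, never used the app again."

# clean the tweet the same way as the training data
custom_tokens = remove_noise(casual_tokenize(custom_tweet), stop_words)

# classify using the same token-dictionary format used for training
print(classifier.classify(dict([token, True] for token in custom_tokens)))
```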