MVP: User Stories - Abhishekkalra88/Entity-Media-Sentiment-Analyzer GitHub Wiki
class media_analyzer():
    """Search Google for an entity, scrape the result pages, and score the
    entity's media sentiment with NLTK's VADER analyzer.

    Typical flow: link_extractor() -> text_extractor() ->
    polarity_calculator() per article -> entity_score().
    """

    def __init__(self, query, number_results):
        # query: entity name to search for; number_results: result pages to fetch
        self.query = query
        self.number_of_results = number_results

    def link_extractor(self):
        """Run the Google search and collect result URLs.

        Returns the list of links (also stored on self.html_link).
        """
        numb = self.number_of_results
        # pause=4 throttles requests so Google does not rate-limit the crawler
        self.html_link = list(search(self.query, tld="com", num=numb,
                                     stop=numb, pause=4))
        return self.html_link

    def text_extractor(self, source_list):
        """Fetch each URL and concatenate the text of its <p> tags.

        source_list: list of HTML links.  Returns a list with one lowered,
        ASCII-only text string per successfully fetched (HTTP 200) page;
        non-200 responses are skipped.
        """
        self.text_list = []
        for link in source_list:
            # fetch once and reuse (the original issued two GETs per link)
            response = requests.get(link)
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                parts = []
                for element in soup.find_all('p'):
                    if element.text != "":
                        parts.append(element.text
                                     .encode("ascii", errors="ignore")
                                     .decode(errors="ignore").lower())
                        parts.append(" ")
                # join is linear; the old += loop was quadratic on long pages
                self.text_list.append("".join(parts))
        return self.text_list

    @staticmethod
    def cleaner(review):
        """Tokenize *review*, strip punctuation from each token, drop English
        stopwords, and return the survivors as a space-terminated string.

        Note: tokens that were pure punctuation survive as "" and still add a
        space, matching the historical output format.
        """
        stop_words = set(stopwords.words('english'))
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in word_tokenize(review)]
        return "".join(w + " " for w in stripped if w not in stop_words)

    @classmethod
    def text_summary(cls, text_list):
        """Word-frequency summary over a list of article texts.

        Returns (word, count) pairs sorted by count, then word, descending.
        """
        wordcount = {}
        for article in text_list:
            # cleaner() preserves case and the input is lowered here, so the
            # resulting tokens are already lowercase
            for word in cls.cleaner(article.lower()).split():
                wordcount[word] = wordcount.get(word, 0) + 1
        return sorted(wordcount.items(), key=lambda kv: (kv[1], kv[0]),
                      reverse=True)

    @classmethod
    def TF_ML_ABC_Keywords(cls):
        """Load the bribery/corruption (ABC), terrorist-financing (TF) and
        money-laundering (ML) keyword lists from their text files.

        Returns (ABC_keywords, TF_keywords, ML_keywords).
        """
        def load(path):
            # context manager closes the file (the original leaked handles)
            with open(path, "r", errors='ignore') as f:
                return [cls.cleaner(line).lower().strip() for line in f]
        return (load("ABC_Keywords.txt"),
                load("TF_Keywords.txt"),
                load("AML_Keywords.txt"))

    @classmethod
    def text_summary_keywords(cls, text_list, keyword_list):
        """Like text_summary(), but count only words present in keyword_list."""
        keywords = set(keyword_list)  # O(1) membership instead of O(n) list scans
        wordcount = {}
        for article in text_list:
            for word in cls.cleaner(article.lower()).split():
                if word in keywords:
                    wordcount[word] = wordcount.get(word, 0) + 1
        return sorted(wordcount.items(), key=lambda kv: (kv[1], kv[0]),
                      reverse=True)

    @classmethod
    def polarity_calculator(cls, string_data):
        """Sentiment scores for one article using NLTK's VADER analyzer.

        Returns the polarity dict (neg/neu/pos/compound) with the original
        text attached under 'article'.
        """
        sia = SIA()
        pol_score = sia.polarity_scores(cls.cleaner(string_data))
        pol_score['article'] = string_data
        return pol_score

    @classmethod
    def entity_score(cls, score_list):
        """Net sentiment: (# positive articles) - (# negative articles).

        score_list: list of polarity dicts containing a 'compound' value.
        compound > 0.2 counts positive, compound < -0.2 counts negative.
        BUG FIX: a result set with only negative articles previously
        returned a *positive* count; it now returns a negative score.
        """
        df = pd.DataFrame.from_records(score_list)
        positives = int((df['compound'] > 0.2).sum())
        negatives = int((df['compound'] < -0.2).sum())
        return positives - negatives
# FLASK / REACT APP LOADING....
import flask
# NOTE(review): the name "application" is presumably required by AWS Elastic
# Beanstalk's WSGI discovery — confirm before renaming.
application = flask.Flask(__name__)  # a Flask object

@application.route('/', methods=['POST', 'GET'])  # entity_search
def ask_entity():
    """Landing page: render the entity-search form."""
    return flask.render_template('Landing_Template_fl.html')
def _legacy_join(items):
    """Serialize (word, count) tuples into the legacy 'word count ,' string
    format stored in the search-history table (trailing comma included)."""
    return "".join("".join(str(a) + " " for a in item) + "," for item in items)

@application.route('/details', methods=['POST', 'GET'])
def entity_analysis():
    """Run the full pipeline for the posted entity: search, scrape, score
    sentiment, extract keyword summaries, render the result page, and log
    the run to the search-history database.

    Form fields: 'ent' (entity name), 'page' (result count, required int),
    'narrative' ('Yes' to request a case-narrative doc), 'no_id'.
    Raises ValueError when neither 'ent' nor 'no_id' is supplied.
    """
    entity_name = flask.request.form.get('ent')
    page = int(flask.request.form.get('page'))
    worddoc = flask.request.form.get('narrative')
    no_id = flask.request.form.get('no_id')  # from a GET (URL)
    if entity_name:
        # removed a dead `media_analyzer(entity_name, page)` call and a
        # `page = page` no-op from the original
        inst = media_analyzer(entity_name, page)
        out_1 = inst.link_extractor()
        out_2 = inst.text_extractor(out_1)
        out_3 = [media_analyzer.polarity_calculator(text) for text in out_2]
        final_score = int(media_analyzer.entity_score(out_3))
        if final_score > 0:
            sentiment = "Positive"  # typo fix: was "Postitve"
        elif final_score < 0:
            sentiment = "Negative"
        else:
            sentiment = "Neutral"
        keyword_list = media_analyzer.text_summary(out_2)[:5]
        ABC_keywords, TF_keywords, ML_keywords = media_analyzer.TF_ML_ABC_Keywords()
        ABC_list = media_analyzer.text_summary_keywords(out_2, ABC_keywords)[:5]
        TF_list = media_analyzer.text_summary_keywords(out_2, TF_keywords)[:5]
        ML_list = media_analyzer.text_summary_keywords(out_2, ML_keywords)[:5]
        if worddoc == 'Yes':
            # case-narrative generation is currently disabled
            pass
        msg = flask.render_template('Result_Template.html', obj = inst,
                                    score = final_score, word_s = keyword_list,
                                    final_sentiment = sentiment, word_TF = TF_list,
                                    word_ML = ML_list, word_ABC = ABC_list)
        # writing to the database — serialize lists into the legacy formats
        # (joins replace the original quadratic += loops, same output bytes)
        url_string = "".join(str(link) + "," for link in out_1)
        keyword_string = _legacy_join(keyword_list)
        ML_string = _legacy_join(ML_list)
        ABC_string = _legacy_join(ABC_list)
        TF_string = _legacy_join(TF_list)
        database_operations.database_write(entity_name, page, url_string,
                                           final_score, keyword_string,
                                           ML_string, ABC_string, TF_string)
    elif no_id:
        msg = 'No Object Details.'
    else:
        raise ValueError('\nraised error: no "name" or "no_name" params passed in request')
    return '<PRE>{}</PRE>'.format(msg)
if __name__ == '__main__':
    # app starts serving in debug mode on port 5000 (dev only; debug=True
    # must not be used in production)
    application.run(debug=True, port=5000)
# Deployed AWS Cloud URL: http://entity-media-sentiment-analyzer-dev.us-west-2.elasticbeanstalk.com/
# Build the labeled corpus: one (token-list, category) pair per review,
# then shuffle so the train/test split is not ordered by category.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
random.shuffle(documents)
# Cleaning up the word features
def cleaner(review):
    """Strip punctuation from a single token and drop English stopwords.

    review: one token (string).  Returns the cleaned token, or "" when the
    cleaned token is a stopword — callers use the falsy "" as a filter.
    """
    stop_words = set(stopwords.words('english'))
    table = str.maketrans('', '', string.punctuation)
    stripped = review.translate(table)
    # idiomatic membership test (was "if not stripped in stop_words")
    return stripped if stripped not in stop_words else ""
# Keep only tokens that survive cleaning (cleaner returns "" — falsy — for
# stopwords and pure punctuation).
all_words_1 = [w.lower() for w in movie_reviews.words() if cleaner(w.lower())]
# Define the feature extractor
# Frequency distribution over the cleaned corpus vocabulary.
all_words = nltk.FreqDist(w.lower() for w in all_words_1)
word_features = list(all_words)[:2000] # selecting the top 2000 word features
def document_features(document):
    """Binary bag-of-words features: for each of the top word features,
    record whether it occurs in *document* under a 'contains(word)' key."""
    present = set(document)  # O(1) membership for the 2000 feature lookups
    return {'contains({})'.format(w): (w in present) for w in word_features}
#Train Naive Bayes classifier
featuresets = [(document_features(d), c) for (d,c) in documents]
# hold out the first 100 shuffled documents as the test set
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)
#Evaluating the classifier accuracy
#print(nltk.classify.accuracy(classifier, test_set))
# NOTE(review): `NaiveBayesClassifier` and `sentim_analyzer` are not defined
# in this excerpt — presumably an nltk.sentiment.SentimentAnalyzer configured
# elsewhere; confirm before running this section standalone.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, train_set)
#Training classifier
for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))
# Recorded evaluation output from a previous run:
"""
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [neg]: 0.8113207547169812
F-measure [pos]: 0.7872340425531915
Precision [neg]: 0.7818181818181819
Precision [pos]: 0.8222222222222222
Recall [neg]: 0.8431372549019608
Recall [pos]: 0.7551020408163265
"""
#Vader Classifier
#http://akashsenta.com/blog/sentiment-analysis-with-vader-with-python/
def cleaner_sentence(review):
    """Tokenize *review*, strip punctuation from each token, drop English
    stopwords, and return the survivors as a space-terminated string.

    Tokens reduced to "" (pure punctuation) still contribute a trailing
    space, matching the original output format.
    """
    stop_words = set(stopwords.words('english'))
    table = str.maketrans('', '', string.punctuation)
    stripped = (w.translate(table) for w in word_tokenize(review))
    # "".join is linear; the original += loop was quadratic on long articles
    return "".join(w + " " for w in stripped if w not in stop_words)
# Sentiment analysis using NLTK's built-in VADER analyzer.
def polarity_calculator(string_data):
    """Score one article's sentiment.

    Returns the VADER polarity dict (neg/neu/pos/compound) with the
    original text attached under 'article'.
    """
    analyzer = SIA()
    scores = analyzer.polarity_scores(cleaner_sentence(string_data))
    scores['article'] = string_data
    return scores
# Evaluate VADER against the labeled movie-review corpus.
# Scoring Methodology https://github.com/cjhutto/vaderSentiment#about-the-scoring
# fileid[:3] is the gold label prefix ('pos'/'neg').
data_corpora = [[movie_reviews.raw(fileid), fileid[:3]] for fileid in movie_reviews.fileids()]
score_list = []
label = []
for item in data_corpora:
    score_list.append(polarity_calculator(item[0]))
    label.append(item[1])
df = pd.DataFrame(score_list)
df['label'] = label
# VADER convention: compound > 0.05 -> 'pos', compound < -0.05 -> 'neg',
# otherwise neutral (raw score kept).  BUG FIX: the negative test previously
# used `score < 0.05`, which swallowed the whole neutral band into 'neg'.
df['score'] = df['compound'].apply(
    lambda score: 'pos' if score > 0.05 else ('neg' if score < -0.05 else score))
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy_score(df['label'], df['score'])  # was 0.61 with the old (buggy) threshold
print(classification_report(df['label'], df['score']))
class database_operations():
    """Persistence helper for the EMSA search-history table (MySQL on RDS)."""

    @classmethod
    def database_write(cls, entity_name, page, url, final_score, common_keyword,
                       ML_list, ABC_list, TF_list):
        """Insert one search-history row, creating the table if needed.

        entity_name/url/keyword arguments arrive as pre-serialized strings;
        page and final_score are ints.  The connection is always closed,
        even on failure (the original leaked it).
        """
        # db_details
        # NOTE(review): credentials belong in environment variables or a
        # secrets store, not in source control.
        host = "database-a.cx7qexz64alu.us-west-2.rds.amazonaws.com"
        port = 3306
        dbname = "EMSAdatabase"
        user = "XXXXXXXXXXX"
        password = "XXXXXXXXXXX"
        # keyword host= (the positional form is deprecated in PyMySQL)
        conn = pymysql.connect(host=host, user=user, port=port,
                               passwd=password, db=dbname)
        try:
            with conn.cursor() as mycursor:
                mycursor.execute ("CREATE TABLE IF NOT EXISTS EMSAdatabase.EMSA_SEARCH_HISTORY (rowid int AUTO_INCREMENT PRIMARY KEY, Entity_Name varchar(255),page int, url varchar(255),Score int, common_keywords varchar(255),ML_keywords varchar(255),ABC_keywords varchar(255),TF_keywords varchar(255));")
                # parameterized INSERT: the driver escapes every value
                sql = "INSERT INTO EMSAdatabase.EMSA_SEARCH_HISTORY ( entity_name,page,url,score,common_keywords,ML_keywords,ABC_keywords,TF_keywords) VALUES (%s, %s,%s,%s,%s,%s,%s,%s)"
                val = (entity_name, page, url, final_score, common_keyword,
                       ML_list, ABC_list, TF_list)
                mycursor.execute(sql, val)
            conn.commit()
        finally:
            conn.close()
# --- Case-narrative PDF generation -------------------------------------------
# NOTE(review): this fragment references names defined elsewhere (entity_name,
# summ, sentiment, final_score, keyword_list, ML_list, TF_list, ABC_list,
# out_1, page) — it appears to be the body of the disabled case_narrative
# helper referenced near the /details route; confirm before reusing standalone.
pdf = FPDF(orientation = 'P', unit = 'mm', format='Letter')
pdf.add_page()
# Title: large bold blue heading, centered, with a horizontal rule
pdf.set_font("Arial", 'B',size = 18)
pdf.set_text_color(50, 143, 220)
pdf.cell(200, 10, txt = "Case Narrative Template",ln = 1, align = 'C')
pdf.line(5.0,5.0,205.0,5.0)
# Entity name in black, slightly smaller
pdf.set_font("Arial", size = 15)
pdf.set_text_color(10, 10, 10)
pdf.cell(200, 10, txt = "Entity Name : " + entity_name, ln = 2, align = 'C')
# Introduction section (blue heading, black body text)
pdf.set_font("Arial", size = 12)
pdf.set_text_color(50, 143, 220)
pdf.cell(200, 10, txt = "Introduction ", ln = 3, align = 'C')
pdf.set_text_color(10, 10, 10)
pdf.multi_cell(0,10,summ)  # free-text introduction paragraph
# Media-analyzer results: overall sentiment and net score
pdf.set_font("Arial", 'B',size = 10)
pdf.set_text_color(50, 143, 220)
pdf.cell(200, 10, txt = "Entity Media analyser", ln = 18)
pdf.set_text_color(10, 10, 10)
pdf.cell(200, 10, txt = "Sentiment : "+ sentiment, ln = 19 )
pdf.cell(200, 10, txt = "Final Score :" + str(final_score), ln = 20)
pdf.set_font("Arial", 'B',size = 10)
pdf.set_text_color(50, 143, 220)
pdf.cell(200, 10, txt = "Top 5 Key words in the webpage", ln = 22 )
pdf.set_text_color(10, 10, 10)
# TOP 5 KEY WORD LIST FROM MEDIA ANALYSER CLASS
for i in keyword_list:
    pdf.multi_cell(0,10, (str(i)))
pdf.set_text_color(50, 143, 220)
pdf.cell(200, 10, txt = "Top 5 Money Laundering Key words in the webpage", ln = 28)
pdf.set_text_color(10, 10, 10)
#TOP 5 KEY WORD LIST FROM MEDIA ANALYSER CLASS
for i in ML_list:
    pdf.multi_cell(0,10, (str(i)))
pdf.set_text_color(50, 143, 220)
pdf.cell(200, 10, txt = "Top 5 Terrorist Financing Key words in the webpage", ln = 34 )
pdf.set_text_color(10, 10, 10)
#TOP 5 KEY WORD LIST FROM MEDIA ANALYSER CLASS
for i in TF_list:
    pdf.multi_cell(0,10, (str(i)))
pdf.set_text_color(50, 143, 220)
pdf.cell(200, 10, txt = "Top 5 Bribery & Corruption Key words in the webpage", ln = 46)
pdf.set_text_color(10, 10, 10)
#TOP 5 KEY WORD LIST FROM MEDIA ANALYSER CLASS
for i in ABC_list:
    pdf.multi_cell(0,10, (str(i)))
# Insights paragraph summarizing the run
pdf.set_text_color(50, 143, 220)
pdf.cell(200, 10, txt = "Insights", ln = 65, align = 'C' )
pdf.set_text_color(10, 10, 10)
pdf.multi_cell(0,10,"Based on the below web page search, Focal entity "+entity_name+" appear in " +str(page)+ " web pages and overall sentiment is " + sentiment + " and sentiment final score is "+ str(final_score))
# Source URLs used for the analysis
pdf.set_text_color(50, 143, 220)
pdf.cell(200,10,txt="Sources", ln = 70, align = 'C' )
pdf.set_text_color(10, 10, 10)
for i in out_1:
    pdf.multi_cell(0,10, (i))
# Output: a "Case Narrative" sample-template PDF written to the working dir
pdf.output("case_summary"+ entity_name +".pdf",'F')