MVP: User Stories - Abhishekkalra88/Entity-Media-Sentiment-Analyzer GitHub Wiki

Entity Media Sentiment Analyzer MVP - Architecture Diagram

(EMSA MVP Architecture Diagram image)

[EMSA-010] Application User Input Interface (Web-based)

Script:

Landing_Template_Fl.html
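
A minimal sketch of what Landing_Template_Fl.html might contain, assuming a plain HTML form; the field names ent, page, and narrative and the /details action come from the Flask handler in EMSA-020, while the markup itself is illustrative:

    <!-- hypothetical landing form; field names match the EMSA-020 Flask handler -->
    <form action="/details" method="POST">
      <label>Entity name: <input type="text" name="ent"></label>
      <label>Number of results: <input type="number" name="page" value="5"></label>
      <label>Generate case narrative?
        <select name="narrative"><option>Yes</option><option>No</option></select>
      </label>
      <input type="submit" value="Analyze">
    </form>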

Output:

[EMSA-020] Input Information Ingestor, Webpage Extractor, and Sentiment Generator

Script:

# required imports; NLTK also needs the stopwords, punkt, and vader_lexicon data
import string

import pandas as pd
import requests
from bs4 import BeautifulSoup
from googlesearch import search  # `search` comes from the googlesearch module (PyPI `google` package)
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA


class media_analyzer():

    def __init__(self, query, number_results):
        self.query = query
        self.number_of_results = number_results
 
    # function to extract links to Google result pages
    def link_extractor(self):
        self.html_link = []
        query = self.query
        numb = self.number_of_results
        for j in search(query, tld="com", num=numb, stop=numb, pause=4):
            self.html_link.append(j)
        return self.html_link
    
    # function to extract text from the searched pages
    def text_extractor(self, source_list):  # takes in a list of html links
        self.text_list = []
        for link in source_list:
            response = requests.get(link)  # fetch once and reuse the response
            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                page_text = soup.find_all('p')
                text = ""
                for element in page_text:
                    if element.text != "":
                        text += element.text.encode("ascii", errors="ignore").decode(errors="ignore").lower()
                        text += " "
                self.text_list.append(text)
        return self.text_list  # returns a list of text parsed from the searched html links
    
    # function to remove punctuation, remove stopwords, and tokenize string content into words
    @staticmethod
    def cleaner(review):  # takes in a string
        stop_words = set(stopwords.words('english'))
        word_list = word_tokenize(review)
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in word_list]
        text = ""
        for word in stripped:
            if word not in stop_words:
                text += word
                text += " "
        return text  # returns a string
    
    # function to generate text summary statistics (word frequencies)
    @classmethod
    def text_summary(cls, text_list):  # takes in a list of strings
        wordcount = {}
        for i in text_list:
            strng = cls.cleaner(i.lower())  # takes a string, returns a string
            for word in strng.lower().split():
                if word not in wordcount:
                    wordcount[word] = 1
                else:
                    wordcount[word] += 1
        return sorted(wordcount.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    
    # function to load the Bribery & Corruption (ABC), Terrorist Financing (TF),
    # and Money Laundering (ML) keyword lists from their text files
    @classmethod
    def TF_ML_ABC_Keywords(cls):
        ABC_keywords = []
        TF_keywords = []
        ML_keywords = []
        with open("ABC_Keywords.txt", "r", errors='ignore') as f:
            for x in f:
                ABC_keywords.append(cls.cleaner(x).lower().strip())
        with open("AML_Keywords.txt", "r", errors='ignore') as f:
            for x in f:
                ML_keywords.append(cls.cleaner(x).lower().strip())
        with open("TF_Keywords.txt", "r", errors='ignore') as f:
            for x in f:
                TF_keywords.append(cls.cleaner(x).lower().strip())
        return ABC_keywords, TF_keywords, ML_keywords
    
    # function to generate word frequencies restricted to a given keyword list
    @classmethod
    def text_summary_keywords(cls, text_list, keyword_list):  # takes in two lists
        wordcount = {}
        for i in text_list:
            strng = cls.cleaner(i.lower())  # takes a string, returns a string
            for word in strng.lower().split():
                if word in keyword_list:
                    if word not in wordcount:
                        wordcount[word] = 1
                    else:
                        wordcount[word] += 1
        return sorted(wordcount.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    
    
    # function for sentiment analysis using NLTK's built-in VADER sentiment analyzer
    @classmethod
    def polarity_calculator(cls, string_data):  # takes in a string
        sia = SIA()
        strng = cls.cleaner(string_data)
        pol_score = sia.polarity_scores(strng)
        pol_score['article'] = string_data
        return pol_score  # returns a dict
    
    # function to generate the net score, i.e. positive or negative media coverage,
    # on the basis of the search results
    @classmethod
    def entity_score(cls, score_list):  # takes in a list of polarity dicts
        df = pd.DataFrame.from_records(score_list)
        df['label'] = 0
        df.loc[df['compound'] > 0.2, 'label'] = 1
        df.loc[df['compound'] < -0.2, 'label'] = -1
        gk = df.groupby('label')
        keys_out = gk.groups.keys()

        if 1 in keys_out and -1 in keys_out:
            score = len(gk.get_group(1)) - len(gk.get_group(-1))
        elif 1 in keys_out:
            score = len(gk.get_group(1))
        elif -1 in keys_out:
            score = -len(gk.get_group(-1))  # negative-only coverage must yield a negative score
        else:
            score = 0
        return score
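
# A quick usage sketch of the class above (assuming the NLTK stopwords, punkt,
# and vader_lexicon resources are already downloaded); the entity name and
# result count are illustrative, not from the original script:

def _demo_media_analyzer():
    inst = media_analyzer("Acme Corp", 5)       # hypothetical query
    links = inst.link_extractor()               # Google result URLs
    texts = inst.text_extractor(links)          # <p> text scraped from each page
    scores = [media_analyzer.polarity_calculator(t) for t in texts]
    print(media_analyzer.entity_score(scores))  # net positive minus negative article count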

# FLASK APP LOADING....
import flask

application = flask.Flask(__name__)  # a Flask object

@application.route('/', methods=['POST', 'GET'])  # entity_search landing page
def ask_entity():
    return flask.render_template('Landing_Template_fl.html')

@application.route('/details', methods=['POST', 'GET'])
def entity_analysis():

    entity_name = flask.request.form.get('ent')
    page = flask.request.form.get('page')
    worddoc = flask.request.form.get('narrative')
    no_id = flask.request.form.get('no_id')  # from a GET (URL)

    if entity_name:
        page = int(page)  # number of search results requested; converted only once 'ent' is present
        inst = media_analyzer(entity_name, page)
        out_1 = inst.link_extractor()
        out_2 = inst.text_extractor(out_1)
        out_3 = []
        for i in out_2:
            score = media_analyzer.polarity_calculator(i)
            out_3.append(score)
        final_score = int(media_analyzer.entity_score(out_3))
        
        if final_score > 0:
            sentiment = "Positive"
        elif final_score < 0:
            sentiment = "Negative"
        else:
            sentiment = "Neutral"

        keyword_list = media_analyzer.text_summary(out_2)[:5]
        ABC_keywords, TF_keywords, ML_keywords = media_analyzer.TF_ML_ABC_Keywords()
        ABC_list = media_analyzer.text_summary_keywords(out_2, ABC_keywords)[:5]
        TF_list = media_analyzer.text_summary_keywords(out_2, TF_keywords)[:5]
        ML_list = media_analyzer.text_summary_keywords(out_2, ML_keywords)[:5]

        
        if worddoc == 'Yes':
            # case narrative generation (EMSA-070) is invoked here once integrated
            # case_narrative(entity_name,final_score,out_1,keyword_list,page,TF_list,ML_list,ABC_list)
            pass

        msg = flask.render_template('Result_Template.html', obj=inst, score=final_score,
                                    word_s=keyword_list, final_sentiment=sentiment,
                                    word_TF=TF_list, word_ML=ML_list, word_ABC=ABC_list)
        
        # writing to the database: flatten each result list into a comma-separated string
        url_string = ""
        keyword_string = ""
        ML_string = ""
        ABC_string = ""
        TF_string = ""

        for i in out_1:
            url_string += str(i)
            url_string += ","

        for i in keyword_list:
            for a in i:
                keyword_string += str(a)
                keyword_string += " "
            keyword_string += ","

        for j in ML_list:
            for a in j:
                ML_string += str(a)
                ML_string += " "
            ML_string += ","

        for k in ABC_list:
            for a in k:
                ABC_string += str(a)
                ABC_string += " "
            ABC_string += ","

        for d in TF_list:
            for a in d:
                TF_string += str(a)
                TF_string += " "
            TF_string += ","

        database_operations.database_write(entity_name, page, url_string, final_score,
                                           keyword_string, ML_string, ABC_string, TF_string)
    
    elif no_id:
        msg = 'No Object Details.'

    else:
        raise ValueError('\nraised error: no "ent" or "no_id" params passed in request')

    return '<PRE>{}</PRE>'.format(msg)

if __name__ == '__main__':
    application.run(debug=True, port=5000)    # app starts serving in debug mode on port 5000        
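
A quick way to exercise the /details endpoint once the app is serving locally; a sketch assuming the requests library, with illustrative form values:

    # hypothetical smoke test, run from a separate Python session
    import requests

    resp = requests.post("http://127.0.0.1:5000/details",
                         data={"ent": "Acme Corp", "page": 5, "narrative": "No"})
    print(resp.status_code)
    print(resp.text[:500])  # first part of the rendered result page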

[EMSA-030] Hosting the Entity Media Analyzer on a Cloud Server

AWSEB - Entity Media Sentiment Analyzer Application. The Flask object in EMSA-020 is deliberately named application, since AWS Elastic Beanstalk's Python platform looks for a WSGI callable with that name.

Output:

AWS Cloud URL : http://entity-media-sentiment-analyzer-dev.us-west-2.elasticbeanstalk.com/?

[EMSA-040] Sentiment Classifier Benchmarking

Script:

# required imports for the benchmarking script
import random
import string

import nltk
import pandas as pd
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA

documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

# cleaning up the word features
def cleaner(review):  # takes in a string (a single word here)
    stop_words = set(stopwords.words('english'))
    table = str.maketrans('', '', string.punctuation)
    stripped = review.translate(table)
    text = ""
    if stripped not in stop_words:
        text = stripped
    return text  # returns a string

all_words_1 = []

for w in movie_reviews.words():
    if cleaner(w.lower()):
        all_words_1.append(w.lower())

# Define the feature extractor
all_words = nltk.FreqDist(w.lower() for w in all_words_1)
word_features = list(all_words)[:2000]  # selecting the top 2000 word features



def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

# Train the Naive Bayes classifier
featuresets = [(document_features(d), c) for (d, c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(train_set)

# Evaluating the classifier accuracy
# print(nltk.classify.accuracy(classifier, test_set))

# Training and evaluating via NLTK's SentimentAnalyzer wrapper
from nltk.classify import NaiveBayesClassifier
from nltk.sentiment import SentimentAnalyzer

sentim_analyzer = SentimentAnalyzer()
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, train_set)
for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
    print('{0}: {1}'.format(key, value))
"""
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [neg]: 0.8113207547169812
F-measure [pos]: 0.7872340425531915
Precision [neg]: 0.7818181818181819
Precision [pos]: 0.8222222222222222
Recall [neg]: 0.8431372549019608
Recall [pos]: 0.7551020408163265
"""
#Vader Classifier
#http://akashsenta.com/blog/sentiment-analysis-with-vader-with-python/


    
def cleaner_sentence(review):  # takes in a string
    stop_words = set(stopwords.words('english'))
    word_list = word_tokenize(review)
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in word_list]
    text = ""
    for word in stripped:
        if word not in stop_words:
            text += word
            text += " "
    return text  # returns a string

# function for sentiment analysis using NLTK's built-in VADER sentiment analyzer

def polarity_calculator(string_data):  # takes in a string
    sia = SIA()
    strng = cleaner_sentence(string_data)
    pol_score = sia.polarity_scores(strng)
    pol_score['article'] = string_data
    return pol_score  # returns a dict

# function to generate the net score, i.e. positive or negative media coverage, on the basis of the search results
# Scoring methodology: https://github.com/cjhutto/vaderSentiment#about-the-scoring
    
data_corpora = [[movie_reviews.raw(fileid),fileid[:3]] for fileid in movie_reviews.fileids()]

score_list = []
label =[]
for i in data_corpora:
    score_list.append(polarity_calculator(i[0]))
    label.append(i[1])

df = pd.DataFrame(score_list)
df['label'] = label

df['score'] = df['compound'].apply(lambda score: 'pos' if score > 0.05 else 'neg')  # binarize at VADER's 0.05 threshold
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
accuracy_score(df['label'],df['score']) #0.61
print(classification_report(df['label'],df['score']))
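
Since confusion_matrix is already imported above, a short follow-up (a sketch; the explicit label order is my choice) shows where VADER's misclassifications fall:

    # rows are true labels, columns are predicted labels
    print(confusion_matrix(df['label'], df['score'], labels=['pos', 'neg']))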

[EMSA-050] Application Integration with a Database

Script:

import pymysql


class database_operations():

    @classmethod
    def database_write(cls, entity_name, page, url, final_score, common_keyword, ML_list, ABC_list, TF_list):
        # database connection details (credentials masked)
        host = "database-a.cx7qexz64alu.us-west-2.rds.amazonaws.com"
        port = 3306
        dbname = "EMSAdatabase"
        user = "XXXXXXXXXXX"
        password = "XXXXXXXXXXX"

        conn = pymysql.connect(host=host, user=user, port=port,
                               password=password, db=dbname)
        mycursor = conn.cursor()
        mycursor.execute("CREATE TABLE IF NOT EXISTS EMSAdatabase.EMSA_SEARCH_HISTORY ("
                         "rowid int AUTO_INCREMENT PRIMARY KEY, Entity_Name varchar(255), page int, "
                         "url varchar(255), Score int, common_keywords varchar(255), ML_keywords varchar(255), "
                         "ABC_keywords varchar(255), TF_keywords varchar(255));")
        # parameterized INSERT guards against SQL injection
        sql = ("INSERT INTO EMSAdatabase.EMSA_SEARCH_HISTORY "
               "(entity_name, page, url, score, common_keywords, ML_keywords, ABC_keywords, TF_keywords) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s)")
        val = (entity_name, page, url, final_score, common_keyword, ML_list, ABC_list, TF_list)
        mycursor.execute(sql, val)
        conn.commit()
        conn.close()
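
A hypothetical call, mirroring how the EMSA-020 handler invokes the write; every value below is illustrative:

    database_operations.database_write(
        "Acme Corp", 5,                                # entity and page count
        "http://example.com/a,http://example.com/b,",  # flattened URL string
        3,                                             # final score
        "acme 12 ,corp 9 ,", "", "", "")               # keyword strings (common, ML, ABC, TF)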

Output:

AWS RDS - MySQL Database

[EMSA-060] Application Results User Interface (Web-based)

Script:

Results_Template.html
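
A minimal sketch of the results markup, assuming a plain Jinja2 template; the variable names come from the render_template call in EMSA-020, and everything else is illustrative:

    <!-- hypothetical results template; variables match the EMSA-020 render_template call -->
    <h1>Sentiment: {{ final_sentiment }} (score: {{ score }})</h1>
    <h2>Top 5 keywords</h2>
    <ul>{% for word, count in word_s %}<li>{{ word }}: {{ count }}</li>{% endfor %}</ul>
    <h2>Risk keywords</h2>
    <p>TF: {{ word_TF }} | ML: {{ word_ML }} | ABC: {{ word_ABC }}</p>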

Output:

[EMSA-070] Case Narrative Generation

Script:

# reconstructed wrapper: the wiki shows only the function body, so the signature
# below is an assumption based on the commented-out call in EMSA-020, extended
# with the sentiment and summ (introduction text) values the body also uses
from fpdf import FPDF

def case_narrative(entity_name, final_score, out_1, keyword_list, page,
                   TF_list, ML_list, ABC_list, sentiment, summ):
    pdf = FPDF(orientation='P', unit='mm', format='Letter')
    pdf.add_page()
    pdf.set_font("Arial", 'B', size=18)
    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Case Narrative Template", ln=1, align='C')
    pdf.line(5.0, 5.0, 205.0, 5.0)
    pdf.set_font("Arial", size=15)
    pdf.set_text_color(10, 10, 10)
    pdf.cell(200, 10, txt="Entity Name : " + entity_name, ln=2, align='C')
    pdf.set_font("Arial", size=12)
    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Introduction ", ln=3, align='C')
    pdf.set_text_color(10, 10, 10)
    pdf.multi_cell(0, 10, summ)
    pdf.set_font("Arial", 'B', size=10)
    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Entity Media Analyser", ln=18)
    pdf.set_text_color(10, 10, 10)
    pdf.cell(200, 10, txt="Sentiment : " + sentiment, ln=19)
    pdf.cell(200, 10, txt="Final Score :" + str(final_score), ln=20)
    pdf.set_font("Arial", 'B', size=10)
    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Top 5 keywords in the webpage", ln=22)
    pdf.set_text_color(10, 10, 10)
    # top 5 keyword list from the media_analyzer class
    for i in keyword_list:
        pdf.multi_cell(0, 10, str(i))

    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Top 5 Money Laundering keywords in the webpage", ln=28)
    pdf.set_text_color(10, 10, 10)
    # top 5 Money Laundering keywords from the media_analyzer class
    for i in ML_list:
        pdf.multi_cell(0, 10, str(i))

    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Top 5 Terrorist Financing keywords in the webpage", ln=34)
    pdf.set_text_color(10, 10, 10)
    # top 5 Terrorist Financing keywords from the media_analyzer class
    for i in TF_list:
        pdf.multi_cell(0, 10, str(i))

    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Top 5 Bribery & Corruption keywords in the webpage", ln=46)
    pdf.set_text_color(10, 10, 10)
    # top 5 Bribery & Corruption keywords from the media_analyzer class
    for i in ABC_list:
        pdf.multi_cell(0, 10, str(i))

    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Insights", ln=65, align='C')
    pdf.set_text_color(10, 10, 10)
    pdf.multi_cell(0, 10, "Based on the web page search below, focal entity " + entity_name +
                   " appears in " + str(page) + " web pages; the overall sentiment is " + sentiment +
                   " and the final sentiment score is " + str(final_score))
    pdf.set_text_color(50, 143, 220)
    pdf.cell(200, 10, txt="Sources", ln=70, align='C')
    pdf.set_text_color(10, 10, 10)

    for i in out_1:
        pdf.multi_cell(0, 10, i)
    pdf.output("case_summary" + entity_name + ".pdf", 'F')
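
A hypothetical invocation with the values computed in the EMSA-020 handler; summ (the introduction paragraph) is whatever narrative text the caller supplies:

    case_narrative(entity_name, final_score, out_1, keyword_list, page,
                   TF_list, ML_list, ABC_list, sentiment,
                   summ="Introductory narrative text.")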

Output: Case Narrative sample template
