Demo entry 6658038

test

   

Submitted by anonymous on Nov 06, 2017 at 02:23
Language: Python 3. Code size: 4.2 kB.

import nltk
from nltk.tokenize import *
from nltk.util import ngrams
from nltk.classify import *
import preprocessor as p
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

# 1. Read only the sentiment label and the tweet text from the CSV file, then
#    discard every tweet whose sentiment is neutral and renumber the rows.
print ('Loading data.....')
fields = ['airline_sentiment', 'text']
data = (
    pd.read_csv(r'Tweets.csv', usecols=fields)
    .loc[lambda frame: frame['airline_sentiment'] != "neutral"]
    .reset_index(drop=True)
)
# Number of remaining (non-neutral) tweets.
n = len(data)



#2. Preprocess every tweet: clean it with the preprocessor package, tokenize
#   it and POS-tag it, then encode the sentiment label as an integer.
print ('Preprocessing.....')

# Sentiment strings -> numeric class labels; any other value is left untouched.
sentiment_codes = {'positive': 1, 'negative': 0}

# The columns are rebuilt in one pass and assigned whole instead of being
# written cell by cell: DataFrame.set_value() was removed in pandas 1.0, and
# the columns are read by name because positional row[0]/row[1] access
# depends on the column order of the CSV file.
tagged_tweets = [
    nltk.pos_tag(word_tokenize(p.clean(tweet))) for tweet in data['text']
]
# Each 'text' cell now holds the tagger output for that tweet. The explicit
# object dtype keeps every per-tweet list as a single cell value.
data['text'] = pd.Series(tagged_tweets, index=data.index, dtype=object)
data['airline_sentiment'] = [
    sentiment_codes.get(label, label) for label in data['airline_sentiment']
]

#3. Load positive and negative words from txt files
def _load_word_list(path):
    """Return the set of lines in the text file at *path*, whitespace-stripped.

    A set is returned because the word lists are only used for membership
    tests (``word in positive``), which are then O(1) instead of a linear
    scan of a list for every token.
    """
    # NOTE(review): the file is opened with the platform default encoding, as
    # before -- confirm this matches the encoding of the lexicon files.
    with open(path) as file:
        return {line.strip() for line in file}


positive = _load_word_list("positive-words.txt")
negative = _load_word_list("negative-words.txt")


# 4. Build the frame that holds one row per tweet: six n-gram feature columns
#    initialised to 0, plus the class label column 'y'.
print ('Creating Data.....')
df = pd.DataFrame(
    0,
    index=range(n),
    columns=['PUM', 'NUM', 'PBM', 'NBM', 'PTM', 'NTM', 'y'],
)
# Copy the encoded sentiment labels into the existing 'y' column.
df['y'] = data['airline_sentiment']

#5. Unigram
#   For every token tagged JJ or RB, add its polarity to PUM when the word is
#   in the positive list, or to NUM when it is in the negative list. The
#   polarity starts at +1 and is flipped once for each "not" / "n't" found
#   among the (up to) three tokens immediately before the word.
print ('Caluculating Unigram.....')
for index, tagged in data['text'].items():
    PUM = 0
    NUM = 0
    for i, (word, tag) in enumerate(tagged):
        if tag not in ('JJ', 'RB'):
            continue
        # Polarity is decided per word, so a negation only affects the words
        # that directly follow it and not the rest of the tweet.
        add = 1
        # The slice is clamped at 0: a negative index would wrap around and
        # inspect tokens at the END of the tweet instead of the ones before.
        for previous_word, _previous_tag in tagged[max(0, i - 3):i]:
            if previous_word in ("not", "n't"):
                add = -add
        if word in positive:
            PUM += add
        elif word in negative:
            NUM += add
    # DataFrame.set_value() was removed in pandas 1.0; .at is the replacement
    # for writing a single cell by label.
    df.at[index,'PUM'] = PUM
    df.at[index,'NUM'] = NUM
        
#6. Bigram
#   For every pair of adjacent tokens in which at least one token is tagged
#   JJ or RB, count the pair in PBM when either word is in the positive list,
#   otherwise in NBM when either word is in the negative list.
print ('Caluculating Bigram.....')
for index, tagged in data['text'].items():
    PBM = 0
    NBM = 0
    for (first_word, first_tag), (second_word, second_tag) in ngrams(tagged, 2):
        if first_tag in ('JJ', 'RB') or second_tag in ('JJ', 'RB'):
            if first_word in positive or second_word in positive:
                PBM += 1
            elif first_word in negative or second_word in negative:
                NBM += 1
    # Store the totals once per tweet (not once per bigram). .at replaces
    # DataFrame.set_value(), which was removed in pandas 1.0.
    df.at[index,'PBM'] = PBM
    df.at[index,'NBM'] = NBM

#7. Trigram
#   For every window of three adjacent tokens whose MIDDLE token is tagged JJ
#   or RB, count the window in PTM when the middle word is in the positive
#   list, otherwise in NTM when it is in the negative list.
print ('Caluculating Trigram.....')
for index, tagged in data['text'].items():
    PTM = 0
    NTM = 0
    for _left, (middle_word, middle_tag), _right in ngrams(tagged, 3):
        if middle_tag in ('JJ', 'RB'):
            if middle_word in positive:
                PTM += 1
            elif middle_word in negative:
                NTM += 1
    # Store the totals once per tweet (not once per trigram). .at replaces
    # DataFrame.set_value(), which was removed in pandas 1.0.
    df.at[index,'PTM'] = PTM
    df.at[index,'NTM'] = NTM


#8. Split training and testing data and use three different ML algorithms
print ('Spliting data.....')
# All six feature columns. The previous slice row[:5] stopped one column
# short and silently dropped NTM.
feature_names = ['PUM','NUM','PBM','NBM','PTM','NTM']
# Each item is a (features, label) pair, with the features as a plain dict
# of feature name -> int, accessed by column name rather than by position.
featuresets = [
    ({name: int(row[name]) for name in feature_names}, row['y'])
    for _, row in df.iterrows()
]
# First 80% of the rows (in file order, no shuffling) train the models; the
# remaining 20% are used to measure accuracy.
split = int(n*0.8)
trainset = featuresets[:split]
testset = featuresets[split:]

print ('Training and Testing by using Naive Bayes Classifier.....')
classifier = NaiveBayesClassifier.train(trainset)
print ('accuracy by using Naive Bayes Classifier:',nltk.classify.util.accuracy(classifier,testset))

print ('Training and Testing by using Max Entropy.....')
classifier = MaxentClassifier.train(trainset)
print ('accuracy by using Max Entropy:',nltk.classify.util.accuracy(classifier,testset))


print ('Training and Testing by using SVM.....')
classifier = SklearnClassifier(SVC()).train(trainset)
print ('accuracy by using SVM:',nltk.classify.util.accuracy(classifier,testset))

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).