-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsentiment analysis.py
More file actions
64 lines (53 loc) · 1.9 KB
/
sentiment analysis.py
File metadata and controls
64 lines (53 loc) · 1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 12 12:49:21 2016
@author: YI
"""
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from senti_classifier import senti_classifier
# Read the CSV and convert the 'time' column to YYYY-MM-DD date strings.
def read(a):
    """Load a tweet CSV and return its text indexed by calendar date.

    The file at ``a`` must provide ``time`` and ``text`` columns.  ``time`` is
    parsed with the ``%Y-%m-%d`` format and rendered back to a string in that
    same format; those strings become the index of the result.

    Returns a DataFrame with a single ``Text`` column, one row per CSV row.
    """
    raw = pd.read_csv(a)
    # Fetch both columns up front so a missing column is reported before any
    # date parsing is attempted.
    stamps, bodies = raw['time'], raw['text']

    def as_day_label(stamp):
        return stamp.strftime('%Y-%m-%d')

    labels = pd.to_datetime(stamps, format='%Y-%m-%d').map(as_day_label)
    # One single-element row per tweet, so the frame gets exactly one column.
    rows = [[body] for body in bodies]
    return pd.DataFrame(rows, index=labels, columns=['Text'])
# Load amazon.csv into a one-column ('Text') frame indexed by date string.
tweet=read('amazon.csv')
# Keep only the rows labelled 2016-02-26; passing the label inside a list makes
# .loc return a DataFrame even if a single row matches.
sub226=tweet.loc[['2016-02-26']]
# Pre-process one tweet: strip URLs, non-letter characters and English stopwords.
# Matches http(s)://, www. and bare-domain URLs, optionally containing
# parenthesised segments. Compiled once because words() runs for every tweet.
_URL_PATTERN = re.compile(r'''(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')
_NON_LETTER_PATTERN = re.compile('[^a-zA-Z]')
# Filled on first use so importing this module does not touch the NLTK corpus.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a set, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def words(text):
    """Return ``text`` lower-cased with URLs, non-letters and stopwords removed.

    Args:
        text: Raw tweet text (a string).

    Returns:
        The surviving words joined by single spaces; an empty string when
        nothing is left.
    """
    without_urls = _URL_PATTERN.sub('', text)
    # Every character that is not an ASCII letter becomes a space, so digits
    # and punctuation act as word separators.
    letters_only = _NON_LETTER_PATTERN.sub(' ', without_urls)
    stop = _english_stopwords()
    return ' '.join(w for w in letters_only.lower().split() if w not in stop)
# Clean the text of every tweet in the selected subset.
def corpus(sub):
    """Clean every entry of ``sub['Text']`` with ``words``.

    Args:
        sub: DataFrame (or mapping) whose ``'Text'`` entry is an iterable of
            raw tweet strings.

    Returns:
        A list of cleaned strings, in the same order as the input rows.
    """
    # Iterate the values themselves rather than indexing with 0..n-1: the
    # frames built by read() are labelled by date strings, so integer lookups
    # depend on pandas' deprecated positional fallback, and xrange does not
    # exist on Python 3.
    return [words(raw_text) for raw_text in sub['Text']]
# Cleaned text of every tweet from the selected day, as a list of strings.
sub=corpus(sub226)
# Calculate the sentiment score: the share of sampled sentences that are net positive.
def sentiment_score(dataset, sample):
    """Return the share of sampled sentences whose sentiment is net positive.

    Only every ``sample``-th sentence (positions 0, sample, 2*sample, ...) is
    passed to ``senti_classifier.polarity_scores``; the others are skipped
    and do not contribute to the score.

    Args:
        dataset: Iterable of sentences.
        sample: Sampling stride; 1 classifies every sentence.

    Returns:
        float: positive sentences divided by classified sentences, or 0.0
        when ``dataset`` is empty.

    Raises:
        ZeroDivisionError: if ``sample`` is 0 and ``dataset`` is not empty.
    """
    classified = positive = 0
    for position, sentence in enumerate(dataset):
        if position % sample != 0:
            continue
        pos_score, neg_score = senti_classifier.polarity_scores([sentence])
        classified += 1
        if pos_score - neg_score > 0:
            positive += 1
    if classified == 0:
        return 0.0
    # Divide by the number of sentences actually classified; float() keeps the
    # result fractional under Python 2 integer division as well.
    return positive / float(classified)