forked from dewarim/data-tools-for-reddit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscoreCommentsJson.py
More file actions
92 lines (76 loc) · 2.84 KB
/
scoreCommentsJson.py
File metadata and controls
92 lines (76 loc) · 2.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
# -*- coding: utf-8 -*-
"""
Read original comment data in .bz2 JSON format and compute sentiment score.
Write sentiment data to tab-separated file.
Print current comment count every 1000 comments.
Based on: https://github.com/megansquire/masteringDM/blob/master/ch5/scoreLinusEmail.py by megan
Use nltkDownload.py first to download the required data files for sentiment analysis.
Needs a python with some extras - you can use the community edition of Anaconda:
https://www.continuum.io/downloads
"""
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize
import bz2
import json
import sys
import os.path
sid = SentimentIntensityAnalyzer()
archive = "data/RC_2007-10.bz2"
if len(sys.argv) > 1:
# when started via "python scoreCommentsJson.py /data/location/reddit_comments.bz2,
# the first element of sys.argv seems to be the script name, not the path, like in Java.
archive = sys.argv[1]
else:
print("No command line arguments given - trying to work with default example data file "+archive)
print("Working on file: " + archive)
score_file_name = archive.replace("bz2", "sentiment")
if os.path.exists(score_file_name):
print("sentiment file already exists")
sys.exit()
bz_file = bz2.BZ2File(archive, 'rb', 1000000)
score_file = open(score_file_name, 'w')
commentCount = 0
while True:
line = bz_file.readline().decode('utf8')
if len(line) == 0:
break
comment = json.loads(line)
# print(comment)
id = comment["id"]
body = comment["body"]
# variables to hold the overall average compound score for message
finalScore = 0
roundedFinalScore = 0
# variables to hold the highest positive score in the message
# and highest negative score in the message
maxPosScore = 0
maxNegScore = 0
# print("===")
commentLines = tokenize.sent_tokenize(body)
for line in commentLines:
ss = sid.polarity_scores(line)
# uncomment these lines if you want to print out sentences & scores
'''
line = line.replace('\n', ' ').replace('\r', '')
print(line)
for k in sorted(ss):
print(' {0}: {1}\n'.format(k,ss[k]), end='')
'''
lineCompoundScore = ss['compound']
finalScore += lineCompoundScore
if ss['pos'] > maxPosScore:
maxPosScore = ss['pos']
elif ss['neg'] > maxNegScore:
maxNegScore = ss['neg']
# roundedFinalScore is the average compound score for the entire message
commentLength = len(commentLines)
if commentLength == 0:
commentLength = 1
roundedFinalScore = round(finalScore / commentLength, 4)
score_file.write("{0}\t{1}\t{2}\t{3}\n".format(roundedFinalScore, maxPosScore, maxNegScore, id))
commentCount += 1
if commentCount % 1000 == 0:
print(commentCount)
# break
bz_file.close()
score_file.close()