-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathFileReader.py
More file actions
130 lines (94 loc) · 3.77 KB
/
FileReader.py
File metadata and controls
130 lines (94 loc) · 3.77 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
from sklearn import externals
import pickle
import re
# FileReader reads the user's input text, splits it into chunks, and labels each chunk.
class FileReader:
    """Split text into chunks and label each chunk with pickled classifiers.

    All methods are static; the class is only a namespace. ``textSplit`` and
    ``fileSplit`` share the same chunk -> vectorize -> predict -> vote
    pipeline; ``textSplit`` additionally strips list numbering and
    unpunctuated text first.
    """

    # Number of "."-separated pieces concatenated into a single chunk.
    _CHUNK_SIZE = 4

    # Model artefacts, resolved relative to the current working directory.
    _CLASSIFIER_PATH = "EULA_Classifier.pkl"
    _VECTORIZER_PATH = "vectorizer.pkl"

    # A newline followed by digits, an optional run of "." / ")" and any
    # trailing whitespace, i.e. line numbers and numbered-list markers.
    _LIST_NUMBER_RE = re.compile(r'(\n[0-9][0-9]*[\.\)]*)\s*')

    # Text that is not followed, on the same line, by any of ". ; , :".
    # The original comments describe this as header removal. Because the
    # final "[^\.]" also matches a newline, a match may swallow the line's
    # trailing newline as well.
    _UNPUNCTUATED_RE = re.compile(r'(?!.*([\.\;\,\:]))(.*[^\.])')

    @staticmethod
    def _groupSentences(text):
        """Cut *text* at every "." and re-join the pieces in runs of four.

        The pieces are concatenated without re-inserting the "." separators,
        exactly as the original code did; the last chunk holds whatever
        pieces remain (one to four).
        """
        pieces = text.split(".")
        size = FileReader._CHUNK_SIZE
        return ["".join(pieces[start:start + size])
                for start in range(0, len(pieces), size)]

    @staticmethod
    def _vote(predictions):
        """Reduce one chunk's per-classifier predictions to a single label.

        Returns 0 when every prediction is distinct (no two classifiers
        agree), otherwise the most frequent label; ties go to the label that
        was seen first.

        NOTE(review): with a single classifier the "all distinct" rule always
        fires, so the result is always 0 -- confirm that this is intended.
        """
        counts = {}
        for label in predictions:
            counts[label] = counts.get(label, 0) + 1
        if len(counts) == len(predictions):
            return 0
        return max(counts, key=counts.get)

    @staticmethod
    def _labelChunks(rawdata):
        """Predict a label for every chunk in *rawdata*.

        Loads the classifiers and the vectorizer from disk on every call,
        lets each classifier predict on the vectorized chunks and combines
        the predictions per chunk with ``_vote``.

        Returns a dict mapping chunk text -> label. Identical chunks collapse
        into one key (the last occurrence wins).
        """
        # NOTE: both loads unpickle data; only use trusted model files.
        # NOTE(review): ``sklearn.externals.joblib`` is deprecated in recent
        # scikit-learn releases -- confirm the pinned version still ships it.
        classifier = externals.joblib.load(FileReader._CLASSIFIER_PATH)
        with open(FileReader._VECTORIZER_PATH, "rb") as handle:
            vectorizer = pickle.load(handle)

        data = vectorizer.transform(rawdata)
        # One prediction sequence per classifier, each indexed by chunk.
        all_labels = [clf.predict(data) for clf in classifier]
        # Debug output kept from the original: number of classifiers used.
        print(len(all_labels))

        final_labels = [
            FileReader._vote([predicted[i] for predicted in all_labels])
            for i in range(len(rawdata))
        ]
        return dict(zip(rawdata, final_labels))

    @staticmethod
    def textSplit(text):
        """Clean *text*, split it into chunks and label each chunk.

        Before chunking, line numbers / list markers and text without
        trailing punctuation on its line are removed (see the two class-level
        patterns).

        Args:
            text: the input string.

        Returns:
            dict mapping each chunk to its voted label.
        """
        text = FileReader._LIST_NUMBER_RE.sub('', text)
        text = FileReader._UNPUNCTUATED_RE.sub('', text)
        return FileReader._labelChunks(FileReader._groupSentences(text))

    @staticmethod
    def fileSplit(fle):
        """Split *fle* into chunks and label each chunk, without any cleaning.

        Args:
            fle: the text to process; it is split with ``fle.split(".")``,
                so it must be a string, not an open file object.

        Returns:
            dict mapping each chunk to its voted label.
        """
        return FileReader._labelChunks(FileReader._groupSentences(fle))

    @staticmethod
    def modifyText(data=None, labels=None):
        """Write the chunks to ``LabeledData.txt`` with label separators.

        Every chunk whose label is not the string ``"None"`` is preceded by
        a dashed header line containing the label; the chunk text itself is
        always written.

        Args:
            data: sequence of chunk strings (defaults to empty).
            labels: sequence of label strings, parallel to *data*.

        Returns:
            The (already closed) file object that was written.

        Raises:
            IndexError: if *labels* is shorter than *data*.
        """
        data = [] if data is None else data
        labels = [] if labels is None else labels
        # ``with`` closes the file even when a write or lookup fails.
        with open("LabeledData.txt", "w") as modFile:
            for i, chunk in enumerate(data):
                label = labels[i]
                if label != "None":
                    modFile.write("-----------------" + label + "--------------------\n")
                modFile.write(chunk)
        return modFile