# EnsembleTrainingLoop.py
import argparse
import math
import os
import re
import string

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import torch
import torch.nn as nn
import torch.optim as optim
import torchtext
from torchtext import data
import spacy
import en_core_web_sm

from EnsembleModel import ComboCNN
X_test = np.load('test_X.npy', allow_pickle=True)
X_train = np.load('train_X.npy', allow_pickle=True)
X_valid = np.load('valid_X.npy', allow_pickle=True)
y_test = np.load('test_y.npy', allow_pickle=True)
y_train = np.load('train_y.npy', allow_pickle=True)
y_valid = np.load('valid_y.npy', allow_pickle=True)
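# Note: these pre-tokenized .npy splits are loaded here but never referenced
# again in this script; the torchtext pipeline below rebuilds the train/valid/
# test splits from CSV instead.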
# The corpus is plain text that has been stripped of punctuation and other
# non-alphanumeric characters, and is all lower case
corpus = pd.read_csv(r'C:\Users\Pranav\Desktop\Reddit Classifier Project Data Processing\data\word_embedding\processed_combined_text.csv')
# Now, we need to add in the label for each sentence, which is the file it came from
# Each of the 20 source files contributes 5000 sentences, in order, so the
# labels are simply 0 repeated 5000 times, then 1 repeated 5000 times, and so on
total_list = [label for label in range(20) for _ in range(5000)]
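# Sanity check (not in the original): 20 classes * 5000 sentences each
assert len(total_list) == 20 * 5000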
df = pd.DataFrame(total_list, columns=['label'])
processed_with_label = pd.concat([corpus, df], axis=1)
processed_with_label.to_csv('processed_corpus_labelled.csv')
# Drop the redundant indexing column
processed_with_label.drop('Unnamed: 0', axis=1, inplace=True)
# Shuffle the data around
processed_with_label = processed_with_label.sample(frac=1, random_state=42).reset_index(drop=True)
train_set = processed_with_label[0:70000]
valid_set = processed_with_label[70000:85000]
test_set = processed_with_label[85000:100000]
# Save the splits where the TabularDataset loader below expects them ('./data/').
# index=False keeps pandas from writing an extra index column, which would
# otherwise misalign the two-column field list used by torchtext.
train_set.to_csv('./data/my_train.csv', index=False)
valid_set.to_csv('./data/my_valid.csv', index=False)
test_set.to_csv('./data/my_test.csv', index=False)
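# Sanity check on the split sizes (a small sketch, not in the original):
# 70000 train / 15000 validation / 15000 test out of 100000 total rows
assert len(train_set) == 70000 and len(valid_set) == 15000 and len(test_set) == 15000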
def main(args):
    TEXT = data.Field(sequential=True, lower=True, tokenize='spacy', include_lengths=True)
    LABELS = data.Field(sequential=False, use_vocab=False)
    # 3.2.2
    train_data, val_data, test_data = data.TabularDataset.splits(
        path='./data/', train='my_train.csv',
        validation='my_valid.csv', test='my_test.csv', format='csv',
        skip_header=True, fields=[('text', TEXT), ('label', LABELS)])
    # 3.2.3
    train_iter, val_iter, test_iter = data.BucketIterator.splits(
        (train_data, val_data, test_data),
        batch_sizes=(args.batch_size, args.batch_size, args.batch_size),
        sort_key=lambda x: len(x.text), device=None, sort_within_batch=True, repeat=False)
    # 3.2.4
    TEXT.build_vocab(train_data, val_data, test_data)
    # 4.1
    TEXT.vocab.load_vectors(torchtext.vocab.GloVe(name='6B', dim=100))
    vocab = TEXT.vocab
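    # The GloVe 6B/100d vectors now back the vocabulary; tokens that fall
    # outside GloVe's vocabulary are initialized to zero vectors, which is
    # legacy torchtext's default unk_init behavior.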
    # Initialize the model, optimizer, loss function, and some auxiliary
    # arrays for plotting accuracy and loss
    choose_model = args.model
    if choose_model == 'combocnn':
        model = ComboCNN(args.emb_dim, TEXT.vocab, args.num_filt, [2, 3])
    else:
        # Only the ensemble CNN is imported in this script; fail loudly rather
        # than hitting a NameError on an undefined model later
        raise ValueError("Unsupported model type: " + choose_model)
    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    loss_fnc = nn.CrossEntropyLoss(reduction='mean')
    loss_array = [0.0] * args.epochs
    acc_array = [0.0] * args.epochs
    valid_loss = [0.0] * args.epochs
    valid_acc = [0.0] * args.epochs
    epoch_array = list(range(args.epochs))
    def accuracy(prediction, label):
        # Measure accuracy by comparing the argmax of each output vector with
        # the argmax of the corresponding one-hot label. .clone().detach()
        # copies the values out of the autograd graph so we don't disturb it,
        # and .numpy() makes the indexing easy.
        num_correct = 0
        pred = prediction.clone().detach().numpy()
        lab = label.clone().detach().numpy()
        for i in range(len(pred)):
            # Each row is a length-20 vector; the index of its max element is
            # the most strongly predicted class. If it matches the index of
            # the label's hot element, the prediction is correct.
            if np.argmax(pred[i]) == np.argmax(lab[i]):
                num_correct += 1
        return num_correct
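    # A vectorized equivalent (a sketch, not part of the original) would be:
    #     num_correct = int((pred.argmax(axis=1) == lab.argmax(axis=1)).sum())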
    def turn1hot(tensor_array):
        # One-hot encode the integer labels of a batch
        nparr = tensor_array.clone().detach().numpy()  # copy out of the autograd graph
        arr = nparr.tolist()  # turn the array into a plain list for easy manipulation
        for i in range(len(arr)):
            val_at_index = nparr[i]
            zero_vec = [0] * 20                # one slot per class
            zero_vec[int(val_at_index)] += 1   # set the correct class's slot to 1
            arr[i] = zero_vec
        return np.asarray(arr)  # return the one-hot encoded batch as an array
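    # Equivalently (assuming integer labels in [0, 20)), NumPy can build the
    # same one-hot matrix in one line: np.eye(20)[nparr.astype(int)]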
    # Training loop over the train and validation sets
    for i in range(args.epochs):
        loss_train = 0.0
        correct_train = 0
        loss_valid = 0.0
        correct_valid = 0
        model.train()
        for batch_index, batch_data in enumerate(train_iter):
            optimizer.zero_grad()  # zero gradients at the very start of each batch
            input_data, input_data_length = batch_data.text
            labels = batch_data.label
            onehotlabel = turn1hot(labels)
            inputlabels = torch.from_numpy(onehotlabel).long()
            if args.model == 'rnn':
                predictions = model(input_data, input_data_length)  # forward pass
            else:
                predictions = model(input_data)
            loss = loss_fnc(predictions, labels)  # compute the batch loss
            loss_train += float(loss)  # float() detaches the value so graphs aren't retained
            correct_train += accuracy(predictions, inputlabels)  # running count of correct predictions
            loss.backward()
            optimizer.step()
        model.eval()
        with torch.no_grad():  # no gradients needed during evaluation
            for batch_index, batch_data in enumerate(val_iter):
                input_data, input_data_length = batch_data.text
                labels = batch_data.label
                onehotlabel = turn1hot(labels)
                inputlabels = torch.from_numpy(onehotlabel).long()
                if args.model == 'rnn':
                    predictions = model(input_data, input_data_length)  # forward pass
                else:
                    predictions = model(input_data)
                loss = loss_fnc(predictions, labels)
                loss_valid += float(loss)
                correct_valid += accuracy(predictions, inputlabels)
        # loss_train accumulates the mean loss of each batch, so dividing by
        # the number of batches (ceil(len(train_data) / batch_size)) gives the
        # mean loss for the epoch; the same logic applies to the validation set
        loss_array[i] = float(loss_train / math.ceil(len(train_data) / args.batch_size))
        acc_array[i] = float(correct_train / len(train_data))
        valid_loss[i] = float(loss_valid / math.ceil(len(val_data) / args.batch_size))
        valid_acc[i] = float(correct_valid / len(val_data))
"""PLOTTING DATA"""
font = {'size': 10}
plt.figure(1) # Figure 1 is for training and validation loss + accuracy
plt.subplot(211)
plt.plot(epoch_array, loss_array, 'r', epoch_array, valid_loss, 'g')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['training loss', 'validation loss'])
plt.subplot(212)
plt.plot(epoch_array, acc_array, 'r', epoch_array, valid_acc, 'g')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['training accuracy', 'validation accuracy'])
plt.show()
print("validation acc: " + str(valid_acc[-1]) + " training acc: " + str(acc_array[-1]))
######
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--batch-size', type=int, default=64)
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--epochs', type=int, default=25)
    parser.add_argument('--model', type=str, default='combocnn',
                        help="Model type: baseline, rnn, cnn, combocnn (Default: combocnn)")
    parser.add_argument('--emb-dim', type=int, default=100)
    parser.add_argument('--rnn-hidden-dim', type=int, default=100)
    parser.add_argument('--num-filt', type=int, default=50)
    args = parser.parse_args()
    main(args)
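# Example invocation, using the flags defined above:
#     python EnsembleTrainingLoop.py --model combocnn --epochs 25 --batch-size 64 --lr 0.001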