-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNotes2Vect.py
More file actions
123 lines (74 loc) · 2.95 KB
/
Notes2Vect.py
File metadata and controls
123 lines (74 loc) · 2.95 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# coding: utf-8
# # Notes 2 Vect
#
# The goal of this Notebook is to figure out if we can create an embedding for each specific note.
# In[54]:
#Preamble
import sys
import numpy as np
import gensim
import logging as log
import csv
import pickle as pkl
from nltk.tokenize import word_tokenize
# Logging config: every record is printed as "<LEVEL> : <message>", INFO and above.
log.basicConfig(level=log.INFO, format='%(levelname)s : %(message)s')
# Echo the first command-line argument, which is later used as the vector size.
log.info("Running with vec_size={}".format(sys.argv[1]))
# Read in the .csv notes data:
# In[35]:
def load_notes():
    """Load the note texts from ``data/disch_full.csv`` into the global ``notes``.

    The first row is treated as a header and skipped. For every remaining
    row the fourth column (index 3) is kept as the note text.

    Returns:
        The list of note strings; the same list is also stored in the
        module-level ``notes`` variable.

    Raises:
        IndexError: If a data row has fewer than four columns.
    """
    global notes
    file_name = 'data/disch_full.csv'
    # newline='' hands line-ending handling to the csv module, so newlines
    # embedded inside quoted fields are kept exactly as written.
    with open(file_name, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the header; tolerate a completely empty file
        notes = [row[3] for row in csv_reader]
    log.info("Done loading notes with length {}".format(len(notes)))
    return notes
# In[56]:
def load_pickle_file(name):
    """Restore ``data/<name>.pkl`` into a module-level variable called ``name``.

    The unpickled object is wrapped in ``np.array`` before being stored in
    this module's globals.

    Args:
        name: Base name of the pickle file (without directory or ``.pkl``
            suffix); also the name of the global that receives the data.
    """
    # NOTE: unpickling can execute arbitrary code; only use on trusted files.
    pickle_path = "data/{}.pkl".format(name)
    with open(pickle_path, "rb") as source:
        payload = pkl.load(source)
    globals()[name] = np.array(payload)
# In[57]:
#load_pickle_file("tagged_notes")
# In[36]:
# Fill the module-level `notes` list; the tagged-document step below reads it.
load_notes()
# ## Vectors
#
# Now that all the notes are in the `notes` variable, we can begin training for our vectors. Reference material: [Article 1](https://medium.com/@mishra.thedeepak/doc2vec-simple-implementation-example-df2afbbfbad5) and [article 2](https://radimrehurek.com/gensim/models/doc2vec.html#gensim.models.doc2vec.TaggedDocument).
# In[41]:
# One TaggedDocument per note: the words are the lower-cased, tokenized note
# text, and the single tag is the note's position in `notes` as a string.
tagged_notes = [
    gensim.models.doc2vec.TaggedDocument(
        words=word_tokenize(note_text.lower()),
        tags=[str(note_index)],
    )
    for note_index, note_text in enumerate(notes)
]
log.info("Loaded in tagged_notes")
# In[52]:
def save_pickle_file(name, data):
    """Pickle ``data`` to ``data/<name>.pkl``, overwriting any existing file.

    Args:
        name: Base name of the target file (without directory or ``.pkl``
            suffix).
        data: Any picklable object.
    """
    target_path = "data/{}.pkl".format(name)
    with open(target_path, "wb") as sink:
        pkl.dump(data, sink)
# In[55]:
#save_pickle_file("tagged_notes", tagged_notes)
# In[50]:
def make_model(max_epochs=5, vec_size=100, alpha=0.025):
    """Train a Doc2Vec model on ``tagged_notes`` and save it under ``models/``.

    The model is stored in the module-level ``model`` variable and written to
    ``models/<vec_size>_<max_epochs>_notes2vect.w2v``.

    Args:
        max_epochs: Number of outer training passes. Each pass calls
            ``model.train`` once (for ``model.epochs`` epochs) and then
            lowers the learning rate by 0.0002.
        vec_size: Passed to Doc2Vec as ``vector_size``.
        alpha: Initial learning rate passed to Doc2Vec.
    """
    import os  # local import: only needed to create the output directory

    global model
    model = gensim.models.doc2vec.Doc2Vec(
        vector_size=vec_size,
        alpha=alpha,
        min_alpha=0.00025,
        min_count=1,
        dm=1,
        workers=2,
    )
    model.build_vocab(tagged_notes)

    # NOTE(review): the gensim documentation advises against calling train()
    # repeatedly with hand-managed alpha; a single train(..., epochs=N) call
    # is the recommended form. The original schedule is kept here so results
    # stay comparable with previously trained models -- confirm before changing.
    for epoch in range(max_epochs):
        log.info('iteration {0}'.format(epoch))
        model.train(
            tagged_notes,
            total_examples=model.corpus_count,
            # `model.iter` was a deprecated alias of `model.epochs` and was
            # removed in gensim 4; `epochs` is the supported attribute.
            epochs=model.epochs,
        )
        # decrease the learning rate
        model.alpha -= 0.0002
        # fix the learning rate, no decay
        model.min_alpha = model.alpha

    # Make sure the target directory exists so a long training run is not
    # lost to a missing folder at save time.
    os.makedirs("models", exist_ok=True)
    model.save("models/{}_{}_notes2vect.w2v".format(vec_size, max_epochs))
    log.info("Model Saved")
# In[51]:
# Script entry: train and save a model whose vector size is the first
# command-line argument (converted to int); other parameters use their defaults.
make_model(vec_size = int(sys.argv[1]))