-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathtrain_word2vec.py
More file actions
38 lines (23 loc) · 785 Bytes
/
train_word2vec.py
File metadata and controls
38 lines (23 loc) · 785 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
from gensim.models.word2vec import Word2Vec
from ml_utils import tokenize
import json
import sys
import logging
# Configure the root logger to emit INFO-and-above records formatted as
# "<timestamp> : <level> : <message>".
# NOTE(review): presumably enabled so gensim's training progress is visible
# on the console -- confirm.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def preprocess(filename):
    """Load articles from a JSON file and tokenize each one.

    The file must contain a JSON array (or other iterable) of objects that
    each have an ``'article'`` key.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        A list with one ``tokenize(article)`` result per record, in file
        order. (``tokenize`` comes from ``ml_utils``; presumably it returns
        a list of tokens -- verify against that module.)

    Raises:
        KeyError: If a record has no ``'article'`` key.
    """
    # JSON text is UTF-8 by specification; decode explicitly rather than
    # relying on the platform's locale-dependent default encoding.
    with open(filename, encoding='utf-8') as f:
        records = json.load(f)
    # The file is closed before tokenizing; a single pass avoids building an
    # intermediate list of raw article strings.
    return [tokenize(record['article']) for record in records]
def main(filename):
    """Train Word2Vec on the articles in *filename* and dump the vectors.

    The sentences come from ``preprocess(filename)``. The resulting mapping
    of each vocabulary word to its vector (as a plain list) is written as
    JSON to ``filename + '.w2v.json'``.

    Args:
        filename: Path of the input JSON file; also the prefix of the
            output path.
    """
    target_path = filename + '.w2v.json'

    sentences = preprocess(filename)
    # NOTE(review): `size`, `iter`, `index2word` and `syn0` look like the
    # pre-4.0 gensim spellings -- confirm the pinned gensim version before
    # upgrading.
    model = Word2Vec(sentences, size=200, iter=100, workers=4)

    keyed_vectors = model.wv
    words = keyed_vectors.index2word
    # .tolist() turns the vector matrix into nested Python lists so that the
    # json module can serialize it.
    vectors = keyed_vectors.syn0.tolist()
    embedding_by_word = {word: vector for word, vector in zip(words, vectors)}

    with open(target_path, 'w') as out_file:
        json.dump(embedding_by_word, out_file)
if __name__ == '__main__':
    # Exactly one command-line argument is expected: the input JSON path.
    if len(sys.argv) != 2:
        # sys.exit() with a string writes it to stderr and exits with
        # status 1, so misuse is visible to the calling shell or script
        # instead of printing a placeholder and reporting success.
        sys.exit("usage: %s <articles.json>" % sys.argv[0])
    main(sys.argv[1])