From ff5fabcbe64c58ec02047b2c1cecb97f651afca8 Mon Sep 17 00:00:00 2001 From: Yuriy Vaskin Date: Thu, 15 Dec 2016 17:20:33 +0300 Subject: [PATCH 1/7] - fix requirements - fix paths - add how to run instruction --- README.md | 22 ++++++++++++++++++---- requirements.txt | 7 +++++++ seq2seq/translate.py | 8 +++++--- 3 files changed, 30 insertions(+), 7 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index b9b7069..6facaca 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,24 @@ Set of (Tensorflow) implementations which generate comments from code. Thesis for the B.sc. AI. # How to execute: -## Seq2seq: -- Enter tf: source ~/tensorflow/bin/activate -- Execute code: python translate.py --size=256 --num_layers=3 --steps_per_checkpoint=50 --bleu -- Or interactive mode (only works when the model has been trained): python translate.py --size=350 --num_layers=3 --step_per_checkpoint=50 --decode +## Seq2seq: +- create/activate virtualenv +```bash +source ~/tensorflow/bin/activate +``` +- install requirements +```bash +pip install -r requirements.txt +``` +- Execute code: +```bash +python translate.py --size=256 --num_layers=3 --steps_per_checkpoint=50 --bleu +``` + +- Or interactive mode (only works when the model has been trained): +```bash +python translate.py --size=350 --num_layers=3 --step_per_checkpoint=50 --decode +``` ### Options - add --evaluate to see the score with a trained model on the development file (default False) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..22db177 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +funcsigs==1.0.2 +mock==2.0.0 +numpy==1.11.2 +pbr==1.10.0 +protobuf==3.1.0 +six==1.10.0 +tensorflow==0.12.0rc1 diff --git a/seq2seq/translate.py b/seq2seq/translate.py index ee42e6f..0cde51a 100644 --- a/seq2seq/translate.py +++ b/seq2seq/translate.py @@ -52,7 +52,9 @@ from evaluation.meteor.meteor import Meteor import warnings -warnings.filterwarnings("ignore", 
category=DeprecationWarning) +warnings.filterwarnings("ignore", category=DeprecationWarning) + +DATA_PATH = os.path.dirname(os.path.abspath(__file__)) tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.") @@ -66,8 +68,8 @@ tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.") tf.app.flags.DEFINE_integer("code_vocab_size", 100000, "Program vocabulary size.") tf.app.flags.DEFINE_integer("en_vocab_size", 100000, "English vocabulary size.") -tf.app.flags.DEFINE_string("data_dir", "/home/tjalling/Desktop/thesis/tensorflow/implementations/seq2seq/data/", "Data directory") -tf.app.flags.DEFINE_string("train_dir", "/home/tjalling/Desktop/thesis/tensorflow/implementations/seq2seq/train/", "Training directory.") +tf.app.flags.DEFINE_string("data_dir", os.path.join(DATA_PATH, "data/"), "Data directory") +tf.app.flags.DEFINE_string("train_dir", os.path.join(DATA_PATH, "data/"), "Training directory.") tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.") tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.") tf.app.flags.DEFINE_string("translated_dev_code", "dev/translated.en", "The dev file with Code translated into English.") From c6239cd6087c558623aae762acea89fbc96219cf Mon Sep 17 00:00:00 2001 From: Yuriy Vaskin Date: Fri, 16 Dec 2016 20:58:12 +0300 Subject: [PATCH 2/7] Add logging --- seq2seq/translate.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/seq2seq/translate.py b/seq2seq/translate.py index 0cde51a..a2675b4 100644 --- a/seq2seq/translate.py +++ b/seq2seq/translate.py @@ -52,6 +52,11 @@ from evaluation.meteor.meteor import Meteor import warnings +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', + datefmt='%m-%d %H:%M',) +logger = logging.getLogger(__file__) + warnings.filterwarnings("ignore", 
category=DeprecationWarning) DATA_PATH = os.path.dirname(os.path.abspath(__file__)) @@ -181,7 +186,7 @@ def translate_file(source_path=dev_code_file, target_path=translated_dev_code): # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) - + # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] @@ -415,14 +420,16 @@ def self_test(): def main(_): + logger.info("start") if FLAGS.self_test: self_test() elif FLAGS.decode: decode() - elif FLAGS.evaluate: + elif FLAGS.evaluate: evaluate() else: train() + logger.info("stop") if __name__ == "__main__": tf.app.run() From 0d8cc22c8cfbe7216c3447b9e600c1021f3942ac Mon Sep 17 00:00:00 2001 From: Yuriy Vaskin Date: Sat, 24 Dec 2016 18:49:27 +0300 Subject: [PATCH 3/7] Null and out of range checks Deprecated methods Readme update --- README.md | 1 + dataset_generation/getDocStrings.py | 12 ++++++------ ptr/main.py | 2 +- seq2seq/evaluation/bleu/multi-bleu.perl | 9 ++++++--- seq2seq/seq2seq_model.py | 2 +- seq2seq/translate.py | 8 ++++---- 6 files changed, 19 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 6facaca..cc747fe 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,7 @@ source ~/tensorflow/bin/activate ``` - install requirements ```bash +pip install --upgrade pip pip install -r requirements.txt ``` - Execute code: diff --git a/dataset_generation/getDocStrings.py b/dataset_generation/getDocStrings.py index c08c593..34e50dc 100644 --- a/dataset_generation/getDocStrings.py +++ b/dataset_generation/getDocStrings.py @@ -1,6 +1,6 @@ from os.path import basename, splitext import sys -import util +import util commentList = ["# ", "#!"] @@ -83,7 +83,7 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket): # loop through all the lines in the source, get the comment # and the corresponding code with open(commentFile, 
"a") as commentF: - with open(codeFile, "a") as codeF: + with open(codeFile, "a") as codeF: for i in xrange(startLine, len(source)): # print "i in comment loop is:" , i globalI = i @@ -117,10 +117,10 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket): # first if we are at another indentation level, we found an deeper # docstring, thus exit - if currIndent != indentation or not inComment: + if currIndent != indentation or not inComment: # print ">>>It is a new comment, return error" return(i,False) - + # otherwise end the comment else: # print ">>>Closed comment" @@ -153,7 +153,7 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket): commentF.write(util.cleanComment(comment) + "\n!@#$%!@#$%!@#$%!@#$%!@#$%") return(i, True) - + # if we are still here, add the current line to the code code.append(line.strip()) @@ -200,6 +200,6 @@ def isDef(source, startLine, i): if __name__ == '__main__': import sys - + with open(sys.argv[1]) as fp: make_pairs(fp) \ No newline at end of file diff --git a/ptr/main.py b/ptr/main.py index a8210f6..5494aea 100644 --- a/ptr/main.py +++ b/ptr/main.py @@ -133,7 +133,7 @@ def step(self): with tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as sess: merged = tf.merge_all_summaries() writer = tf.train.SummaryWriter("/tmp/pointer_logs", sess.graph) - init = tf.initialize_all_variables() + init = tf.global_variables_initializer() sess.run(init) for i in range(10000): encoder_input_data, decoder_input_data, targets_data = dataset.next_batch( diff --git a/seq2seq/evaluation/bleu/multi-bleu.perl b/seq2seq/evaluation/bleu/multi-bleu.perl index 8a1871a..778efba 100644 --- a/seq2seq/evaluation/bleu/multi-bleu.perl +++ b/seq2seq/evaluation/bleu/multi-bleu.perl @@ -149,9 +149,12 @@ sub add_to_ref { printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; exit(1); } - -if ($length_translation<$length_reference) { - $brevity_penalty = exp(1-$length_reference/$length_translation); 
+if($length_translation==0){ + $brevity_penalty = exp(0); +}else{ + if ($length_translation<$length_reference) { + $brevity_penalty = exp(1-$length_reference/$length_translation); + } } $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + my_log( $bleu[2] ) + diff --git a/seq2seq/seq2seq_model.py b/seq2seq/seq2seq_model.py index 12a400e..1809110 100644 --- a/seq2seq/seq2seq_model.py +++ b/seq2seq/seq2seq_model.py @@ -175,7 +175,7 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): self.updates.append(opt.apply_gradients( zip(clipped_gradients, params), global_step=self.global_step)) - self.saver = tf.train.Saver(tf.all_variables()) + self.saver = tf.train.Saver(tf.global_variables()) def step(self, session, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only): diff --git a/seq2seq/translate.py b/seq2seq/translate.py index a2675b4..64d4812 100644 --- a/seq2seq/translate.py +++ b/seq2seq/translate.py @@ -196,7 +196,7 @@ def translate_file(source_path=dev_code_file, target_path=translated_dev_code): outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Write translated sentence to translation file. - translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) + "\n") + translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs if output in rev_en_vocab]) + "\n") # print ("> %s" % sentence) # print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs])) @@ -224,7 +224,7 @@ def create_model(session, forward_only): model.saver.restore(session, ckpt.model_checkpoint_path) else: print("Created model with fresh parameters.") - session.run(tf.initialize_all_variables()) + session.run(tf.global_variables_initializer()) return model @@ -393,7 +393,7 @@ def decode(): if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. 
- print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs])) + print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs if output in rev_en_vocab])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline() @@ -406,7 +406,7 @@ def self_test(): # Create model with vocabularies of 10, 2 small buckets, 2 layers of 32. model = seq2seq_model.Seq2SeqModel(10, 10, [(3, 3), (6, 6)], 32, 2, 5.0, 32, 0.3, 0.99, num_samples=8) - sess.run(tf.initialize_all_variables()) + sess.run(tf.global_variables_initializer()) # Fake data set for both the (3, 3) and (6, 6) bucket. data_set = ([([1, 1], [2, 2]), ([3, 3], [4]), ([5], [6])], From 79e4d952974be791769b68078030a657b72c0603 Mon Sep 17 00:00:00 2001 From: Yuriy Vaskin Date: Sat, 24 Dec 2016 19:14:42 +0300 Subject: [PATCH 4/7] Mode logging and running instructions --- .idea/misc.xml | 4 ++++ README.md | 16 ++++++++++------ seq2seq/translate.py | 8 ++++++-- 3 files changed, 20 insertions(+), 8 deletions(-) create mode 100644 .idea/misc.xml diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..b30885f --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/README.md b/README.md index cc747fe..e20f674 100644 --- a/README.md +++ b/README.md @@ -3,23 +3,27 @@ Set of (Tensorflow) implementations which generate comments from code. Thesis fo # How to execute: ## Seq2seq: -- create/activate virtualenv +- Create/activate virtualenv ```bash source ~/tensorflow/bin/activate ``` -- install requirements +- Install requirements ```bash pip install --upgrade pip pip install -r requirements.txt ``` -- Execute code: +- Run training. 
Note, the training process has an infinite loop in it: ```bash -python translate.py --size=256 --num_layers=3 --steps_per_checkpoint=50 --bleu +python translate.py --num_layers=3 +``` +- Run evaluation (only works when the model has been trained): +```bash +python translate.py --num_layers=3 --evaluate ``` -- Or interactive mode (only works when the model has been trained): +- Run interactive translation mode (only works when the model has been trained): ```bash -python translate.py --size=350 --num_layers=3 --step_per_checkpoint=50 --decode +python translate.py --num_layers=3 --steps_per_checkpoint=50 --decode ``` ### Options diff --git a/seq2seq/translate.py b/seq2seq/translate.py index 64d4812..2c4c840 100644 --- a/seq2seq/translate.py +++ b/seq2seq/translate.py @@ -420,16 +420,20 @@ def self_test(): def main(_): - logger.info("start") + logger.info("Start") if FLAGS.self_test: + logger.info("Self testing") self_test() elif FLAGS.decode: + logger.info("Decoding") decode() elif FLAGS.evaluate: + logger.info("Evaluating") evaluate() else: + logger.info("Training") train() - logger.info("stop") + logger.info("Stop") if __name__ == "__main__": tf.app.run() From c9ff532b62de8a04d92f73ca55d036c7dd88a564 Mon Sep 17 00:00:00 2001 From: Yuriy Vaskin Date: Sat, 24 Dec 2016 19:24:59 +0300 Subject: [PATCH 5/7] Refactored flags for clarity --- seq2seq/translate.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/seq2seq/translate.py b/seq2seq/translate.py index 2c4c840..b3174eb 100644 --- a/seq2seq/translate.py +++ b/seq2seq/translate.py @@ -61,6 +61,21 @@ DATA_PATH = os.path.dirname(os.path.abspath(__file__)) +tf.app.flags.DEFINE_boolean("decode", False, + "Set to True for interactive decoding.") +tf.app.flags.DEFINE_boolean("evaluate", False, + "Run evaluation metrics on the output.") +tf.app.flags.DEFINE_boolean("self_test", False, + "Run a self-test if this is set to True.") + +tf.app.flags.DEFINE_integer("num_layers", 1, 
"Number of layers in the model.") +tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200, + "How many training steps to do per checkpoint.") + +tf.app.flags.DEFINE_string("data_dir", os.path.join(DATA_PATH, "data/"), "Data directory") +tf.app.flags.DEFINE_string("train_dir", os.path.join(DATA_PATH, "data/"), "Training directory.") +tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.") +tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.") tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.") tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99, @@ -70,24 +85,12 @@ tf.app.flags.DEFINE_integer("batch_size", 64, "Batch size to use during training.") tf.app.flags.DEFINE_integer("size", 256, "Size of each model layer.") -tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.") tf.app.flags.DEFINE_integer("code_vocab_size", 100000, "Program vocabulary size.") tf.app.flags.DEFINE_integer("en_vocab_size", 100000, "English vocabulary size.") -tf.app.flags.DEFINE_string("data_dir", os.path.join(DATA_PATH, "data/"), "Data directory") -tf.app.flags.DEFINE_string("train_dir", os.path.join(DATA_PATH, "data/"), "Training directory.") -tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.") -tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.") tf.app.flags.DEFINE_string("translated_dev_code", "dev/translated.en", "The dev file with Code translated into English.") tf.app.flags.DEFINE_integer("max_train_data_size", 0, "Limit on the size of training data (0: no limit).") -tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200, - "How many training steps to do per checkpoint.") -tf.app.flags.DEFINE_boolean("decode", False, - "Set to True for interactive decoding.") -tf.app.flags.DEFINE_boolean("self_test", False, - 
"Run a self-test if this is set to True.") -tf.app.flags.DEFINE_boolean("evaluate", False, - "Run evaluation metrics on the output.") + FLAGS = tf.app.flags.FLAGS data_dir = FLAGS.data_dir + FLAGS.dataset + "/" From adec52f301cf17fa753d608d7095b88be360339d Mon Sep 17 00:00:00 2001 From: Yuriy Vaskin Date: Sun, 25 Dec 2016 13:22:45 +0300 Subject: [PATCH 6/7] Restore path check --- seq2seq/translate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/seq2seq/translate.py b/seq2seq/translate.py index b3174eb..7fd53d1 100644 --- a/seq2seq/translate.py +++ b/seq2seq/translate.py @@ -222,7 +222,7 @@ def create_model(session, forward_only): FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, forward_only=forward_only) ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): + if ckpt and ckpt.model_checkpoint_path: print("Reading model parameters from %s" % ckpt.model_checkpoint_path) model.saver.restore(session, ckpt.model_checkpoint_path) else: From e5497c5b7b6f362e0ff8205dd1eae5c4ed3ddcf4 Mon Sep 17 00:00:00 2001 From: Yuriy Vaskin Date: Sun, 25 Dec 2016 13:56:07 +0300 Subject: [PATCH 7/7] Cleanup --- .idea/misc.xml | 4 ---- 1 file changed, 4 deletions(-) delete mode 100644 .idea/misc.xml diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index b30885f..0000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - \ No newline at end of file