diff --git a/README.md b/README.md index b9b7069..e20f674 100644 --- a/README.md +++ b/README.md @@ -2,10 +2,29 @@ Set of (Tensorflow) implementations which generate comments from code. Thesis for the B.sc. AI. # How to execute: -## Seq2seq: -- Enter tf: source ~/tensorflow/bin/activate -- Execute code: python translate.py --size=256 --num_layers=3 --steps_per_checkpoint=50 --bleu -- Or interactive mode (only works when the model has been trained): python translate.py --size=350 --num_layers=3 --step_per_checkpoint=50 --decode +## Seq2seq: +- Create/activate virtualenv +```bash +source ~/tensorflow/bin/activate +``` +- Install requirements +```bash +pip install --upgrade pip +pip install -r requirements.txt +``` +- Run training. Note: the training process has an infinite loop in it: +```bash +python translate.py --num_layers=3 +``` +- Run evaluation (only works when the model has been trained): +```bash +python translate.py --num_layers=3 --evaluate +``` + +- Run interactive translation mode (only works when the model has been trained): +```bash +python translate.py --num_layers=3 --steps_per_checkpoint=50 --decode +``` ### Options - add --evaluate to see the score with a trained model on the development file (default False) diff --git a/dataset_generation/getDocStrings.py b/dataset_generation/getDocStrings.py index c08c593..34e50dc 100644 --- a/dataset_generation/getDocStrings.py +++ b/dataset_generation/getDocStrings.py @@ -1,6 +1,6 @@ from os.path import basename, splitext import sys -import util +import util commentList = ["# ", "#!"] @@ -83,7 +83,7 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket): # loop through all the lines in the source, get the comment # and the corresponding code with open(commentFile, "a") as commentF: - with open(codeFile, "a") as codeF: + with open(codeFile, "a") as codeF: for i in xrange(startLine, len(source)): # print "i in comment loop is:" , i globalI = i @@ -117,10 +117,10 @@ def filterDocString(source,
startLine, codeFile, commentFile, maxBucket): # first if we are at another indentation level, we found an deeper # docstring, thus exit - if currIndent != indentation or not inComment: + if currIndent != indentation or not inComment: # print ">>>It is a new comment, return error" return(i,False) - + # otherwise end the comment else: # print ">>>Closed comment" @@ -153,7 +153,7 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket): commentF.write(util.cleanComment(comment) + "\n!@#$%!@#$%!@#$%!@#$%!@#$%") return(i, True) - + # if we are still here, add the current line to the code code.append(line.strip()) @@ -200,6 +200,6 @@ def isDef(source, startLine, i): if __name__ == '__main__': import sys - + with open(sys.argv[1]) as fp: make_pairs(fp) \ No newline at end of file diff --git a/ptr/main.py b/ptr/main.py index a8210f6..5494aea 100644 --- a/ptr/main.py +++ b/ptr/main.py @@ -133,7 +133,7 @@ def step(self): with tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as sess: merged = tf.merge_all_summaries() writer = tf.train.SummaryWriter("/tmp/pointer_logs", sess.graph) - init = tf.initialize_all_variables() + init = tf.global_variables_initializer() sess.run(init) for i in range(10000): encoder_input_data, decoder_input_data, targets_data = dataset.next_batch( diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..22db177 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,7 @@ +funcsigs==1.0.2 +mock==2.0.0 +numpy==1.11.2 +pbr==1.10.0 +protobuf==3.1.0 +six==1.10.0 +tensorflow==0.12.0rc1 diff --git a/seq2seq/evaluation/bleu/multi-bleu.perl b/seq2seq/evaluation/bleu/multi-bleu.perl index 8a1871a..778efba 100644 --- a/seq2seq/evaluation/bleu/multi-bleu.perl +++ b/seq2seq/evaluation/bleu/multi-bleu.perl @@ -149,9 +149,12 @@ sub add_to_ref { printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n"; exit(1); } - -if ($length_translation<$length_reference) { - $brevity_penalty = 
exp(1-$length_reference/$length_translation); +if($length_translation==0){ + $brevity_penalty = exp(0); +}else{ + if ($length_translation<$length_reference) { + $brevity_penalty = exp(1-$length_reference/$length_translation); + } } $bleu = $brevity_penalty * exp((my_log( $bleu[1] ) + my_log( $bleu[2] ) + diff --git a/seq2seq/seq2seq_model.py b/seq2seq/seq2seq_model.py index 12a400e..1809110 100644 --- a/seq2seq/seq2seq_model.py +++ b/seq2seq/seq2seq_model.py @@ -175,7 +175,7 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode): self.updates.append(opt.apply_gradients( zip(clipped_gradients, params), global_step=self.global_step)) - self.saver = tf.train.Saver(tf.all_variables()) + self.saver = tf.train.Saver(tf.global_variables()) def step(self, session, encoder_inputs, decoder_inputs, target_weights, bucket_id, forward_only): diff --git a/seq2seq/translate.py b/seq2seq/translate.py index ee42e6f..7fd53d1 100644 --- a/seq2seq/translate.py +++ b/seq2seq/translate.py @@ -52,8 +52,30 @@ from evaluation.meteor.meteor import Meteor import warnings -warnings.filterwarnings("ignore", category=DeprecationWarning) +import logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', + datefmt='%m-%d %H:%M',) +logger = logging.getLogger(__file__) +warnings.filterwarnings("ignore", category=DeprecationWarning) + +DATA_PATH = os.path.dirname(os.path.abspath(__file__)) + +tf.app.flags.DEFINE_boolean("decode", False, + "Set to True for interactive decoding.") +tf.app.flags.DEFINE_boolean("evaluate", False, + "Run evaluation metrics on the output.") +tf.app.flags.DEFINE_boolean("self_test", False, + "Run a self-test if this is set to True.") + +tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.") +tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200, + "How many training steps to do per checkpoint.") + +tf.app.flags.DEFINE_string("data_dir", os.path.join(DATA_PATH, "data/"), "Data directory") 
+tf.app.flags.DEFINE_string("train_dir", os.path.join(DATA_PATH, "data/"), "Training directory.") +tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.") +tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.") tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.") tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99, @@ -63,24 +85,12 @@ tf.app.flags.DEFINE_integer("batch_size", 64, "Batch size to use during training.") tf.app.flags.DEFINE_integer("size", 256, "Size of each model layer.") -tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.") tf.app.flags.DEFINE_integer("code_vocab_size", 100000, "Program vocabulary size.") tf.app.flags.DEFINE_integer("en_vocab_size", 100000, "English vocabulary size.") -tf.app.flags.DEFINE_string("data_dir", "/home/tjalling/Desktop/thesis/tensorflow/implementations/seq2seq/data/", "Data directory") -tf.app.flags.DEFINE_string("train_dir", "/home/tjalling/Desktop/thesis/tensorflow/implementations/seq2seq/train/", "Training directory.") -tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.") -tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.") tf.app.flags.DEFINE_string("translated_dev_code", "dev/translated.en", "The dev file with Code translated into English.") tf.app.flags.DEFINE_integer("max_train_data_size", 0, "Limit on the size of training data (0: no limit).") -tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200, - "How many training steps to do per checkpoint.") -tf.app.flags.DEFINE_boolean("decode", False, - "Set to True for interactive decoding.") -tf.app.flags.DEFINE_boolean("self_test", False, - "Run a self-test if this is set to True.") -tf.app.flags.DEFINE_boolean("evaluate", False, - "Run evaluation metrics on the output.") + FLAGS = 
tf.app.flags.FLAGS data_dir = FLAGS.data_dir + FLAGS.dataset + "/" @@ -179,7 +189,7 @@ def translate_file(source_path=dev_code_file, target_path=translated_dev_code): # Get output logits for the sentence. _, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs, target_weights, bucket_id, True) - + # This is a greedy decoder - outputs are just argmaxes of output_logits. outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits] @@ -189,7 +199,7 @@ def translate_file(source_path=dev_code_file, target_path=translated_dev_code): outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Write translated sentence to translation file. - translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) + "\n") + translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs if output in rev_en_vocab]) + "\n") # print ("> %s" % sentence) # print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs])) @@ -212,12 +222,12 @@ def create_model(session, forward_only): FLAGS.learning_rate, FLAGS.learning_rate_decay_factor, forward_only=forward_only) ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir) - if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path): + if ckpt and ckpt.model_checkpoint_path: print("Reading model parameters from %s" % ckpt.model_checkpoint_path) model.saver.restore(session, ckpt.model_checkpoint_path) else: print("Created model with fresh parameters.") - session.run(tf.initialize_all_variables()) + session.run(tf.global_variables_initializer()) return model @@ -386,7 +396,7 @@ def decode(): if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] # Print out French sentence corresponding to outputs. 
- print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs])) + print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs if output in rev_en_vocab])) print("> ", end="") sys.stdout.flush() sentence = sys.stdin.readline() @@ -399,7 +409,7 @@ def self_test(): # Create model with vocabularies of 10, 2 small buckets, 2 layers of 32. model = seq2seq_model.Seq2SeqModel(10, 10, [(3, 3), (6, 6)], 32, 2, 5.0, 32, 0.3, 0.99, num_samples=8) - sess.run(tf.initialize_all_variables()) + sess.run(tf.global_variables_initializer()) # Fake data set for both the (3, 3) and (6, 6) bucket. data_set = ([([1, 1], [2, 2]), ([3, 3], [4]), ([5], [6])], @@ -413,14 +423,20 @@ def self_test(): def main(_): + logger.info("Start") if FLAGS.self_test: + logger.info("Self testing") self_test() elif FLAGS.decode: + logger.info("Decoding") decode() - elif FLAGS.evaluate: + elif FLAGS.evaluate: + logger.info("Evaluating") evaluate() else: + logger.info("Training") train() + logger.info("Stop") if __name__ == "__main__": tf.app.run()