Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,29 @@
Set of (TensorFlow) implementations which generate comments from code. Bachelor's thesis for the B.Sc. in AI.

# How to execute:
## Seq2seq:
- Enter tf: source ~/tensorflow/bin/activate
- Execute code: python translate.py --size=256 --num_layers=3 --steps_per_checkpoint=50 --bleu
- Or interactive mode (only works when the model has been trained): python translate.py --size=350 --num_layers=3 --step_per_checkpoint=50 --decode
## Seq2seq:
- Create/activate virtualenv
```bash
source ~/tensorflow/bin/activate
```
- Install requirements
```bash
pip install --upgrade pip
pip install -r requirements.txt
```
- Run training. Note that the training process runs in an infinite loop:
```bash
python translate.py --num_layers=3
```
- Run evaluation (only works when the model has been trained):
```bash
python translate.py --num_layers=3 --evaluate
```

- Run interactive translation mode (only works when the model has been trained):
```bash
python translate.py --num_layers=3 --steps_per_checkpoint=50 --decode
```

### Options
- add --evaluate to see the score with a trained model on the development file (default False)
Expand Down
12 changes: 6 additions & 6 deletions dataset_generation/getDocStrings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os.path import basename, splitext
import sys
import util
import util


commentList = ["# ", "#!"]
Expand Down Expand Up @@ -83,7 +83,7 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket):
# loop through all the lines in the source, get the comment
# and the corresponding code
with open(commentFile, "a") as commentF:
with open(codeFile, "a") as codeF:
with open(codeFile, "a") as codeF:
for i in xrange(startLine, len(source)):
# print "i in comment loop is:" , i
globalI = i
Expand Down Expand Up @@ -117,10 +117,10 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket):

# first if we are at another indentation level, we found a deeper
# docstring, thus exit
if currIndent != indentation or not inComment:
if currIndent != indentation or not inComment:
# print ">>>It is a new comment, return error"
return(i,False)

# otherwise end the comment
else:
# print ">>>Closed comment"
Expand Down Expand Up @@ -153,7 +153,7 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket):
commentF.write(util.cleanComment(comment) + "\n!@#$%!@#$%!@#$%!@#$%!@#$%")

return(i, True)

# if we are still here, add the current line to the code
code.append(line.strip())

Expand Down Expand Up @@ -200,6 +200,6 @@ def isDef(source, startLine, i):

if __name__ == '__main__':
import sys

with open(sys.argv[1]) as fp:
make_pairs(fp)
2 changes: 1 addition & 1 deletion ptr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def step(self):
with tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as sess:
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter("/tmp/pointer_logs", sess.graph)
init = tf.initialize_all_variables()
init = tf.global_variables_initializer()
sess.run(init)
for i in range(10000):
encoder_input_data, decoder_input_data, targets_data = dataset.next_batch(
Expand Down
7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
funcsigs==1.0.2
mock==2.0.0
numpy==1.11.2
pbr==1.10.0
protobuf==3.1.0
six==1.10.0
tensorflow==0.12.0rc1
9 changes: 6 additions & 3 deletions seq2seq/evaluation/bleu/multi-bleu.perl
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,12 @@ sub add_to_ref {
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
exit(1);
}

if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
if($length_translation==0){
$brevity_penalty = exp(0);
}else{
if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
}
}
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
my_log( $bleu[2] ) +
Expand Down
2 changes: 1 addition & 1 deletion seq2seq/seq2seq_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
self.updates.append(opt.apply_gradients(
zip(clipped_gradients, params), global_step=self.global_step))

self.saver = tf.train.Saver(tf.all_variables())
self.saver = tf.train.Saver(tf.global_variables())

def step(self, session, encoder_inputs, decoder_inputs, target_weights,
bucket_id, forward_only):
Expand Down
58 changes: 37 additions & 21 deletions seq2seq/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,30 @@
from evaluation.meteor.meteor import Meteor

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%m-%d %H:%M',)
logger = logging.getLogger(__file__)

warnings.filterwarnings("ignore", category=DeprecationWarning)

DATA_PATH = os.path.dirname(os.path.abspath(__file__))

tf.app.flags.DEFINE_boolean("decode", False,
"Set to True for interactive decoding.")
tf.app.flags.DEFINE_boolean("evaluate", False,
"Run evaluation metrics on the output.")
tf.app.flags.DEFINE_boolean("self_test", False,
"Run a self-test if this is set to True.")

tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
"How many training steps to do per checkpoint.")

tf.app.flags.DEFINE_string("data_dir", os.path.join(DATA_PATH, "data/"), "Data directory")
tf.app.flags.DEFINE_string("train_dir", os.path.join(DATA_PATH, "data/"), "Training directory.")
tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.")
tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.")

tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.")
tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99,
Expand All @@ -63,24 +85,12 @@
tf.app.flags.DEFINE_integer("batch_size", 64,
"Batch size to use during training.")
tf.app.flags.DEFINE_integer("size", 256, "Size of each model layer.")
tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
tf.app.flags.DEFINE_integer("code_vocab_size", 100000, "Program vocabulary size.")
tf.app.flags.DEFINE_integer("en_vocab_size", 100000, "English vocabulary size.")
tf.app.flags.DEFINE_string("data_dir", "/home/tjalling/Desktop/thesis/tensorflow/implementations/seq2seq/data/", "Data directory")
tf.app.flags.DEFINE_string("train_dir", "/home/tjalling/Desktop/thesis/tensorflow/implementations/seq2seq/train/", "Training directory.")
tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.")
tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.")
tf.app.flags.DEFINE_string("translated_dev_code", "dev/translated.en", "The dev file with Code translated into English.")
tf.app.flags.DEFINE_integer("max_train_data_size", 0,
"Limit on the size of training data (0: no limit).")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
"How many training steps to do per checkpoint.")
tf.app.flags.DEFINE_boolean("decode", False,
"Set to True for interactive decoding.")
tf.app.flags.DEFINE_boolean("self_test", False,
"Run a self-test if this is set to True.")
tf.app.flags.DEFINE_boolean("evaluate", False,
"Run evaluation metrics on the output.")


FLAGS = tf.app.flags.FLAGS
data_dir = FLAGS.data_dir + FLAGS.dataset + "/"
Expand Down Expand Up @@ -179,7 +189,7 @@ def translate_file(source_path=dev_code_file, target_path=translated_dev_code):
# Get output logits for the sentence.
_, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
target_weights, bucket_id, True)


# This is a greedy decoder - outputs are just argmaxes of output_logits.
outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
Expand All @@ -189,7 +199,7 @@ def translate_file(source_path=dev_code_file, target_path=translated_dev_code):
outputs = outputs[:outputs.index(data_utils.EOS_ID)]

# Write translated sentence to translation file.
translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) + "\n")
translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs if output in rev_en_vocab]) + "\n")

# print ("> %s" % sentence)
# print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]))
Expand All @@ -212,12 +222,12 @@ def create_model(session, forward_only):
FLAGS.learning_rate, FLAGS.learning_rate_decay_factor,
forward_only=forward_only)
ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
if ckpt and ckpt.model_checkpoint_path:
print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
model.saver.restore(session, ckpt.model_checkpoint_path)
else:
print("Created model with fresh parameters.")
session.run(tf.initialize_all_variables())
session.run(tf.global_variables_initializer())
return model


Expand Down Expand Up @@ -386,7 +396,7 @@ def decode():
if data_utils.EOS_ID in outputs:
outputs = outputs[:outputs.index(data_utils.EOS_ID)]
# Print out French sentence corresponding to outputs.
print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]))
print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs if output in rev_en_vocab]))
print("> ", end="")
sys.stdout.flush()
sentence = sys.stdin.readline()
Expand All @@ -399,7 +409,7 @@ def self_test():
# Create model with vocabularies of 10, 2 small buckets, 2 layers of 32.
model = seq2seq_model.Seq2SeqModel(10, 10, [(3, 3), (6, 6)], 32, 2,
5.0, 32, 0.3, 0.99, num_samples=8)
sess.run(tf.initialize_all_variables())
sess.run(tf.global_variables_initializer())

# Fake data set for both the (3, 3) and (6, 6) bucket.
data_set = ([([1, 1], [2, 2]), ([3, 3], [4]), ([5], [6])],
Expand All @@ -413,14 +423,20 @@ def self_test():


def main(_):
logger.info("Start")
if FLAGS.self_test:
logger.info("Self testing")
self_test()
elif FLAGS.decode:
logger.info("Decoding")
decode()
elif FLAGS.evaluate:
elif FLAGS.evaluate:
logger.info("Evaluating")
evaluate()
else:
logger.info("Training")
train()
logger.info("Stop")

if __name__ == "__main__":
tf.app.run()
Expand Down