Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,29 @@
Set of (TensorFlow) implementations which generate comments from code. Bachelor's thesis for the B.Sc. in AI.

# How to execute:
## Seq2seq:
- Enter tf: source ~/tensorflow/bin/activate
- Execute code: python translate.py --size=256 --num_layers=3 --steps_per_checkpoint=50 --bleu
- Or interactive mode (only works when the model has been trained): python translate.py --size=350 --num_layers=3 --step_per_checkpoint=50 --decode
## Seq2seq:
- Create/activate virtualenv
```bash
source ~/tensorflow/bin/activate
```
- Install requirements
```bash
pip install --upgrade pip
pip install -r requirements.txt
```
- Run training. Note that the training process runs in an infinite loop:
```bash
python translate.py --num_layers=3
```
- Run evaluation (only works when the model has been trained):
```bash
python translate.py --num_layers=3 --evaluate
```

- Run interactive translation mode (only works when the model has been trained):
```bash
python translate.py --num_layers=3 --steps_per_checkpoint=50 --decode
```

### Options
- add --evaluate to see the score with a trained model on the development file (default False)
Expand Down
12 changes: 6 additions & 6 deletions dataset_generation/getDocStrings.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from os.path import basename, splitext
import sys
import util
import util


commentList = ["# ", "#!"]
Expand Down Expand Up @@ -83,7 +83,7 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket):
# loop through all the lines in the source, get the comment
# and the corresponding code
with open(commentFile, "a") as commentF:
with open(codeFile, "a") as codeF:
with open(codeFile, "a") as codeF:
for i in xrange(startLine, len(source)):
# print "i in comment loop is:" , i
globalI = i
Expand Down Expand Up @@ -117,10 +117,10 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket):

# first if we are at another indentation level, we found a deeper
# docstring, thus exit
if currIndent != indentation or not inComment:
if currIndent != indentation or not inComment:
# print ">>>It is a new comment, return error"
return(i,False)

# otherwise end the comment
else:
# print ">>>Closed comment"
Expand Down Expand Up @@ -153,7 +153,7 @@ def filterDocString(source, startLine, codeFile, commentFile, maxBucket):
commentF.write(util.cleanComment(comment) + "\n!@#$%!@#$%!@#$%!@#$%!@#$%")

return(i, True)

# if we are still here, add the current line to the code
code.append(line.strip())

Expand Down Expand Up @@ -200,6 +200,6 @@ def isDef(source, startLine, i):

if __name__ == '__main__':
import sys

with open(sys.argv[1]) as fp:
make_pairs(fp)
2 changes: 1 addition & 1 deletion ptr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ def step(self):
with tf.Session(config = tf.ConfigProto(gpu_options = gpu_options)) as sess:
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter("/tmp/pointer_logs", sess.graph)
init = tf.initialize_all_variables()
init = tf.global_variables_initializer()
sess.run(init)
for i in range(10000):
encoder_input_data, decoder_input_data, targets_data = dataset.next_batch(
Expand Down
7 changes: 7 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
funcsigs==1.0.2
mock==2.0.0
numpy==1.11.2
pbr==1.10.0
protobuf==3.1.0
six==1.10.0
tensorflow==0.12.0rc1
9 changes: 6 additions & 3 deletions seq2seq/evaluation/bleu/multi-bleu.perl
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,12 @@ sub add_to_ref {
printf "BLEU = 0, 0/0/0/0 (BP=0, ratio=0, hyp_len=0, ref_len=0)\n";
exit(1);
}

if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
if($length_translation==0){
$brevity_penalty = exp(0);
}else{
if ($length_translation<$length_reference) {
$brevity_penalty = exp(1-$length_reference/$length_translation);
}
}
$bleu = $brevity_penalty * exp((my_log( $bleu[1] ) +
my_log( $bleu[2] ) +
Expand Down
2 changes: 1 addition & 1 deletion seq2seq/seq2seq_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
self.updates.append(opt.apply_gradients(
zip(clipped_gradients, params), global_step=self.global_step))

self.saver = tf.train.Saver(tf.all_variables())
self.saver = tf.train.Saver(tf.global_variables())

def step(self, session, encoder_inputs, decoder_inputs, target_weights,
bucket_id, forward_only):
Expand Down
58 changes: 37 additions & 21 deletions seq2seq/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,30 @@
from evaluation.meteor.meteor import Meteor

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%m-%d %H:%M',)
logger = logging.getLogger(__file__)

warnings.filterwarnings("ignore", category=DeprecationWarning)

DATA_PATH = os.path.dirname(os.path.abspath(__file__))

tf.app.flags.DEFINE_boolean("decode", False,
"Set to True for interactive decoding.")
tf.app.flags.DEFINE_boolean("evaluate", False,
"Run evaluation metrics on the output.")
tf.app.flags.DEFINE_boolean("self_test", False,
"Run a self-test if this is set to True.")

tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
"How many training steps to do per checkpoint.")

tf.app.flags.DEFINE_string("data_dir", os.path.join(DATA_PATH, "data/"), "Data directory")
tf.app.flags.DEFINE_string("train_dir", os.path.join(DATA_PATH, "data/"), "Training directory.")
tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.")
tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.")

tf.app.flags.DEFINE_float("learning_rate", 0.5, "Learning rate.")
tf.app.flags.DEFINE_float("learning_rate_decay_factor", 0.99,
Expand All @@ -63,24 +85,12 @@
tf.app.flags.DEFINE_integer("batch_size", 64,
"Batch size to use during training.")
tf.app.flags.DEFINE_integer("size", 256, "Size of each model layer.")
tf.app.flags.DEFINE_integer("num_layers", 1, "Number of layers in the model.")
tf.app.flags.DEFINE_integer("code_vocab_size", 100000, "Program vocabulary size.")
tf.app.flags.DEFINE_integer("en_vocab_size", 100000, "English vocabulary size.")
tf.app.flags.DEFINE_string("data_dir", "/home/tjalling/Desktop/thesis/tensorflow/implementations/seq2seq/data/", "Data directory")
tf.app.flags.DEFINE_string("train_dir", "/home/tjalling/Desktop/thesis/tensorflow/implementations/seq2seq/train/", "Training directory.")
tf.app.flags.DEFINE_string("dataset", "allCode", "Specify the name of which dataset to use.")
tf.app.flags.DEFINE_string("dev_files", "dev/10pt.random", "The file path to the English dev file, relative from the data_dir.")
tf.app.flags.DEFINE_string("translated_dev_code", "dev/translated.en", "The dev file with Code translated into English.")
tf.app.flags.DEFINE_integer("max_train_data_size", 0,
"Limit on the size of training data (0: no limit).")
tf.app.flags.DEFINE_integer("steps_per_checkpoint", 200,
"How many training steps to do per checkpoint.")
tf.app.flags.DEFINE_boolean("decode", False,
"Set to True for interactive decoding.")
tf.app.flags.DEFINE_boolean("self_test", False,
"Run a self-test if this is set to True.")
tf.app.flags.DEFINE_boolean("evaluate", False,
"Run evaluation metrics on the output.")


FLAGS = tf.app.flags.FLAGS
data_dir = FLAGS.data_dir + FLAGS.dataset + "/"
Expand Down Expand Up @@ -179,7 +189,7 @@ def translate_file(source_path=dev_code_file, target_path=translated_dev_code):
# Get output logits for the sentence.
_, _, output_logits = model.step(sess, encoder_inputs, decoder_inputs,
target_weights, bucket_id, True)


# This is a greedy decoder - outputs are just argmaxes of output_logits.
outputs = [int(np.argmax(logit, axis=1)) for logit in output_logits]
Expand All @@ -189,7 +199,7 @@ def translate_file(source_path=dev_code_file, target_path=translated_dev_code):
outputs = outputs[:outputs.index(data_utils.EOS_ID)]

# Write translated sentence to translation file.
translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) + "\n")
translated_file.write(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs if output in rev_en_vocab]) + "\n")

# print ("> %s" % sentence)
# print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]))
Expand All @@ -212,12 +222,12 @@ def create_model(session, forward_only):
FLAGS.learning_rate, FLAGS.learning_rate_decay_factor,
forward_only=forward_only)
ckpt = tf.train.get_checkpoint_state(FLAGS.train_dir)
if ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path):
if ckpt and ckpt.model_checkpoint_path:
print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
model.saver.restore(session, ckpt.model_checkpoint_path)
else:
print("Created model with fresh parameters.")
session.run(tf.initialize_all_variables())
session.run(tf.global_variables_initializer())
return model


Expand Down Expand Up @@ -386,7 +396,7 @@ def decode():
if data_utils.EOS_ID in outputs:
outputs = outputs[:outputs.index(data_utils.EOS_ID)]
# Print out French sentence corresponding to outputs.
print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]))
print(" ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs if output in rev_en_vocab]))
print("> ", end="")
sys.stdout.flush()
sentence = sys.stdin.readline()
Expand All @@ -399,7 +409,7 @@ def self_test():
# Create model with vocabularies of 10, 2 small buckets, 2 layers of 32.
model = seq2seq_model.Seq2SeqModel(10, 10, [(3, 3), (6, 6)], 32, 2,
5.0, 32, 0.3, 0.99, num_samples=8)
sess.run(tf.initialize_all_variables())
sess.run(tf.global_variables_initializer())

# Fake data set for both the (3, 3) and (6, 6) bucket.
data_set = ([([1, 1], [2, 2]), ([3, 3], [4]), ([5], [6])],
Expand All @@ -413,14 +423,20 @@ def self_test():


def main(_):
logger.info("Start")
if FLAGS.self_test:
logger.info("Self testing")
self_test()
elif FLAGS.decode:
logger.info("Decoding")
decode()
elif FLAGS.evaluate:
elif FLAGS.evaluate:
logger.info("Evaluating")
evaluate()
else:
logger.info("Training")
train()
logger.info("Stop")

if __name__ == "__main__":
tf.app.run()
Expand Down