From 079aa9b07c2d0b67e8f5791f71ea2fa5a2231dbb Mon Sep 17 00:00:00 2001
From: suryadesu <desusurya99@gmail.com>
Date: Sat, 5 Dec 2020 01:09:12 +0530
Subject: [PATCH 1/3] changing python2 to python3

---
 src/csharp/CSharpTemplate.py             | 10 ++++-----
 src/model/buildData.lua                  |  2 +-
 src/model/buildData.py                   | 25 +++++++++++++----------
 src/model/buildData.sh                   |  8 ++++----
 src/sql/SqlTemplate.py                   | 14 ++++++-------
 src/sql/regexp_tokenizer.py              |  4 ++--
 src/sqlparse/sqlparse/engine/grouping.py |  6 ++----
 src/sqlparse/sqlparse/filters.py         |  4 ++--
 src/sqlparse/sqlparse/lexer.py           | 20 ++++++++----------
 src/sqlparse/sqlparse/sql.py             | 26 ++++++++++++------------
 10 files changed, 59 insertions(+), 60 deletions(-)

diff --git a/src/csharp/CSharpTemplate.py b/src/csharp/CSharpTemplate.py
index f724846..ee43258 100644
--- a/src/csharp/CSharpTemplate.py
+++ b/src/csharp/CSharpTemplate.py
@@ -31,11 +31,11 @@ def parseCSharp(code):
 
 
 if __name__ == '__main__':
-  print parseCSharp("public Boolean SomeValue {     get { return someValue; }     set { someValue = value; } }")
-  print parseCSharp("Console.WriteLine('cat'); int mouse = 5; int cat = 0.4; int cow = 'c'; int moo = \"mouse\"; ")
-  print parseCSharp("int i = 4;  // i is assigned the literal value of '4' \n int j = i   // j is assigned the value of i.  Since i is a variable,               //it can change and is not a 'literal'")
+  print(parseCSharp("public Boolean SomeValue {     get { return someValue; }     set { someValue = value; } }"))
+  print(parseCSharp("Console.WriteLine('cat'); int mouse = 5; int cat = 0.4; int cow = 'c'; int moo = \"mouse\"; "))
+  print(parseCSharp("int i = 4;  // i is assigned the literal value of '4' \n int j = i   // j is assigned the value of i.  Since i is a variable,               //it can change and is not a 'literal'"))
   try:
-    print parseCSharp('string `fixed = Regex.Replace(input, "\s*()","$1");');
+    print(parseCSharp('string `fixed = Regex.Replace(input, "\\s*()","$1");'))  # "\\s": same text, no invalid-escape warning
-  except:
+  except Exception:  # demo input is expected to fail parsing; do not swallow KeyboardInterrupt/SystemExit
-    print "Error"
+    print("Error")
 
diff --git a/src/model/buildData.lua b/src/model/buildData.lua
index f7887c4..0c1311b 100644
--- a/src/model/buildData.lua
+++ b/src/model/buildData.lua
@@ -86,7 +86,7 @@ function main()
 	cmd:text()
 	opt = cmd:parse(arg)
 	local working_dir = os.getenv("CODENN_WORK")
-
+    print(working_dir .. "/vocab." .. opt.language)
 	local vocabFile = io.open(working_dir .. "/vocab." .. opt.language, 'r')
 	local vocab = JSON:decode(vocabFile:read())
 	vocabFile:close()
diff --git a/src/model/buildData.py b/src/model/buildData.py
index 4142f8f..ed9910a 100644
--- a/src/model/buildData.py
+++ b/src/model/buildData.py
@@ -13,11 +13,11 @@
 END = 4
 
 def tokenizeNL(nl):
-  nl = nl.strip().decode('utf-8').encode('ascii', 'replace')
+  nl = nl.strip().encode('ascii', 'replace').decode('ascii')  # as in Python 2: non-ASCII chars become '?'
   return re.findall(r"[\w]+|[^\s\w]", nl)
 
 def tokenizeCode(code, lang):
-  code = code.strip().decode('utf-8').encode('ascii', 'replace')
+  code = code.strip().encode('ascii', 'replace').decode('ascii')  # as in Python 2: non-ASCII chars become '?'
   typedCode = None
   if lang == "sql":
     query = SqlTemplate(code, regex=True)
@@ -43,9 +43,12 @@ def buildVocab(filename, code_unk_threshold, nl_unk_threshold, lang):
   tokens = collections.Counter()
 
   for line in open(filename, "r"):
-    qid, rid, nl, code, weight = line.strip().split('\t')
+    fields = line.strip().split('\t')
+    if len(fields) != 5:
+      continue  # skip malformed rows instead of failing on the unpack
+    qid, rid, nl, code, weight = fields
     tokens.update(tokenizeCode(code, lang))
     words.update(tokenizeNL(nl))
 
   token_count = END + 1
   nl_count = END + 1
@@ -83,10 +86,12 @@ def get_data(filename, vocab, dont_skip, max_code_length, max_nl_length):
   dataset = []
   skipped = 0
   for line in open(filename, 'r'):
-
-    qid, rid, nl, code, wt = line.strip().split('\t')
+    fields = line.strip().split('\t')
+    if len(fields) != 5:
+      continue  # malformed row: skip the whole iteration so no stale/undefined values are used below
+    qid, rid, nl, code, wt = fields
     codeToks  = tokenizeCode(code, vocab["lang"])
     nlToks = tokenizeNL(nl)
 
     datasetEntry = {"id": rid, "code": code, "code_sizes": len(codeToks), "code_num":[], "nl_num":[]}
 
@@ -108,8 +113,8 @@ def get_data(filename, vocab, dont_skip, max_code_length, max_nl_length):
     else:
       skipped += 1
 
-  print 'Total size = ' + str(len(dataset))
-  print 'Total skipped = ' + str(skipped)
+  print('Total size = ' + str(len(dataset)))
+  print('Total skipped = ' + str(skipped))
 
   f = open(os.environ["CODENN_WORK"] + '/' + os.path.basename(filename) + "." + lang, 'w')
   f.write(json.dumps(dataset))
diff --git a/src/model/buildData.sh b/src/model/buildData.sh
index c5716dc..0e03ff4 100755
--- a/src/model/buildData.sh
+++ b/src/model/buildData.sh
@@ -15,9 +15,9 @@ SQL_UNK_THRESHOLD=3
 CSHARP_UNK_THRESHOLD=2
 NL_UNK_THRESHOLD=2
 
-python buildData.py sql $MAX_CODE_LENGTH $MAX_NL_LENGTH $SQL_UNK_THRESHOLD $NL_UNK_THRESHOLD
-python buildData.py csharp $MAX_CODE_LENGTH $MAX_NL_LENGTH $CSHARP_UNK_THRESHOLD $NL_UNK_THRESHOLD
+python3 buildData.py sql $MAX_CODE_LENGTH $MAX_NL_LENGTH $SQL_UNK_THRESHOLD $NL_UNK_THRESHOLD
+#python3 buildData.py csharp $MAX_CODE_LENGTH $MAX_NL_LENGTH $CSHARP_UNK_THRESHOLD $NL_UNK_THRESHOLD
 
 
-th buildData.lua -language sql -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE
-th buildData.lua -language csharp -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE
+#th buildData.lua -language sql -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE
+#th buildData.lua -language csharp -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE
diff --git a/src/sql/SqlTemplate.py b/src/sql/SqlTemplate.py
index 5735d35..ce2254f 100755
--- a/src/sql/SqlTemplate.py
+++ b/src/sql/SqlTemplate.py
@@ -2,7 +2,7 @@
 from sql.ParseTypes import *
 import pdb
 import re
-from regexp_tokenizer import tokenizeRegex
+from .regexp_tokenizer import tokenizeRegex
 
 class SqlTemplate:
 
@@ -62,9 +62,9 @@ def renameIdentifiers(self, tok):
       tok.value = "CODE_HEX"
 
   def __hash__(self):
-    return hash(tuple([str(x) for x in self.tokensWithBlanks])) 
+    return hash(tuple([str(x) for x in self.tokensWithBlanks]))
 
-  def __init__(self, sql, regex=False, rename=True):  
+  def __init__(self, sql, regex=False, rename=True):
 
     self.sql = SqlTemplate.sanitizeSql(sql)
 
@@ -123,7 +123,7 @@ def identifySubQueries(self, tokenList):
 
     for tok in tokenList.tokens:
       if isinstance(tok, sqlparse.sql.TokenList):
-        subQuery = self.identifySubQueries(tok)  
+        subQuery = self.identifySubQueries(tok)
         if (subQuery and isinstance(tok, sqlparse.sql.Parenthesis)):
           tok.ptype = SUBQUERY
       elif str(tok) == "select":
@@ -156,7 +156,7 @@ def identifyLiterals(self, tokenList):
         tok.ptype = WILDCARD
       elif (tok.ttype in blankTokens or isinstance(tok, blankTokenTypes[0])):
         tok.ptype = COLUMN
-  
+
   def identifyFunctions(self, tokenList):
     for tok in tokenList.tokens:
       if (isinstance(tok, sqlparse.sql.Function)):
@@ -174,7 +174,7 @@ def identifyTables(self, tokenList):
       self.tableStack.append(False)
 
     for i in xrange(len(tokenList.tokens)):
-      prevtok = tokenList.tokens[i - 1] # Possible bug but unlikely 
+      prevtok = tokenList.tokens[i - 1] # Possible bug but unlikely
       tok = tokenList.tokens[i]
 
       if (str(tok) == "." and tok.ttype == sqlparse.tokens.Punctuation and prevtok.ptype == COLUMN):
@@ -187,7 +187,7 @@ def identifyTables(self, tokenList):
         self.tableStack[-1] = False
 
       if isinstance(tok, sqlparse.sql.TokenList):
-        self.identifyTables(tok)  
+        self.identifyTables(tok)
 
       elif (tok.ptype == COLUMN):
         if self.tableStack[-1]:
diff --git a/src/sql/regexp_tokenizer.py b/src/sql/regexp_tokenizer.py
index 9f2e6a1..c7c4aab 100644
--- a/src/sql/regexp_tokenizer.py
+++ b/src/sql/regexp_tokenizer.py
@@ -17,5 +17,5 @@ def tokenizeRegex(s):
   return results
 
 if __name__ == '__main__':
-  print tokenizeRegex("^discount[^(]*\\([0-9]+\\%\\)$")
-  print tokenizeRegex("'helloworld'")
+  print(tokenizeRegex("^discount[^(]*\\([0-9]+\\%\\)$"))
+  print(tokenizeRegex("'helloworld'"))
diff --git a/src/sqlparse/sqlparse/engine/grouping.py b/src/sqlparse/sqlparse/engine/grouping.py
index a317044..c38035d 100755
--- a/src/sqlparse/sqlparse/engine/grouping.py
+++ b/src/sqlparse/sqlparse/engine/grouping.py
@@ -8,7 +8,7 @@
 try:
     next
 except NameError:  # Python < 2.6
-    next = lambda i: i.next()
+    next = lambda i: i.__next__()
 
 
 def _group_left_right(tlist, ttype, value, cls,
@@ -209,9 +209,7 @@ def _next_token(tl, i):
     idx = 0
     token = _next_token(tlist, idx)
     while token:
-        identifier_tokens = [token] + list(
-            _consume_cycle(tlist,
-                           tlist.token_index(token) + 1))
+        identifier_tokens = [token] + list(_consume_cycle(tlist,tlist.token_index(token) + 1))
         # remove trailing whitespace
         if identifier_tokens and identifier_tokens[-1].ttype is T.Whitespace:
             identifier_tokens = identifier_tokens[:-1]
diff --git a/src/sqlparse/sqlparse/filters.py b/src/sqlparse/sqlparse/filters.py
index 676344f..8b2bb64 100755
--- a/src/sqlparse/sqlparse/filters.py
+++ b/src/sqlparse/sqlparse/filters.py
@@ -154,7 +154,7 @@ def process(self, stack, stream):
                         f.close()
 
                     # There was a problem loading the include file
-                    except IOError, err:
+                    except IOError as err:
                         # Raise the exception to the interpreter
                         if self.raiseexceptions:
                             raise
@@ -171,7 +171,7 @@ def process(self, stack, stream):
                                                      self.raiseexceptions)
 
                         # Max recursion limit reached
-                        except ValueError, err:
+                        except ValueError as err:
                             # Raise the exception to the interpreter
                             if self.raiseexceptions:
                                 raise
diff --git a/src/sqlparse/sqlparse/lexer.py b/src/sqlparse/sqlparse/lexer.py
index 9533e8a..966b09d 100755
--- a/src/sqlparse/sqlparse/lexer.py
+++ b/src/sqlparse/sqlparse/lexer.py
@@ -17,7 +17,7 @@
 
 from sqlparse import tokens
 from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON
-from cStringIO import StringIO
+from io import StringIO
 
 
 class include(str):
@@ -81,7 +81,7 @@ def _process_state(cls, unprocessed, processed, state):
 
             try:
                 rex = re.compile(tdef[0], rflags).match
-            except Exception, err:
+            except Exception as err:
                 raise ValueError(("uncompilable regex %r in state"
                                   " %r of %r: %s"
                                   % (tdef[0], state, cls, err)))
@@ -135,7 +135,7 @@ def process_tokendef(cls):
         cls._tmpname = 0
         processed = cls._all_tokens[cls.__name__] = {}
         #tokendefs = tokendefs or cls.tokens[name]
-        for state in cls.tokens.keys():
+        for state in list(cls.tokens.keys()):
             cls._process_state(cls.tokens, processed, state)
         return processed
 
@@ -152,9 +152,7 @@ def __call__(cls, *args, **kwds):
         return type.__call__(cls, *args, **kwds)
 
 
-class Lexer(object):
-
-    __metaclass__ = LexerMeta
+class Lexer(object, metaclass=LexerMeta):
 
     encoding = 'utf-8'
     stripall = False
@@ -235,8 +233,8 @@ def _decode(self, text):
         if self.encoding == 'guess':
             try:
                 text = text.decode('utf-8')
-                if text.startswith(u'\ufeff'):
-                    text = text[len(u'\ufeff'):]
+                if text.startswith('\ufeff'):
+                    text = text[len('\ufeff'):]
             except UnicodeDecodeError:
                 text = text.decode('latin1')
         else:
@@ -258,13 +256,13 @@ def get_tokens(self, text, unfiltered=False):
         Also preprocess the text, i.e. expand tabs and strip it if
         wanted and applies registered filters.
         """
-        if isinstance(text, basestring):
+        if isinstance(text,str):
             if self.stripall:
                 text = text.strip()
             elif self.stripnl:
                 text = text.strip('\n')
 
-            if sys.version_info[0] < 3 and isinstance(text, unicode):
+            if sys.version_info[0] < 3 and isinstance(text, str):
                 text = StringIO(text.encode('utf-8'))
                 self.encoding = 'utf-8'
             else:
@@ -342,7 +340,7 @@ def get_tokens_unprocessed(self, stream, stack=('root',)):
                         pos += 1
                         statestack = ['root']
                         statetokens = tokendefs['root']
-                        yield pos, tokens.Text, u'\n'
+                        yield pos, tokens.Text, '\n'
                         continue
                     yield pos, tokens.Error, text[pos]
                     pos += 1
diff --git a/src/sqlparse/sqlparse/sql.py b/src/sqlparse/sqlparse/sql.py
index a467fd0..9315808 100755
--- a/src/sqlparse/sqlparse/sql.py
+++ b/src/sqlparse/sqlparse/sql.py
@@ -35,13 +35,13 @@ def __str__(self):
         if sys.version_info[0] == 3:
             return self.value
         else:
-            return unicode(self).encode('utf-8')
+            return str(self).encode('utf-8')
 
     def getValue(self):
         if sys.version_info[0] == 3:
             return self.value
         else:
-            return unicode(self).encode('utf-8')
+            return str(self).encode('utf-8')
 
     def __repr__(self):
         short = self._get_repr_value()
@@ -60,15 +60,15 @@ def to_unicode(self):
         .. deprecated:: 0.1.5
            Use ``unicode(token)`` (for Python 3: ``str(token)``) instead.
         """
-        return unicode(self)
+        return str(self)
 
     def _get_repr_name(self):
         return str(self.ttype).split('.')[-1]
 
     def _get_repr_value(self):
-        raw = unicode(self)
+        raw = str(self)
         if len(raw) > 7:
-            raw = raw[:6] + u'...'
+            raw = raw[:6] + '...'
         return re.sub('\s+', ' ', raw)
 
     def flatten(self):
@@ -92,7 +92,7 @@ def match(self, ttype, values, regex=False):
             return type_matched
 
         if regex:
-            if isinstance(values, basestring):
+            if isinstance(values, str):
                 values = set([values])
 
             if self.ttype is T.Keyword:
@@ -105,7 +105,7 @@ def match(self, ttype, values, regex=False):
                     return True
             return False
 
-        if isinstance(values, basestring):
+        if isinstance(values, str):
             if self.is_keyword:
                 return values.upper() == self.normalized
             return values == self.value
@@ -184,13 +184,13 @@ def getValue(self):
         if sys.version_info[0] == 3:
             return str(type(self))[8:-2].split('.')[-1]
         else:
-            return unicode(str(type(self))[8:-2].split('.')[-1]).encode('utf-8')
+            return str(str(type(self))[8:-2].split('.')[-1]).encode('utf-8')
 
     def _to_string(self):
         if sys.version_info[0] == 3:
             return ''.join(x.value for x in self.flatten())
         else:
-            return ''.join(unicode(x) for x in self.flatten())
+            return ''.join(str(x) for x in self.flatten())
 
     def _get_repr_name(self):
         return self.__class__.__name__
@@ -203,9 +203,9 @@ def _pprint_tree(self, max_depth=None, depth=0):
                 pre = ' +-'
             else:
                 pre = ' | '
-            print '%s%s%d %s \'%s\'' % (indent, pre, idx,
+            print(('%s%s%d %s \'%s\'' % (indent, pre, idx,
                                         token._get_repr_name(),
-                                        token._get_repr_value())
+                                        token._get_repr_value())))
             if (token.is_group() and (max_depth is None or depth < max_depth)):
                 token._pprint_tree(max_depth, depth + 1)
 
@@ -298,7 +298,7 @@ def token_next_match(self, idx, ttype, value, regex=False):
         if not isinstance(idx, int):
             idx = self.token_index(idx)
 
-        for n in xrange(idx, len(self.tokens)):
+        for n in range(idx, len(self.tokens)):
             token = self.tokens[n]
             if token.match(ttype, value, regex):
                 return token
@@ -520,7 +520,7 @@ def get_typecast(self):
         next_ = self.token_next(self.token_index(marker), False)
         if next_ is None:
             return None
-        return unicode(next_)
+        return str(next_)
 
     def get_ordering(self):
         """Returns the ordering or ``None`` as uppercase string."""

From 29d584af8939363b808cf49728dec4b6e283e1a8 Mon Sep 17 00:00:00 2001
From: suryadesu <desusurya99@gmail.com>
Date: Sat, 5 Dec 2020 01:17:50 +0530
Subject: [PATCH 2/3] extraneous changes removed

---
 src/model/buildData.lua | 3 +--
 src/model/buildData.sh  | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/model/buildData.lua b/src/model/buildData.lua
index 0c1311b..2f16416 100644
--- a/src/model/buildData.lua
+++ b/src/model/buildData.lua
@@ -86,8 +86,7 @@ function main()
 	cmd:text()
 	opt = cmd:parse(arg)
 	local working_dir = os.getenv("CODENN_WORK")
-    print(working_dir .. "/vocab." .. opt.language)
-	local vocabFile = io.open(working_dir .. "/vocab." .. opt.language, 'r')
+    local vocabFile = io.open(working_dir .. "/vocab." .. opt.language, 'r')
 	local vocab = JSON:decode(vocabFile:read())
 	vocabFile:close()
 	torch.save(working_dir .. '/vocab.data.' .. opt.language , vocab)
diff --git a/src/model/buildData.sh b/src/model/buildData.sh
index 0e03ff4..b1ca419 100755
--- a/src/model/buildData.sh
+++ b/src/model/buildData.sh
@@ -16,8 +16,8 @@ CSHARP_UNK_THRESHOLD=2
 NL_UNK_THRESHOLD=2
 
 python3 buildData.py sql $MAX_CODE_LENGTH $MAX_NL_LENGTH $SQL_UNK_THRESHOLD $NL_UNK_THRESHOLD
-#python3 buildData.py csharp $MAX_CODE_LENGTH $MAX_NL_LENGTH $CSHARP_UNK_THRESHOLD $NL_UNK_THRESHOLD
+python3 buildData.py csharp $MAX_CODE_LENGTH $MAX_NL_LENGTH $CSHARP_UNK_THRESHOLD $NL_UNK_THRESHOLD
 
 
-#th buildData.lua -language sql -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE
-#th buildData.lua -language csharp -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE
+th buildData.lua -language sql -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE
+th buildData.lua -language csharp -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE

From 4d2e1b75ac3b9debb37671b1d7d7216b7520fee2 Mon Sep 17 00:00:00 2001
From: suryadesu <desusurya99@gmail.com>
Date: Sat, 5 Dec 2020 16:35:55 +0530
Subject: [PATCH 3/3] added modifications for sql data building

---
 src/sql/SqlTemplate.py                   |  2 +-
 src/sqlparse/sqlparse/engine/grouping.py | 20 ++++++++++++++------
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/src/sql/SqlTemplate.py b/src/sql/SqlTemplate.py
index ce2254f..336908c 100755
--- a/src/sql/SqlTemplate.py
+++ b/src/sql/SqlTemplate.py
@@ -173,7 +173,7 @@ def identifyTables(self, tokenList):
     if tokenList.ptype == SUBQUERY:
       self.tableStack.append(False)
 
-    for i in xrange(len(tokenList.tokens)):
+    for i in range(len(tokenList.tokens)):
       prevtok = tokenList.tokens[i - 1] # Possible bug but unlikely
       tok = tokenList.tokens[i]
 
diff --git a/src/sqlparse/sqlparse/engine/grouping.py b/src/sqlparse/sqlparse/engine/grouping.py
index c38035d..9559ef5 100755
--- a/src/sqlparse/sqlparse/engine/grouping.py
+++ b/src/sqlparse/sqlparse/engine/grouping.py
@@ -173,14 +173,14 @@ def _consume_cycle(tl, i):
         for t in tl.tokens[i:]:
             # Don't take whitespaces into account.
             if t.ttype is T.Whitespace:
                 yield t
                 continue
             if next(x)(t):
                 yield t
             else:
                 if isinstance(t, sql.Comment) and t.is_multiline():
                     yield t
-                raise StopIteration
+                return  # PEP 479: finish the generator with return; stops at the first non-matching token as before
 
     def _next_token(tl, i):
         # chooses the next token. if two tokens are found then the
@@ -202,13 +202,12 @@ def _next_token(tl, i):
             return t2
 
     # bottom up approach: group subgroups first
-    [group_identifier(sgroup) for sgroup in tlist.get_sublists()
-     if not isinstance(sgroup, sql.Identifier)]
+    [group_identifier(sgroup) for sgroup in tlist.get_sublists() if not isinstance(sgroup, sql.Identifier)]
 
     # real processing
     idx = 0
     token = _next_token(tlist, idx)
     while token:
         identifier_tokens = [token] + list(_consume_cycle(tlist,tlist.token_index(token) + 1))
         # remove trailing whitespace
         if identifier_tokens and identifier_tokens[-1].ttype is T.Whitespace: