From 079aa9b07c2d0b67e8f5791f71ea2fa5a2231dbb Mon Sep 17 00:00:00 2001 From: suryadesu Date: Sat, 5 Dec 2020 01:09:12 +0530 Subject: [PATCH 1/3] changing python2 to python3 --- src/csharp/CSharpTemplate.py | 10 ++++----- src/model/buildData.lua | 2 +- src/model/buildData.py | 25 +++++++++++++---------- src/model/buildData.sh | 8 ++++---- src/sql/SqlTemplate.py | 14 ++++++------- src/sql/regexp_tokenizer.py | 4 ++-- src/sqlparse/sqlparse/engine/grouping.py | 6 ++---- src/sqlparse/sqlparse/filters.py | 4 ++-- src/sqlparse/sqlparse/lexer.py | 20 ++++++++---------- src/sqlparse/sqlparse/sql.py | 26 ++++++++++++------------ 10 files changed, 59 insertions(+), 60 deletions(-) diff --git a/src/csharp/CSharpTemplate.py b/src/csharp/CSharpTemplate.py index f724846..ee43258 100644 --- a/src/csharp/CSharpTemplate.py +++ b/src/csharp/CSharpTemplate.py @@ -31,11 +31,11 @@ def parseCSharp(code): if __name__ == '__main__': - print parseCSharp("public Boolean SomeValue { get { return someValue; } set { someValue = value; } }") - print parseCSharp("Console.WriteLine('cat'); int mouse = 5; int cat = 0.4; int cow = 'c'; int moo = \"mouse\"; ") - print parseCSharp("int i = 4; // i is assigned the literal value of '4' \n int j = i // j is assigned the value of i. Since i is a variable, //it can change and is not a 'literal'") + print(parseCSharp("public Boolean SomeValue { get { return someValue; } set { someValue = value; } }")) + print(parseCSharp("Console.WriteLine('cat'); int mouse = 5; int cat = 0.4; int cow = 'c'; int moo = \"mouse\"; ")) + print(parseCSharp("int i = 4; // i is assigned the literal value of '4' \n int j = i // j is assigned the value of i. Since i is a variable, //it can change and is not a 'literal'")) try: - print parseCSharp('string `fixed = Regex.Replace(input, "\s*()","$1");'); + print(parseCSharp('string `fixed = Regex.Replace(input, "\s*()","$1");')); except: - print "Error" + print("Error") diff --git a/src/model/buildData.lua b/src/model/buildData.lua index f7887c4..0c1311b 100644 --- a/src/model/buildData.lua +++ b/src/model/buildData.lua @@ -86,7 +86,7 @@ function main() cmd:text() opt = cmd:parse(arg) local working_dir = os.getenv("CODENN_WORK") - + print(working_dir .. "/vocab." .. opt.language) local vocabFile = io.open(working_dir .. "/vocab." .. opt.language, 'r') local vocab = JSON:decode(vocabFile:read()) vocabFile:close() diff --git a/src/model/buildData.py b/src/model/buildData.py index 4142f8f..ed9910a 100644 --- a/src/model/buildData.py +++ b/src/model/buildData.py @@ -13,11 +13,11 @@ END = 4 def tokenizeNL(nl): - nl = nl.strip().decode('utf-8').encode('ascii', 'replace') + nl = nl.strip() return re.findall(r"[\w]+|[^\s\w]", nl) def tokenizeCode(code, lang): - code = code.strip().decode('utf-8').encode('ascii', 'replace') + code = code.strip() typedCode = None if lang == "sql": query = SqlTemplate(code, regex=True) @@ -43,9 +43,12 @@ def buildVocab(filename, code_unk_threshold, nl_unk_threshold, lang): tokens = collections.Counter() for line in open(filename, "r"): - qid, rid, nl, code, weight = line.strip().split('\t') - tokens.update(tokenizeCode(code, lang)) - words.update(tokenizeNL(nl)) + if len(line.strip().split('\t')) == 5: + qid, rid, nl, code, weight = line.strip().split('\t') + tokens.update(tokenizeCode(code, lang)) + words.update(tokenizeNL(nl)) + #tokens.update(tokenizeCode(code, lang)) + #words.update(tokenizeNL(nl)) token_count = END + 1 nl_count = END + 1 @@ -83,10 +86,10 @@ def get_data(filename, vocab, dont_skip, max_code_length, max_nl_length): dataset = [] skipped = 0 for line in open(filename, 'r'): - - qid, rid, nl, code, wt = line.strip().split('\t') - codeToks = tokenizeCode(code, vocab["lang"]) - nlToks = tokenizeNL(nl) + if len(line.strip().split('\t'))==5: + qid, rid, nl, code, wt = line.strip().split('\t') + codeToks = tokenizeCode(code, vocab["lang"]) + nlToks = tokenizeNL(nl) datasetEntry = {"id": rid, "code": code, "code_sizes": len(codeToks), "code_num":[], "nl_num":[]} @@ -108,8 +111,8 @@ def get_data(filename, vocab, dont_skip, max_code_length, max_nl_length): else: skipped += 1 - print 'Total size = ' + str(len(dataset)) - print 'Total skipped = ' + str(skipped) + print('Total size = ' + str(len(dataset))) + print('Total skipped = ' + str(skipped)) f = open(os.environ["CODENN_WORK"] + '/' + os.path.basename(filename) + "." + lang, 'w') f.write(json.dumps(dataset)) diff --git a/src/model/buildData.sh b/src/model/buildData.sh index c5716dc..0e03ff4 100755 --- a/src/model/buildData.sh +++ b/src/model/buildData.sh @@ -15,9 +15,9 @@ SQL_UNK_THRESHOLD=3 CSHARP_UNK_THRESHOLD=2 NL_UNK_THRESHOLD=2 -python buildData.py sql $MAX_CODE_LENGTH $MAX_NL_LENGTH $SQL_UNK_THRESHOLD $NL_UNK_THRESHOLD -python buildData.py csharp $MAX_CODE_LENGTH $MAX_NL_LENGTH $CSHARP_UNK_THRESHOLD $NL_UNK_THRESHOLD +python3 buildData.py sql $MAX_CODE_LENGTH $MAX_NL_LENGTH $SQL_UNK_THRESHOLD $NL_UNK_THRESHOLD +#python3 buildData.py csharp $MAX_CODE_LENGTH $MAX_NL_LENGTH $CSHARP_UNK_THRESHOLD $NL_UNK_THRESHOLD -th buildData.lua -language sql -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE -th buildData.lua -language csharp -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE +#th buildData.lua -language sql -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE +#th buildData.lua -language csharp -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE diff --git a/src/sql/SqlTemplate.py b/src/sql/SqlTemplate.py index 5735d35..ce2254f 100755 --- a/src/sql/SqlTemplate.py +++ b/src/sql/SqlTemplate.py @@ -2,7 +2,7 @@ from sql.ParseTypes import * import pdb import re -from regexp_tokenizer import tokenizeRegex +from .regexp_tokenizer import tokenizeRegex class SqlTemplate: @@ -62,9 +62,9 @@ def renameIdentifiers(self, tok): tok.value = "CODE_HEX" def __hash__(self): - return hash(tuple([str(x) for x in self.tokensWithBlanks])) + return hash(tuple([str(x) for x in self.tokensWithBlanks])) - def __init__(self, sql, regex=False, rename=True): + def __init__(self, sql, regex=False, rename=True): self.sql = SqlTemplate.sanitizeSql(sql) @@ -123,7 +123,7 @@ def identifySubQueries(self, tokenList): for tok in tokenList.tokens: if isinstance(tok, sqlparse.sql.TokenList): - subQuery = self.identifySubQueries(tok) + subQuery = self.identifySubQueries(tok) if (subQuery and isinstance(tok, sqlparse.sql.Parenthesis)): tok.ptype = SUBQUERY elif str(tok) == "select": @@ -156,7 +156,7 @@ def identifyLiterals(self, tokenList): tok.ptype = WILDCARD elif (tok.ttype in blankTokens or isinstance(tok, blankTokenTypes[0])): tok.ptype = COLUMN - + def identifyFunctions(self, tokenList): for tok in tokenList.tokens: if (isinstance(tok, sqlparse.sql.Function)): @@ -174,7 +174,7 @@ def identifyTables(self, tokenList): self.tableStack.append(False) for i in xrange(len(tokenList.tokens)): - prevtok = tokenList.tokens[i - 1] # Possible bug but unlikely + prevtok = tokenList.tokens[i - 1] # Possible bug but unlikely tok = tokenList.tokens[i] if (str(tok) == "." and tok.ttype == sqlparse.tokens.Punctuation and prevtok.ptype == COLUMN): @@ -187,7 +187,7 @@ def identifyTables(self, tokenList): self.tableStack[-1] = False if isinstance(tok, sqlparse.sql.TokenList): - self.identifyTables(tok) + self.identifyTables(tok) elif (tok.ptype == COLUMN): if self.tableStack[-1]: diff --git a/src/sql/regexp_tokenizer.py b/src/sql/regexp_tokenizer.py index 9f2e6a1..c7c4aab 100644 --- a/src/sql/regexp_tokenizer.py +++ b/src/sql/regexp_tokenizer.py @@ -17,5 +17,5 @@ def tokenizeRegex(s): return results if __name__ == '__main__': - print tokenizeRegex("^discount[^(]*\\([0-9]+\\%\\)$") - print tokenizeRegex("'helloworld'") + print(tokenizeRegex("^discount[^(]*\\([0-9]+\\%\\)$")) + print(tokenizeRegex("'helloworld'")) diff --git a/src/sqlparse/sqlparse/engine/grouping.py b/src/sqlparse/sqlparse/engine/grouping.py index a317044..c38035d 100755 --- a/src/sqlparse/sqlparse/engine/grouping.py +++ b/src/sqlparse/sqlparse/engine/grouping.py @@ -8,7 +8,7 @@ try: next except NameError: # Python < 2.6 - next = lambda i: i.next() + next = lambda i: i.__next__() def _group_left_right(tlist, ttype, value, cls, @@ -209,9 +209,7 @@ def _next_token(tl, i): idx = 0 token = _next_token(tlist, idx) while token: - identifier_tokens = [token] + list( - _consume_cycle(tlist, - tlist.token_index(token) + 1)) + identifier_tokens = [token] + list(_consume_cycle(tlist,tlist.token_index(token) + 1)) # remove trailing whitespace if identifier_tokens and identifier_tokens[-1].ttype is T.Whitespace: identifier_tokens = identifier_tokens[:-1] diff --git a/src/sqlparse/sqlparse/filters.py b/src/sqlparse/sqlparse/filters.py index 676344f..8b2bb64 100755 --- a/src/sqlparse/sqlparse/filters.py +++ b/src/sqlparse/sqlparse/filters.py @@ -154,7 +154,7 @@ def process(self, stack, stream): f.close() # There was a problem loading the include file - except IOError, err: + except IOError as err: # Raise the exception to the interpreter if self.raiseexceptions: raise @@ -171,7 +171,7 @@ def process(self, stack, stream): self.raiseexceptions) # Max recursion limit reached - except ValueError, err: + except ValueError as err: # Raise the exception to the interpreter if self.raiseexceptions: raise diff --git a/src/sqlparse/sqlparse/lexer.py b/src/sqlparse/sqlparse/lexer.py index 9533e8a..966b09d 100755 --- a/src/sqlparse/sqlparse/lexer.py +++ b/src/sqlparse/sqlparse/lexer.py @@ -17,7 +17,7 @@ from sqlparse import tokens from sqlparse.keywords import KEYWORDS, KEYWORDS_COMMON -from cStringIO import StringIO +from io import StringIO class include(str): @@ -81,7 +81,7 @@ def _process_state(cls, unprocessed, processed, state): try: rex = re.compile(tdef[0], rflags).match - except Exception, err: + except Exception as err: raise ValueError(("uncompilable regex %r in state" " %r of %r: %s" % (tdef[0], state, cls, err))) @@ -135,7 +135,7 @@ def process_tokendef(cls): cls._tmpname = 0 processed = cls._all_tokens[cls.__name__] = {} #tokendefs = tokendefs or cls.tokens[name] - for state in cls.tokens.keys(): + for state in list(cls.tokens.keys()): cls._process_state(cls.tokens, processed, state) return processed @@ -152,9 +152,7 @@ def __call__(cls, *args, **kwds): return type.__call__(cls, *args, **kwds) -class Lexer(object): - - __metaclass__ = LexerMeta +class Lexer(object, metaclass=LexerMeta): encoding = 'utf-8' stripall = False @@ -235,8 +233,8 @@ def _decode(self, text): if self.encoding == 'guess': try: text = text.decode('utf-8') - if text.startswith(u'\ufeff'): - text = text[len(u'\ufeff'):] + if text.startswith('\ufeff'): + text = text[len('\ufeff'):] except UnicodeDecodeError: text = text.decode('latin1') else: @@ -258,13 +256,13 @@ def get_tokens(self, text, unfiltered=False): Also preprocess the text, i.e. expand tabs and strip it if wanted and applies registered filters. """ - if isinstance(text, basestring): + if isinstance(text,str): if self.stripall: text = text.strip() elif self.stripnl: text = text.strip('\n') - if sys.version_info[0] < 3 and isinstance(text, unicode): + if sys.version_info[0] < 3 and isinstance(text, str): text = StringIO(text.encode('utf-8')) self.encoding = 'utf-8' else: @@ -342,7 +340,7 @@ def get_tokens_unprocessed(self, stream, stack=('root',)): pos += 1 statestack = ['root'] statetokens = tokendefs['root'] - yield pos, tokens.Text, u'\n' + yield pos, tokens.Text, '\n' continue yield pos, tokens.Error, text[pos] pos += 1 diff --git a/src/sqlparse/sqlparse/sql.py b/src/sqlparse/sqlparse/sql.py index a467fd0..9315808 100755 --- a/src/sqlparse/sqlparse/sql.py +++ b/src/sqlparse/sqlparse/sql.py @@ -35,13 +35,13 @@ def __str__(self): if sys.version_info[0] == 3: return self.value else: - return unicode(self).encode('utf-8') + return str(self).encode('utf-8') def getValue(self): if sys.version_info[0] == 3: return self.value else: - return unicode(self).encode('utf-8') + return str(self).encode('utf-8') def __repr__(self): short = self._get_repr_value() @@ -60,15 +60,15 @@ def to_unicode(self): .. deprecated:: 0.1.5 Use ``unicode(token)`` (for Python 3: ``str(token)``) instead. """ - return unicode(self) + return str(self) def _get_repr_name(self): return str(self.ttype).split('.')[-1] def _get_repr_value(self): - raw = unicode(self) + raw = str(self) if len(raw) > 7: - raw = raw[:6] + u'...' + raw = raw[:6] + '...' return re.sub('\s+', ' ', raw) def flatten(self): @@ -92,7 +92,7 @@ def match(self, ttype, values, regex=False): return type_matched if regex: - if isinstance(values, basestring): + if isinstance(values, str): values = set([values]) if self.ttype is T.Keyword: @@ -105,7 +105,7 @@ def match(self, ttype, values, regex=False): return True return False - if isinstance(values, basestring): + if isinstance(values, str): if self.is_keyword: return values.upper() == self.normalized return values == self.value @@ -184,13 +184,13 @@ def getValue(self): if sys.version_info[0] == 3: return str(type(self))[8:-2].split('.')[-1] else: - return unicode(str(type(self))[8:-2].split('.')[-1]).encode('utf-8') + return str(str(type(self))[8:-2].split('.')[-1]).encode('utf-8') def _to_string(self): if sys.version_info[0] == 3: return ''.join(x.value for x in self.flatten()) else: - return ''.join(unicode(x) for x in self.flatten()) + return ''.join(str(x) for x in self.flatten()) def _get_repr_name(self): return self.__class__.__name__ @@ -203,9 +203,9 @@ def _pprint_tree(self, max_depth=None, depth=0): pre = ' +-' else: pre = ' | ' - print '%s%s%d %s \'%s\'' % (indent, pre, idx, + print(('%s%s%d %s \'%s\'' % (indent, pre, idx, token._get_repr_name(), - token._get_repr_value()) + token._get_repr_value()))) if (token.is_group() and (max_depth is None or depth < max_depth)): token._pprint_tree(max_depth, depth + 1) @@ -298,7 +298,7 @@ def token_next_match(self, idx, ttype, value, regex=False): if not isinstance(idx, int): idx = self.token_index(idx) - for n in xrange(idx, len(self.tokens)): + for n in range(idx, len(self.tokens)): token = self.tokens[n] if token.match(ttype, value, regex): return token @@ -520,7 +520,7 @@ def get_typecast(self): next_ = self.token_next(self.token_index(marker), False) if next_ is None: return None - return unicode(next_) + return str(next_) def get_ordering(self): """Returns the ordering or ``None`` as uppercase string.""" From 29d584af8939363b808cf49728dec4b6e283e1a8 Mon Sep 17 00:00:00 2001 From: suryadesu Date: Sat, 5 Dec 2020 01:17:50 +0530 Subject: [PATCH 2/3] extraneous changes removed --- src/model/buildData.lua | 3 +-- src/model/buildData.sh | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/model/buildData.lua b/src/model/buildData.lua index 0c1311b..2f16416 100644 --- a/src/model/buildData.lua +++ b/src/model/buildData.lua @@ -86,8 +86,7 @@ function main() cmd:text() opt = cmd:parse(arg) local working_dir = os.getenv("CODENN_WORK") - print(working_dir .. "/vocab." .. opt.language) - local vocabFile = io.open(working_dir .. "/vocab." .. opt.language, 'r') + local vocabFile = io.open(working_dir .. "/vocab." .. opt.language, 'r') local vocab = JSON:decode(vocabFile:read()) vocabFile:close() torch.save(working_dir .. '/vocab.data.' .. opt.language , vocab) diff --git a/src/model/buildData.sh b/src/model/buildData.sh index 0e03ff4..b1ca419 100755 --- a/src/model/buildData.sh +++ b/src/model/buildData.sh @@ -16,8 +16,8 @@ CSHARP_UNK_THRESHOLD=2 NL_UNK_THRESHOLD=2 python3 buildData.py sql $MAX_CODE_LENGTH $MAX_NL_LENGTH $SQL_UNK_THRESHOLD $NL_UNK_THRESHOLD -#python3 buildData.py csharp $MAX_CODE_LENGTH $MAX_NL_LENGTH $CSHARP_UNK_THRESHOLD $NL_UNK_THRESHOLD +python3 buildData.py csharp $MAX_CODE_LENGTH $MAX_NL_LENGTH $CSHARP_UNK_THRESHOLD $NL_UNK_THRESHOLD -#th buildData.lua -language sql -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE -#th buildData.lua -language csharp -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE +th buildData.lua -language sql -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE +th buildData.lua -language csharp -max_code_length $MAX_CODE_LENGTH -max_nl_length $MAX_NL_LENGTH -batch_size $BATCH_SIZE From 4d2e1b75ac3b9debb37671b1d7d7216b7520fee2 Mon Sep 17 00:00:00 2001 From: suryadesu Date: Sat, 5 Dec 2020 16:35:55 +0530 Subject: [PATCH 3/3] added modifications for sql data building --- src/sql/SqlTemplate.py | 2 +- src/sqlparse/sqlparse/engine/grouping.py | 20 ++++++++++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/sql/SqlTemplate.py b/src/sql/SqlTemplate.py index ce2254f..336908c 100755 --- a/src/sql/SqlTemplate.py +++ b/src/sql/SqlTemplate.py @@ -173,7 +173,7 @@ def identifyTables(self, tokenList): if tokenList.ptype == SUBQUERY: self.tableStack.append(False) - for i in xrange(len(tokenList.tokens)): + for i in range(len(tokenList.tokens)): prevtok = tokenList.tokens[i - 1] # Possible bug but unlikely tok = tokenList.tokens[i] diff --git a/src/sqlparse/sqlparse/engine/grouping.py b/src/sqlparse/sqlparse/engine/grouping.py index c38035d..9559ef5 100755 --- a/src/sqlparse/sqlparse/engine/grouping.py +++ b/src/sqlparse/sqlparse/engine/grouping.py @@ -173,14 +173,22 @@ def _consume_cycle(tl, i): for t in tl.tokens[i:]: # Don't take whitespaces into account. if t.ttype is T.Whitespace: - yield t + try: + yield t + except StopIteration: + return continue if next(x)(t): - yield t + try: + yield t + except StopIteration: + return else: if isinstance(t, sql.Comment) and t.is_multiline(): - yield t - raise StopIteration + try: + yield t + except StopIteration: + return def _next_token(tl, i): # chooses the next token. if two tokens are found then the @@ -202,13 +210,13 @@ def _next_token(tl, i): return t2 # bottom up approach: group subgroups first - [group_identifier(sgroup) for sgroup in tlist.get_sublists() - if not isinstance(sgroup, sql.Identifier)] + [group_identifier(sgroup) for sgroup in tlist.get_sublists() if not isinstance(sgroup, sql.Identifier)] # real processing idx = 0 token = _next_token(tlist, idx) while token: + k = _consume_cycle(tlist,tlist.token_index(token)+1) identifier_tokens = [token] + list(_consume_cycle(tlist,tlist.token_index(token) + 1)) # remove trailing whitespace if identifier_tokens and identifier_tokens[-1].ttype is T.Whitespace: