From 8c579039a2e612215eb84960c77284d54a18c4f8 Mon Sep 17 00:00:00 2001 From: Randall Pittman Date: Mon, 18 Feb 2019 12:08:14 -0800 Subject: [PATCH 1/3] 2/3 compatibility cdlparser.py -- initial 2/3 compatibility steps cdlparser.py -- 2/3 compatibility fix octal strings to use 0o cdlparser.py -- Fix variable name cdlparser.py -- 2/3 integer division cdlparser.py -- 2/3 string_escape compatibility. CDL files must be utf-8 compatible. test_constants.py -- 2/3 compat -- keys() is an iterator in PY3 and cannot be indexed. test_charvars.py -- 2/3 compat. netcdf NC_CHAR are bytes, not strings cdlparser.py -- 2/3 basestring compatibility --- cdlparser.py | 55 +++++++++++++++++++++++++++++++----------- test/test_charvars.py | 2 +- test/test_constants.py | 6 +++-- 3 files changed, 46 insertions(+), 17 deletions(-) diff --git a/cdlparser.py b/cdlparser.py index 11ad19b..34ed70c 100644 --- a/cdlparser.py +++ b/cdlparser.py @@ -67,15 +67,23 @@ Creator: Phil Bentley """ +from __future__ import print_function + __version_info__ = (0, 0, 8, 'beta', 0) __version__ = "%d.%d.%d-%s" % __version_info__[0:4] import sys, os, logging, types +import six +import re import ply.lex as lex from ply.lex import TOKEN import ply.yacc as yacc import netCDF4 as nc4 import numpy as np +from functools import reduce + +if not six.PY2: + long = int # default fill values for netCDF-3 data types (as defined in netcdf.h include file) NC_FILL_BYTE = np.int8(-127) @@ -376,6 +384,7 @@ def t_DOUBLE_CONST(self, t) : def t_SHORT_CONST(self, t) : r'[+-]?([0-9]+|0[xX][0-9a-fA-F]+)[sS]' #r'[+-]?[0-9]+[sS]|0[xX][0-9a-fA-F]+[sS]' # original regex in ncgen3.l file + t.value = fix_octal(t.value) try : int_val = int(eval(t.value[:-1])) except : @@ -391,6 +400,7 @@ def t_SHORT_CONST(self, t) : def t_BYTE_CONST(self, t) : #r'[+-]?[0-9]+[Bb]' # modified regex #r'[+-]?[0-9]*[0-9][Bb]' # original regex in ncgen3.l file + t.value = fix_octal(t.value) try : if t.value[0] == "'" : int_val = ord(eval(t.value)) @@ -411,6 +421,7 @@ def t_INT_CONST(self, t) : r'[+-]?([1-9][0-9]*|0[xX]?[0-9a-fA-F]+|0)' # [Ll] suffix has been deprecated #r'[+-]?([1-9][0-9]*|0)[lL]?' # original regex for decimal integers in ncgen3.l file #r'0[xX]?[0-9a-fA-F]+[lL]?' # original regex for octal or hex integers in ncgen3.l file + t.value = fix_octal(t.value) try : long_val = long(eval(t.value)) except : @@ -418,7 +429,7 @@ def t_INT_CONST(self, t) : raise CDLContentError(errmsg) if long_val < XDR_INT_MIN or long_val > XDR_INT_MAX : errmsg = "Integer constant outside valid range (%d -> %d): %s" \ - % (XDR_INT_MIN, XDR_INT_MAX, int_val) + % (XDR_INT_MIN, XDR_INT_MAX, long_val) raise CDLContentError(errmsg) else : t.value = np.int32(long_val) @@ -472,7 +483,7 @@ def p_dimdecl(self, p) : | dimd EQUALS DOUBLE_CONST | dimd EQUALS NC_UNLIMITED_K""" dimname = "" - if isinstance(p[3], basestring) : + if isinstance(p[3], six.string_types) : if p[3] == "unlimited" : if self.rec_dimname : raise CDLContentError("Only one UNLIMITED dimension is allowed.") @@ -639,7 +650,7 @@ def p_datadecl(self, p) : try : self.write_var_data(var, arr) self.logger.info("Wrote %d data value(s) for variable %s" % (len(arr), p[1])) - except Exception, exc : + except Exception as exc : self.logger.error(str(exc)) raise @@ -772,7 +783,7 @@ def write_var_data(self, var, arr) : arrlen = len(arr) varlen = var.size if is_charvar and var.ndim > 0 : - varlen /= var.shape[-1] + varlen = varlen // var.shape[-1] reclen = 0 self.logger.debug("Length of passed-in data array = %d" % arrlen) if varlen : self.logger.debug("Expected length of variable = %d" % varlen) @@ -782,7 +793,7 @@ def write_var_data(self, var, arr) : if is_recvar : rec_dimlen = len(self.ncdataset.dimensions[self.rec_dimname]) if rec_dimlen > 0 : # record dimension has been set to non-zero - reclen = varlen / rec_dimlen + reclen = varlen // rec_dimlen else : # record dimension is still equal to zero varlen = arrlen reclen = 1 @@ -806,7 +817,7 @@ def write_var_data(self, var, arr) : put_char_data(var, arr, reclen) else : put_numeric_data(var, arr, reclen) - except Exception, exc : + except Exception as exc : errmsg = "Error attempting to write data array for variable %s\n" % var._name errmsg += "Exception details are as follows:\n%s" % str(exc) raise CDLContentError(errmsg) @@ -814,12 +825,12 @@ def write_var_data(self, var, arr) : def _lextest(self, data) : """private method - for test purposes only""" self.lexer.input(data) - print "-----" + print("-----") while 1 : t = self.lexer.token() if not t : break - print "type: %-15s\tvalue: %s" % (t.type, t.value) - print "-----" + print("type: %-15s\tvalue: %s" % (t.type, t.value)) + print("-----") #--------------------------------------------------------------------------------------------------- def put_numeric_data(var, arr, reclen=0) : @@ -827,7 +838,7 @@ def put_numeric_data(var, arr, reclen=0) : """Write numeric data array to netcdf variable.""" nparr = np.array(arr, dtype=var.dtype) shape = list(var.shape) - if reclen : shape[0] = len(arr) / reclen + if reclen : shape[0] = len(arr) // reclen nparr.shape = shape var[:] = nparr @@ -838,7 +849,7 @@ def put_char_data(var, arr, reclen=0) : maxlen = var.shape[-1] if var.ndim > 0 else 1 nparr = str_list_to_char_arr(arr, maxlen) shape = list(var.shape) - if reclen : shape[0] = len(arr) / reclen + if reclen : shape[0] = len(arr) // reclen nparr.shape = shape var[:] = nparr @@ -899,7 +910,23 @@ def expand_escapes(tstring) : A Python version of ncgen's expand_escapes() function (see escapes.c). This function simply uses the built-in string.decode() method. """ - return tstring.decode('string_escape') + if six.PY2: + return tstring.decode('string_escape') + else: + return(bytes(tstring, 'utf-8').decode('unicode_escape')) + +#--------------------------------------------------------------------------------------------------- +def fix_octal(octal_str) : +#--------------------------------------------------------------------------------------------------- + """ + Fixes anything octal, including +/- prefix and letter suffix to use "0o" + """ + m = re.match(r"([+-]?)0(\d+.*)", octal_str) + if m: + # Make octal python 3 compatible + return m.group(1) + "0o" + m.group(2) + else: + return octal_str #--------------------------------------------------------------------------------------------------- def get_default_fill_value(datatype) : @@ -926,14 +953,14 @@ def main() : """Rudimentary main function - primarily for testing purposes at this point in time.""" debug = 0 if len(sys.argv) < 2 : - print "usage: python cdlparser.py cdlfile [keyword=value, ...]" + print("usage: python cdlparser.py cdlfile [keyword=value, ...]") sys.exit(1) cdlfile = sys.argv[1] kwargs = {} if len(sys.argv) > 2 : keys = [x.split('=')[0] for x in sys.argv[2:]] vals = [eval(x.split('=')[1]) for x in sys.argv[2:]] - kwargs = dict(zip(keys,vals)) + kwargs = dict(list(zip(keys,vals))) cdlparser = CDL3Parser(**kwargs) ncdataset = cdlparser.parse_file(cdlfile) try : diff --git a/test/test_charvars.py b/test/test_charvars.py index e2e5788..ca95da5 100644 --- a/test/test_charvars.py +++ b/test/test_charvars.py @@ -50,7 +50,7 @@ def tearDown(self) : def test_scalar_variables(self) : var = self.dataset.variables['letter'] - self.assertTrue(var[:] == "X") + self.assertTrue(var[:] == b"X") def test_non_scalar_variables(self) : var = self.dataset.variables['regcodes'] diff --git a/test/test_constants.py b/test/test_constants.py index 639c932..f6e71ab 100644 --- a/test/test_constants.py +++ b/test/test_constants.py @@ -111,13 +111,15 @@ def test_double_array(self) : def test_dimensions(self) : self.assertTrue(len(self.dataset.dimensions) == 1) - self.assertTrue(self.dataset.dimensions.keys()[0] == "dim1") + dimnames = [k for k in self.dataset.dimensions.keys()] + self.assertTrue(dimnames[0] == "dim1") dim = self.dataset.dimensions['dim1'] self.assertTrue(len(dim) == 3) def test_variables(self) : self.assertTrue(len(self.dataset.variables) == 1) - self.assertTrue(self.dataset.variables.keys()[0] == "var1") + varnames = [k for k in self.dataset.variables.keys()] + self.assertTrue(varnames[0] == "var1") var = self.dataset.variables['var1'] self.assertTrue(var.att1 == "dummy attribute") data = var[:] From 6bcf33aa04185ff8af6794915db9b366c9a263f1 Mon Sep 17 00:00:00 2001 From: Randall Pittman Date: Mon, 18 Feb 2019 17:12:28 -0800 Subject: [PATCH 2/3] 2/3 compat - A few extra changes cdlparser.py -- remove unnecessary list() from 2to3 test_constants.py -- Simple fix for netcdf4-python mask problem and make dimname and varname checks more pythonic --- cdlparser.py | 2 +- test/test_constants.py | 9 +++++---- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/cdlparser.py b/cdlparser.py index 34ed70c..a14409c 100644 --- a/cdlparser.py +++ b/cdlparser.py @@ -960,7 +960,7 @@ def main() : if len(sys.argv) > 2 : keys = [x.split('=')[0] for x in sys.argv[2:]] vals = [eval(x.split('=')[1]) for x in sys.argv[2:]] - kwargs = dict(list(zip(keys,vals))) + kwargs = dict(zip(keys,vals)) cdlparser = CDL3Parser(**kwargs) ncdataset = cdlparser.parse_file(cdlfile) try : diff --git a/test/test_constants.py b/test/test_constants.py index f6e71ab..41c6991 100644 --- a/test/test_constants.py +++ b/test/test_constants.py @@ -17,6 +17,9 @@ def setUp(self) : variables: float var1(dim1) ; var1:att1 = "dummy attribute" ; + // FillValue necessary to enable masking in NETCDF3_CLASSIC right now. + // See https://github.com/Unidata/netcdf4-python/issues/725. + var1:_FillValue = 9.9692099683868690e+36; // global attributes :c1 = "foo" ; // with spaces :c2="bar" ; // w/o spaces @@ -111,15 +114,13 @@ def test_double_array(self) : def test_dimensions(self) : self.assertTrue(len(self.dataset.dimensions) == 1) - dimnames = [k for k in self.dataset.dimensions.keys()] - self.assertTrue(dimnames[0] == "dim1") + self.assertTrue('dim1' in self.dataset.dimensions.keys()) dim = self.dataset.dimensions['dim1'] self.assertTrue(len(dim) == 3) def test_variables(self) : self.assertTrue(len(self.dataset.variables) == 1) - varnames = [k for k in self.dataset.variables.keys()] - self.assertTrue(varnames[0] == "var1") + self.assertTrue("var1" in self.dataset.variables.keys()) var = self.dataset.variables['var1'] self.assertTrue(var.att1 == "dummy attribute") data = var[:] From 48687e874ab687ea131959f4cf5f6950828baf94 Mon Sep 17 00:00:00 2001 From: Randall Pittman Date: Mon, 25 Feb 2019 15:02:32 -0800 Subject: [PATCH 3/3] cdlparser.py -- Replace expand_escapes() code to properly handle text containing either escape codes or unicode characters. --- cdlparser.py | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/cdlparser.py b/cdlparser.py index a14409c..7f1d518 100644 --- a/cdlparser.py +++ b/cdlparser.py @@ -72,6 +72,7 @@ __version_info__ = (0, 0, 8, 'beta', 0) __version__ = "%d.%d.%d-%s" % __version_info__[0:4] +import codecs import sys, os, logging, types import six import re @@ -179,9 +180,8 @@ def parse_file(self, cdlfile, ncfile=None) : :returns: A handle to a netCDF4.Dataset object. """ self.cdlfile = cdlfile - f = open(cdlfile) - data = f.read() # FIXME: can we parse input w/o reading entire CDL file into memory? - f.close() + with codecs.open(cdlfile, encoding="utf-8") as f: + data = f.read() # FIXME: can we parse input w/o reading entire CDL file into memory? return self.parse_text(data, ncfile=ncfile) def parse_text(self, cdltext, ncfile=None) : @@ -196,7 +196,8 @@ def parse_text(self, cdltext, ncfile=None) : Alternatively, this can be done immediately upon completion of parsing by setting the close_on_completion keyword argument to True when instantiating the CDLParser instance. - :param cdltext: String containing the CDL text to parse. + :param cdltext: String containing the CDL text to parse. Must be unicode str if containing + unicode. :param ncfile: Optional pathname of the netCDF file to receive output. :returns: A handle to a netCDF4.Dataset object. """ @@ -903,17 +904,30 @@ def deescapify(name) : i += 1 return newname + +# Regex for finding escape sequences +ESCAPE_SEQUENCE_RE = re.compile(r''' + ( \\U........ # 8-digit hex escapes + | \\u.... # 4-digit hex escapes + | \\x.. # 2-digit hex escapes + | \\[0-7]{1,3} # Octal escapes + | \\N\{[^}]+\} # Unicode characters by name + | \\[\\'"abfnrtv] # Single-character escapes + )''', re.UNICODE | re.VERBOSE) + #--------------------------------------------------------------------------------------------------- def expand_escapes(tstring) : #--------------------------------------------------------------------------------------------------- """ - A Python version of ncgen's expand_escapes() function (see escapes.c). This function simply - uses the built-in string.decode() method. + Function to convert escapes to actual (unicode) characters. Fulfills the same purpose as + expand_escapes() in ncgen3/escapes.c or unescape() in ncgen/escapes.c. + Input string containing unicode must be a unicode string. + https://stackoverflow.com/a/24519338/2196270 """ - if six.PY2: - return tstring.decode('string_escape') - else: - return(bytes(tstring, 'utf-8').decode('unicode_escape')) + def decode_match(match): + return codecs.decode(match.group(0), 'unicode-escape') + + return ESCAPE_SEQUENCE_RE.sub(decode_match, tstring) #--------------------------------------------------------------------------------------------------- def fix_octal(octal_str) :