From 8c579039a2e612215eb84960c77284d54a18c4f8 Mon Sep 17 00:00:00 2001
From: Randall Pittman <randallpittman@outlook.com>
Date: Mon, 18 Feb 2019 12:08:14 -0800
Subject: [PATCH 1/3] cdlparser.py -- initial Python 2/3 compatibility
 steps

cdlparser.py -- 2/3 compatibility fix octal strings to use 0o

cdlparser.py -- Fix variable name

cdlparser.py -- 2/3 integer division

cdlparser.py -- 2/3 string_escape compatibility. CDL files must be utf-8 compatible.

test_constants.py -- 2/3 compat -- keys() returns a view object in PY3, which cannot be indexed.

test_charvars.py -- 2/3 compat. netcdf NC_CHAR are bytes, not strings

cdlparser.py -- 2/3 basestring compatibility
---
 cdlparser.py           | 55 +++++++++++++++++++++++++++++++-----------
 test/test_charvars.py  |  2 +-
 test/test_constants.py |  6 +++--
 3 files changed, 46 insertions(+), 17 deletions(-)

diff --git a/cdlparser.py b/cdlparser.py
index 11ad19b..34ed70c 100644
--- a/cdlparser.py
+++ b/cdlparser.py
@@ -67,15 +67,23 @@
 
 Creator: Phil Bentley
 """
+from __future__ import print_function
+
 __version_info__ = (0, 0, 8, 'beta', 0)
 __version__ = "%d.%d.%d-%s" % __version_info__[0:4]
 
 import sys, os, logging, types
+import six
+import re
 import ply.lex as lex
 from ply.lex import TOKEN
 import ply.yacc as yacc
 import netCDF4 as nc4
 import numpy as np
+from functools import reduce
+
+if not six.PY2:
+   long = int
 
 # default fill values for netCDF-3 data types (as defined in netcdf.h include file)
 NC_FILL_BYTE   = np.int8(-127)
@@ -376,6 +384,7 @@ def t_DOUBLE_CONST(self, t) :
    def t_SHORT_CONST(self, t) :
       r'[+-]?([0-9]+|0[xX][0-9a-fA-F]+)[sS]'
       #r'[+-]?[0-9]+[sS]|0[xX][0-9a-fA-F]+[sS]'   # original regex in ncgen3.l file
+      t.value = fix_octal(t.value)
       try :
          int_val = int(eval(t.value[:-1]))
       except :
@@ -391,6 +400,7 @@ def t_SHORT_CONST(self, t) :
    def t_BYTE_CONST(self, t) :
       #r'[+-]?[0-9]+[Bb]'        # modified regex
       #r'[+-]?[0-9]*[0-9][Bb]'   # original regex in ncgen3.l file
+      t.value = fix_octal(t.value)
       try :
          if t.value[0] == "'" :
             int_val = ord(eval(t.value))
@@ -411,6 +421,7 @@ def t_INT_CONST(self, t) :
       r'[+-]?([1-9][0-9]*|0[xX]?[0-9a-fA-F]+|0)'   # [Ll] suffix has been deprecated
       #r'[+-]?([1-9][0-9]*|0)[lL]?' # original regex for decimal integers in ncgen3.l file
       #r'0[xX]?[0-9a-fA-F]+[lL]?'   # original regex for octal or hex integers in ncgen3.l file
+      t.value = fix_octal(t.value)
       try :
          long_val = long(eval(t.value))
       except :
@@ -418,7 +429,7 @@ def t_INT_CONST(self, t) :
          raise CDLContentError(errmsg)
       if long_val < XDR_INT_MIN or long_val > XDR_INT_MAX :
          errmsg = "Integer constant outside valid range (%d -> %d): %s" \
-            % (XDR_INT_MIN, XDR_INT_MAX, int_val)
+            % (XDR_INT_MIN, XDR_INT_MAX, long_val)
          raise CDLContentError(errmsg)
       else :
          t.value = np.int32(long_val)
@@ -472,7 +483,7 @@ def p_dimdecl(self, p) :
                  | dimd EQUALS DOUBLE_CONST
                  | dimd EQUALS NC_UNLIMITED_K"""
       dimname = ""
-      if isinstance(p[3], basestring) :
+      if isinstance(p[3], six.string_types) :
          if p[3] == "unlimited" :
             if self.rec_dimname :
                raise CDLContentError("Only one UNLIMITED dimension is allowed.")
@@ -639,7 +650,7 @@ def p_datadecl(self, p) :
          try :
             self.write_var_data(var, arr)
             self.logger.info("Wrote %d data value(s) for variable %s" % (len(arr), p[1]))
-         except Exception, exc :
+         except Exception as exc :
             self.logger.error(str(exc))
             raise
 
@@ -772,7 +783,7 @@ def write_var_data(self, var, arr) :
       arrlen = len(arr)
       varlen = var.size
       if is_charvar and var.ndim > 0 :
-         varlen /= var.shape[-1]
+         varlen = varlen // var.shape[-1]
       reclen = 0
       self.logger.debug("Length of passed-in data array = %d" % arrlen)
       if varlen : self.logger.debug("Expected length of variable = %d" % varlen)
@@ -782,7 +793,7 @@ def write_var_data(self, var, arr) :
       if is_recvar :
          rec_dimlen = len(self.ncdataset.dimensions[self.rec_dimname])
          if rec_dimlen > 0 :   # record dimension has been set to non-zero
-            reclen = varlen / rec_dimlen
+            reclen = varlen // rec_dimlen
          else :                # record dimension is still equal to zero
             varlen = arrlen
             reclen = 1
@@ -806,7 +817,7 @@ def write_var_data(self, var, arr) :
             put_char_data(var, arr, reclen)
          else :
             put_numeric_data(var, arr, reclen)
-      except Exception, exc :
+      except Exception as exc :
          errmsg = "Error attempting to write data array for variable %s\n" % var._name
          errmsg += "Exception details are as follows:\n%s" % str(exc)
          raise CDLContentError(errmsg)
@@ -814,12 +825,12 @@ def write_var_data(self, var, arr) :
    def _lextest(self, data) :
       """private method - for test purposes only"""
       self.lexer.input(data)
-      print "-----"
+      print("-----")
       while 1 :
          t = self.lexer.token()
          if not t : break
-         print "type: %-15s\tvalue: %s" % (t.type, t.value)
-      print "-----"
+         print("type: %-15s\tvalue: %s" % (t.type, t.value))
+      print("-----")
 
 #---------------------------------------------------------------------------------------------------
 def put_numeric_data(var, arr, reclen=0) :
@@ -827,7 +838,7 @@ def put_numeric_data(var, arr, reclen=0) :
    """Write numeric data array to netcdf variable."""
    nparr = np.array(arr, dtype=var.dtype)
    shape = list(var.shape)
-   if reclen : shape[0] = len(arr) / reclen
+   if reclen : shape[0] = len(arr) // reclen
    nparr.shape = shape
    var[:] = nparr
 
@@ -838,7 +849,7 @@ def put_char_data(var, arr, reclen=0) :
    maxlen = var.shape[-1] if var.ndim > 0 else 1
    nparr = str_list_to_char_arr(arr, maxlen)
    shape = list(var.shape)
-   if reclen : shape[0] = len(arr) / reclen
+   if reclen : shape[0] = len(arr) // reclen
    nparr.shape = shape
    var[:] = nparr
 
@@ -899,7 +910,23 @@ def expand_escapes(tstring) :
    A Python version of ncgen's expand_escapes() function (see escapes.c). This function simply
    uses the built-in string.decode() method.
    """
-   return tstring.decode('string_escape')
+   if six.PY2:
+      return tstring.decode('string_escape')
+   else:
+      return(bytes(tstring, 'utf-8').decode('unicode_escape'))
+
+#---------------------------------------------------------------------------------------------------
+def fix_octal(octal_str) :
+#---------------------------------------------------------------------------------------------------
+   """
+   Rewrites an old-style octal literal (optional +/- prefix, any suffix) to use the "0o" prefix.
+   """
+   m = re.match(r"([+-]?)0(\d+.*)", octal_str)
+   if m:
+      # Make octal python 3 compatible
+      return m.group(1) + "0o" + m.group(2)
+   else:
+      return octal_str
 
 #---------------------------------------------------------------------------------------------------
 def get_default_fill_value(datatype) :
@@ -926,14 +953,14 @@ def main() :
    """Rudimentary main function - primarily for testing purposes at this point in time."""
    debug = 0
    if len(sys.argv) < 2 :
-      print "usage: python cdlparser.py cdlfile [keyword=value, ...]"
+      print("usage: python cdlparser.py cdlfile [keyword=value, ...]")
       sys.exit(1)
    cdlfile = sys.argv[1]
    kwargs = {}
    if len(sys.argv) > 2 :
       keys = [x.split('=')[0] for x in sys.argv[2:]]
       vals = [eval(x.split('=')[1]) for x in sys.argv[2:]]
-      kwargs = dict(zip(keys,vals))
+      kwargs = dict(list(zip(keys,vals)))
    cdlparser = CDL3Parser(**kwargs)
    ncdataset = cdlparser.parse_file(cdlfile)
    try :
diff --git a/test/test_charvars.py b/test/test_charvars.py
index e2e5788..ca95da5 100644
--- a/test/test_charvars.py
+++ b/test/test_charvars.py
@@ -50,7 +50,7 @@ def tearDown(self) :
 
    def test_scalar_variables(self) :
       var = self.dataset.variables['letter']
-      self.assertTrue(var[:] == "X")
+      self.assertTrue(var[:] == b"X")
 
    def test_non_scalar_variables(self) :
       var = self.dataset.variables['regcodes']
diff --git a/test/test_constants.py b/test/test_constants.py
index 639c932..f6e71ab 100644
--- a/test/test_constants.py
+++ b/test/test_constants.py
@@ -111,13 +111,15 @@ def test_double_array(self) :
 
    def test_dimensions(self) :
       self.assertTrue(len(self.dataset.dimensions) == 1)
-      self.assertTrue(self.dataset.dimensions.keys()[0] == "dim1")
+      dimnames = [k for k in self.dataset.dimensions.keys()]
+      self.assertTrue(dimnames[0] == "dim1")
       dim = self.dataset.dimensions['dim1']
       self.assertTrue(len(dim) == 3)
 
    def test_variables(self) :
       self.assertTrue(len(self.dataset.variables) == 1)
-      self.assertTrue(self.dataset.variables.keys()[0] == "var1")
+      varnames = [k for k in self.dataset.variables.keys()]
+      self.assertTrue(varnames[0] == "var1")
       var = self.dataset.variables['var1']
       self.assertTrue(var.att1 == "dummy attribute")
       data = var[:]

From 6bcf33aa04185ff8af6794915db9b366c9a263f1 Mon Sep 17 00:00:00 2001
From: Randall Pittman <randallpittman@outlook.com>
Date: Mon, 18 Feb 2019 17:12:28 -0800
Subject: [PATCH 2/3] 2/3 compat - A few extra changes

cdlparser.py -- remove unnecessary list() from 2to3

test_constants.py -- Simple fix for netcdf4-python mask problem and make dimname and varname checks more pythonic
---
 cdlparser.py           | 2 +-
 test/test_constants.py | 9 +++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cdlparser.py b/cdlparser.py
index 34ed70c..a14409c 100644
--- a/cdlparser.py
+++ b/cdlparser.py
@@ -960,7 +960,7 @@ def main() :
    if len(sys.argv) > 2 :
       keys = [x.split('=')[0] for x in sys.argv[2:]]
       vals = [eval(x.split('=')[1]) for x in sys.argv[2:]]
-      kwargs = dict(list(zip(keys,vals)))
+      kwargs = dict(zip(keys,vals))
    cdlparser = CDL3Parser(**kwargs)
    ncdataset = cdlparser.parse_file(cdlfile)
    try :
diff --git a/test/test_constants.py b/test/test_constants.py
index f6e71ab..41c6991 100644
--- a/test/test_constants.py
+++ b/test/test_constants.py
@@ -17,6 +17,9 @@ def setUp(self) :
          variables:
             float var1(dim1) ;
                var1:att1 = "dummy attribute" ;
+               // FillValue necessary to enable masking in NETCDF3_CLASSIC right now.
+               // See https://github.com/Unidata/netcdf4-python/issues/725.
+               var1:_FillValue = 9.9692099683868690e+36;
          // global attributes
             :c1 = "foo" ;      // with spaces
             :c2="bar" ;        // w/o spaces
@@ -111,15 +114,13 @@ def test_double_array(self) :
 
    def test_dimensions(self) :
       self.assertTrue(len(self.dataset.dimensions) == 1)
-      dimnames = [k for k in self.dataset.dimensions.keys()]
-      self.assertTrue(dimnames[0] == "dim1")
+      self.assertTrue('dim1' in self.dataset.dimensions.keys())
       dim = self.dataset.dimensions['dim1']
       self.assertTrue(len(dim) == 3)
 
    def test_variables(self) :
       self.assertTrue(len(self.dataset.variables) == 1)
-      varnames = [k for k in self.dataset.variables.keys()]
-      self.assertTrue(varnames[0] == "var1")
+      self.assertTrue("var1" in self.dataset.variables.keys())
       var = self.dataset.variables['var1']
       self.assertTrue(var.att1 == "dummy attribute")
       data = var[:]

From 48687e874ab687ea131959f4cf5f6950828baf94 Mon Sep 17 00:00:00 2001
From: Randall Pittman <randallpittman@outlook.com>
Date: Mon, 25 Feb 2019 15:02:32 -0800
Subject: [PATCH 3/3] cdlparser.py -- Replace expand_escapes() code to properly
 handle text containing either escape codes or unicode characters.

---
 cdlparser.py | 34 ++++++++++++++++++++++++----------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/cdlparser.py b/cdlparser.py
index a14409c..7f1d518 100644
--- a/cdlparser.py
+++ b/cdlparser.py
@@ -72,6 +72,7 @@
 __version_info__ = (0, 0, 8, 'beta', 0)
 __version__ = "%d.%d.%d-%s" % __version_info__[0:4]
 
+import codecs
 import sys, os, logging, types
 import six
 import re
@@ -179,9 +180,8 @@ def parse_file(self, cdlfile, ncfile=None) :
       :returns: A handle to a netCDF4.Dataset object.
       """
       self.cdlfile = cdlfile
-      f = open(cdlfile)
-      data = f.read()   # FIXME: can we parse input w/o reading entire CDL file into memory?
-      f.close()
+      with codecs.open(cdlfile, encoding="utf-8") as f:
+         data = f.read()   # FIXME: can we parse input w/o reading entire CDL file into memory?
       return self.parse_text(data, ncfile=ncfile)
 
    def parse_text(self, cdltext, ncfile=None) :
@@ -196,7 +196,8 @@ def parse_text(self, cdltext, ncfile=None) :
       Alternatively, this can be done immediately upon completion of parsing by setting the
       close_on_completion keyword argument to True when instantiating the CDLParser instance.
 
-      :param cdltext: String containing the CDL text to parse.
+      :param cdltext: String containing the CDL text to parse. Must be a unicode string if it
+                      contains non-ASCII characters.
       :param ncfile: Optional pathname of the netCDF file to receive output.
       :returns: A handle to a netCDF4.Dataset object.
       """
@@ -903,17 +904,30 @@ def deescapify(name) :
       i += 1
    return newname
 
+
+# Regex for finding escape sequences
+ESCAPE_SEQUENCE_RE = re.compile(r'''
+    ( \\U........      # 8-digit hex escapes
+    | \\u....          # 4-digit hex escapes
+    | \\x..            # 2-digit hex escapes
+    | \\[0-7]{1,3}     # Octal escapes
+    | \\N\{[^}]+\}     # Unicode characters by name
+    | \\[\\'"abfnrtv]  # Single-character escapes
+   )''', re.UNICODE | re.VERBOSE)
+
 #---------------------------------------------------------------------------------------------------
 def expand_escapes(tstring) :
 #---------------------------------------------------------------------------------------------------
    """
-   A Python version of ncgen's expand_escapes() function (see escapes.c). This function simply
-   uses the built-in string.decode() method.
+   Function to convert escapes to actual (unicode) characters. Fulfills the same purpose as
+      expand_escapes() in ncgen3/escapes.c or unescape() in ncgen/escapes.c.
+   An input string containing non-ASCII characters must be a unicode string.
+   https://stackoverflow.com/a/24519338/2196270
    """
-   if six.PY2:
-      return tstring.decode('string_escape')
-   else:
-      return(bytes(tstring, 'utf-8').decode('unicode_escape'))
+   def decode_match(match):
+      return codecs.decode(match.group(0), 'unicode-escape')
+
+   return ESCAPE_SEQUENCE_RE.sub(decode_match, tstring)
 
 #---------------------------------------------------------------------------------------------------
 def fix_octal(octal_str) :