visualfabriq · ARF1 · Mar 19, 2015 · Mar 21, 2015
diff --git a/bquery/ctable.py b/bquery/ctable.py
@@ -8,9 +8,133 @@
 import os
 from bquery.ctable_ext import \
     SUM, COUNT, COUNT_NA, COUNT_DISTINCT, SORTED_COUNT_DISTINCT
+import ast
+from types import ModuleType
+try:
+    import meta
+except ImportError:
+    pass
 
 
 class ctable(bcolz.ctable):
+    ###
+    ### Overriding bcolz.ctable methods
+    ###
+
+    def __init__(self, *args, **kwargs):
+        """Prepare the query-transformation state, then let
+        :class:`bcolz.ctable` build the table from `args` / `kwargs`."""
+        # compression parameters handed to `bcolz.eval` as `cparams` for
+        # intermediate results of transformed queries
+        self.intermediary_cparams = None
+        # no transformers configured: queries are passed through untouched
+        self._transformers = []
+        super(ctable, self).__init__(*args, **kwargs)
+
+    def eval(self, expression, **kwargs):
+        """Evaluate `expression` against the columns of this table.
+
+        If query transformers are configured and `expression` is a string,
+        the expression is first rewritten by :meth:`transform_query`.
+
+        Parameters
+        ----------
+        expression : string
+            The expression to evaluate.
+        kwargs : dict
+            ``user_dict`` (optional) supplies additional variables by name;
+            it is copied, never modified. All remaining keyword arguments
+            are forwarded to ``bcolz.eval``.
+
+        Returns
+        -------
+        out : object
+            The result of ``bcolz.eval``. When the transformed query
+            collapses to the literal ``True`` or ``False`` a boolean array
+            with one entry per row is returned instead (a numpy array or a
+            carray, depending on ``out_flavor``).
+        """
+        # TODO: clean this up/simplify once Blosc/bcolz#164 is resolved
+        # work on a copy so that the caller's dictionary is never mutated
+        user_dict = dict(kwargs.pop('user_dict', None) or {})
+        if self._transformers and isinstance(expression, str):
+            # pass the caller's variables on to the transformers instead of
+            # silently replacing them with a fresh dictionary
+            expression, user_dict = \
+                self.transform_query(expression, user_dict)
+            # TODO: clean this up/simplify once Blosc/bcolz#162 is resolved
+            if expression in ('True', 'False'):
+                out_flavor = kwargs.pop('out_flavor', None)
+                if out_flavor is None:
+                    out_flavor = bcolz.defaults.eval_out_flavor
+                constant = [expression == 'True'] * len(self)
+                # explicit dtype: an empty table must still yield booleans
+                if out_flavor == 'numpy':
+                    return np.array(constant, dtype=np.bool_)
+                return bcolz.carray(constant, dtype=np.bool_)
+
+        # make every column addressable by name, without overriding the
+        # variables supplied by the caller or added by the transformers
+        for name in self.cols:
+            user_dict.setdefault(name, self.cols[name])
+        return bcolz.eval(expression, user_dict=user_dict, **kwargs)
+
+    def where(self, expression, outcols=None, limit=None, skip=0):
+        """Select the rows that fulfil `expression`.
+
+        When query transformers are configured and `expression` is a
+        string, it is evaluated up front through :meth:`eval` (which applies
+        the transformers) and the result is handed to the parent
+        implementation in place of the string.
+
+        Parameters
+        ----------
+        expression : string or boolean array
+            The selection criterion.
+        outcols, limit, skip
+            Forwarded unchanged to ``bcolz.ctable.where``.
+
+        Returns
+        -------
+        out : object
+            Whatever ``bcolz.ctable.where`` returns.
+        """
+        if self._transformers and isinstance(expression, str):
+            # `eval` transforms the query itself, so it receives the raw
+            # string; transforming here as well would rewrite it twice.
+            # This signature has no `**kwargs`, hence the compression
+            # parameters come straight from `intermediary_cparams`.
+            expression = self.eval(expression,
+                                   cparams=self.intermediary_cparams)
+        return super(ctable, self).where(expression, outcols=outcols,
+                                         limit=limit, skip=skip)
+
+    def __getitem__(self, key):
+        """Index the table, treating unknown string keys as queries.
+
+        When query transformers are configured and `key` is a string that
+        is not a column name, the key is transformed, evaluated as a
+        boolean expression and the matching rows are returned. Every other
+        key is handled by ``bcolz.ctable.__getitem__``.
+
+        Raises
+        ------
+        IndexError
+            If the transformed query does not evaluate to booleans.
+        """
+        if self._transformers \
+                and isinstance(key, bcolz.py2help._strtypes) \
+                and key not in self.names:
+            # key is not a column name, try to evaluate
+            query, user_dict = self.transform_query(key)
+            # TODO: clean this up/simplify once Blosc/bcolz#164 is resolved
+            user_dict.update({name: self.cols[name] for name in self.cols})
+            arr = bcolz.eval(query, user_dict=user_dict, depth=3,
+                             cparams=self.intermediary_cparams)
+            if arr.dtype.type != np.bool_:
+                raise IndexError(
+                    "`key` %s does not represent a boolean "
+                    "expression" % query)
+            # TODO: clean this up once Blosc/bcolz#162 is resolved
+            # Only a scalar result can be the constant `False`. Testing the
+            # dimensionality first avoids `array == False`, whose truth
+            # value is ambiguous for multi-element numpy arrays.
+            if getattr(arr, 'ndim', 0) == 0 and not arr:
+                dtype = np.dtype([(name, self.cols[name].dtype)
+                                  for name in self.names])
+                return np.empty(0, dtype=dtype).view(np.ndarray)
+            return self._where(arr)
+
+        return super(ctable, self).__getitem__(key)
+
+    ###
+    ### Extending bcolz.ctable
+    ###
+
+    @property
+    def transformers(self):
+        """The list of :class:`QueryTransformer` instances that are applied
+        automatically to all query strings.
+
+        Assigning to this attribute raises :class:`RuntimeError` when the
+        optional module `meta` is not available.
+        """
+
+        return self._transformers
+
+    @transformers.setter
+    def transformers(self, value):
+        # `meta` is an optional dependency: the name is unbound when the
+        # guarded import at the top of this module failed
+        try:
+            meta_available = isinstance(meta, ModuleType)
+        except NameError:
+            meta_available = False
+        if not meta_available:
+            raise RuntimeError(
+                'Query transformation requires the module `meta`.')
+        self._transformers = value
+
+    def transform_query(self, query, user_dict=None):
+        """transform_query(query, user_dict=None)
+
+        Applies the :class:`QueryTransformer` instances configured in
+        :attr:`self.transformers` to the `query`.
+
+        Parameters
+        ----------
+        query : string
+            A string forming a boolean expression, like
+            "(col1 == 'Example') & (col2 != 'Text')".
+        user_dict : dict, optional
+            Variables referenced by the query. The dictionary is handed to
+            every transformer, which may add entries to it. A new, empty
+            dictionary is used when omitted.
+
+        Returns
+        -------
+        out : (string, dict)
+            A tuple containing the transformed query string and a dictionary
+            where the variables added by the transformer can be found by name.
+        """
+
+        if user_dict is None:
+            user_dict = {}
+
+        ast_tree = ast.parse(query)
+        if not ast_tree:
+            # keep the documented (string, dict) shape: callers unpack the
+            # result into two names
+            return query, user_dict
+
+        for transformer in self._transformers:
+            ast_tree = transformer.apply(self, ast_tree, user_dict)
+
+        return meta.dump_python_source(ast_tree).strip(), user_dict
+
     def cache_valid(self, col):
         """
         Checks whether the column has a factorization that exists and is not older than the source
@@ -32,7 +156,7 @@ def cache_valid(self, col):
         else:
             return False
 
-    def cache_factor(self, col_list, refresh=False):
+    def cache_factor(self, col_list, refresh=False, **kwargs):
         """
         Existing todos here are: these should be hidden helper carrays
         As in: not normal columns that you would normally see as a user
@@ -64,14 +188,14 @@ def cache_factor(self, col_list, refresh=False):
 
                 carray_factor = \
                     bcolz.carray([], dtype='int64', expectedlen=self.size,
-                                   rootdir=col_factor_rootdir, mode='w')
+                                   rootdir=col_factor_rootdir, mode='w', **kwargs)
                 _, values = \
                     ctable_ext.factorize(self[col], labels=carray_factor)
                 carray_factor.flush()
 
                 carray_values = \
                     bcolz.carray(values.values(), dtype=self[col].dtype,
-                                 rootdir=col_values_rootdir, mode='w')
+                                 rootdir=col_values_rootdir, mode='w', **kwargs)
                 carray_values.flush()
 
     def unique(self, col_or_col_list):

diff --git a/bquery/transformers.py b/bquery/transformers.py
@@ -0,0 +1,227 @@
+from ast import NodeTransformer
+from ast import (Eq, NotEq, In, NotIn, BitOr, BitAnd, 
+                 Compare, BinOp, 
+                 Name, Load, Str,
+                 )
+import ast
+import copy
+
+import bcolz
+
+__all__ = ['standard_transformers',
+           'QueryTransformer',
+           'InOperatorTransformer',
+           'CachedFactorOptimizer',
+           'TrivialBooleanExpressionsOptimizer',
+           ]
+
+class QueryTransformer(NodeTransformer):
+    """Base class for rewriting the abstract syntax tree of a query.
+
+    This is an :class:`ast.NodeTransformer`: it walks the tree and replaces
+    each node with whatever the matching visitor method returns.
+
+    - Visitor methods are named ``'visit_'`` + the node's class name, e.g.
+      `visit_TryFinally` for a `TryFinally` node. Nodes without a dedicated
+      visitor are handled by `generic_visit`.
+    - Returning the original node leaves it in place, returning another
+      node replaces it and returning ``None`` removes it.
+    - For nodes that are part of a collection of statements a visitor may
+      return a list of nodes instead of a single node.
+    - A visitor for a node with children must either transform the children
+      itself or call :meth:`generic_visit` on the node first.
+
+    While a query is being transformed, the dictionary of variables that is
+    passed to `bcolz.eval()` can be accessed and modified as
+    :attr:`self.user_dict`, and the calling :class:`ctable` instance is
+    available as :attr:`self.ctable_`.
+
+    Usually you use the transformer like this::
+
+        node = QueryTransformer().apply(ctable_, node, user_dict)
+    """
+
+    def apply(self, ctable_, node, user_dict):
+        """Remember the calling table and its variables, then visit `node`
+        and return the (possibly replaced) node."""
+        self.ctable_ = ctable_
+        self.user_dict = user_dict
+        transformed = self.visit(node)
+        return transformed
+
+
+class InOperatorTransformer(QueryTransformer):
+    """A :class:`QueryTransformer` that converts comparisons with `in` and
+    `not in` operators into expressions using `==` and `!=`.
+
+    Example:
+        `my_col in ['ABC', 'DEF']` is transformed into
+        `(my_col == 'ABC') | (my_col == 'DEF')`
+
+    An empty container yields the constant `False` (for `in`) or `True`
+    (for `not in`). Comparisons whose right-hand side is not a literal list,
+    tuple or set, as well as chained comparisons, are left untouched.
+
+    This is useful as Numexpr currently does not support `in` operators."""
+
+    def visit_Compare(self, node):
+        """Rewrite a single `in` / `not in` comparison against a literal
+        container; return every other comparison unchanged."""
+        # first transform all child nodes if necessary
+        node = self.generic_visit(node)
+
+        # a chained comparison such as `a in [1, 2] < b` cannot be expressed
+        # by the rewrite below without dropping its remaining operators
+        if len(node.ops) != 1 or not isinstance(node.ops[0], (In, NotIn)):
+            return node
+
+        # only literal containers expose their elements as `elts`;
+        # `my_col in some_name` has nothing that could be expanded
+        container = node.comparators[0]
+        if not isinstance(container, (ast.List, ast.Tuple, ast.Set)):
+            return node
+        elements = container.elts
+
+        # replace `in` comparisons with an empty comparison list
+        if not elements:
+            if isinstance(node.ops[0], In):
+                return Name(id='False', ctx=Load())
+            return Name(id='True', ctx=Load())
+
+        compare_op, binop_op = self.get_operators(node.ops[0])
+        # rewrite the first element in list using `==` / `!=` comparison
+        eq_expr = Compare(
+            left=node.left,
+            ops=[compare_op],
+            comparators=[elements[0]])
+        # join similar comparisons for all other elements using the
+        # appropriate binary operator, i.e. | or &
+        for element in elements[1:]:
+            eq_expr = BinOp(
+                left=eq_expr,
+                op=binop_op,
+                right=Compare(
+                    left=copy.copy(node.left),
+                    ops=[compare_op],
+                    comparators=[element]))
+        return eq_expr
+
+    def get_operators(self, op):
+        """Return the (comparison, joining) operator pair that replaces
+        `op`: `==` joined by `|` for `in`, `!=` joined by `&` otherwise."""
+        if isinstance(op, In):
+            return Eq(), BitOr()
+        else:
+            return NotEq(), BitAnd()
+
+
+class CachedFactorOptimizer(QueryTransformer):
+    """A :class:`QueryTransformer` that converts comparisons containing
+    columns with cached factors into comparisons using the factor instead.
+
+    This potentially speeds up queries significantly:
+     - By detecting queries that will not return any values without
+       scanning the entire column.
+     - By evaluating the comparison on the integer typed factor rather than
+       a column of a datatype that is more costly to compare, e.g. String.
+
+    Only `==` / `!=` comparisons between a column with a valid factor cache
+    and a string or number literal are rewritten; everything else is
+    returned unchanged.
+
+    The `CachedFactorOptimizer` should be followed by the
+    :class:`TrivialBooleanExpressionsOptimizer` to obtain the full benefit."""
+
+    def visit_Compare(self, node):
+        """Rewrite `col == literal` / `col != literal` to compare the
+        cached factor of `col` with the factor id of the literal, or to a
+        constant `True` / `False` when the literal is not among the cached
+        values."""
+        # first transform all child nodes if necessary
+        node = self.generic_visit(node)
+
+        # check we have a simple comparison
+        if len(node.comparators) != 1 or len(node.ops) != 1:
+            return node
+
+        # TODO: we currently do not sort the values of the cached
+        #       factors. Therefore we cannot optimize inequalities
+        if not isinstance(node.ops[0], (Eq, NotEq)):
+            return node
+
+        # col_name == 'value'
+        if isinstance(node.left, Name):
+            var, val = node.left, node.comparators[0]
+        # 'value' == col_name
+        elif isinstance(node.comparators[0], Name):
+            var, val = node.comparators[0], node.left
+        # we can only accelerate expressions that contain a column ref
+        else:
+            return node
+
+        # deal with strings and number nodes; anything else (e.g. a second
+        # name as in `col_a == col_b`) carries no literal to look up
+        val_field = 's' if isinstance(val, Str) else 'n'
+        if not hasattr(val, val_field):
+            return node
+
+        # the name may be a plain variable rather than a column
+        col = var.id
+        if col not in self.ctable_.names \
+                or not self.ctable_.cache_valid(col):
+            return node
+
+        # find factor id for requested value
+        col_values_rootdir = self.ctable_[col].rootdir + '.values'
+        carray_values = bcolz.carray(rootdir=col_values_rootdir,
+                                     mode='r')
+        wanted = getattr(val, val_field)
+        idx = None
+        for index, value in enumerate(carray_values.iter()):
+            if value == wanted:
+                idx = index
+                break
+        # value not in cached factorisation
+        if idx is None:
+            if isinstance(node.ops[0], Eq):
+                return Name(id='False', ctx=Load())
+            return Name(id='True', ctx=Load())
+
+        # found value in cached factorisation:
+        # rewrite the comparison expression in place
+        factor_name = 'bquery_factors_%s' % col
+        setattr(val, val_field, idx)
+        var.id = factor_name
+        # load the factor for later use (`in` instead of the Python 2 only
+        # `dict.has_key`)
+        if factor_name not in self.user_dict:
+            col_factor_rootdir = self.ctable_[col].rootdir + '.factor'
+            self.user_dict[factor_name] = \
+                bcolz.carray(rootdir=col_factor_rootdir, mode='r')
+        return node
+
+
+class TrivialBooleanExpressionsOptimizer(QueryTransformer):
+    """A :class:`QueryTransformer` that simplifies boolean expressions
+    containing subparts that are trivial boolean expressions.
+
+    Examples:
+        `(my_col == 'ABC') & (False)` is transformed into `False`
+        `(my_col == 'ABC') | (False)` is transformed into `my_col == 'ABC'`
+
+    This way queries that can be logically determined to never return any
+    entries do not have to be evaluated against the database."""
+
+    def visit_BinOp(self, node):
+        """Collapse `&` / `|` expressions in which one operand is the
+        constant `True` or `False`; return other nodes unchanged."""
+        # first transform all child nodes if necessary
+        node = self.generic_visit(node)
+
+        # only optimize & and | expressions
+        if not isinstance(node.op, (BitOr, BitAnd)):
+            return node
+
+        # inspect both operands: an ordinary variable name on the left must
+        # not hide a trivial constant on the right (`my_flag & False`)
+        if self._is_trivial(node.left):
+            trivial, other = node.left, node.right
+        elif self._is_trivial(node.right):
+            trivial, other = node.right, node.left
+        # no trivial boolean expression among the operands
+        else:
+            return node
+
+        # `True` decides an `|`, `False` decides an `&`; the opposite
+        # constant is neutral and simply drops out of the expression
+        deciding = 'True' if isinstance(node.op, BitOr) else 'False'
+        if trivial.id == deciding:
+            return trivial
+        return other
+
+    @staticmethod
+    def _is_trivial(operand):
+        """Tell whether `operand` is the bare name `True` or `False`."""
+        return isinstance(operand, Name) and operand.id in ('True', 'False')
+
+# Ready-made list of transformers for the common case: the `in` operator
+# rewrite followed by the simplification of trivial boolean expressions.
+standard_transformers = [
+    InOperatorTransformer(),
+    TrivialBooleanExpressionsOptimizer(),
+]