diff --git a/bquery/ctable.py b/bquery/ctable.py
index fd5b899..5ad039c 100644
--- a/bquery/ctable.py
+++ b/bquery/ctable.py
@@ -8,9 +8,203 @@ import os
 import os
 from bquery.ctable_ext import \
     SUM, COUNT, COUNT_NA, COUNT_DISTINCT, SORTED_COUNT_DISTINCT
+import ast
+from types import ModuleType
+try:
+    import meta
+except ImportError:
+    # `meta` is optional: it is only needed for query transformation
+    meta = None
 
 
 class ctable(bcolz.ctable):
+    ###
+    ### Overriding bcolz.ctable methods
+    ###
+
+    def __init__(self, *args, **kwargs):
+        """Create the table and initialise the query transformation state.
+
+        All positional and keyword arguments are forwarded unchanged to
+        the :class:`bcolz.ctable` constructor.
+        """
+        # set before the parent constructor runs so that the overridden
+        # methods below can always rely on these attributes
+        self._transformers = []
+        # passed as `cparams` when evaluating transformed queries
+        self.intermediary_cparams = None
+        super(ctable, self).__init__(*args, **kwargs)
+
+    def eval(self, expression, **kwargs):
+        """Evaluate `expression` on the columns of this table.
+
+        If query transformers are configured and `expression` is a string,
+        the transformers are applied before the expression is evaluated.
+
+        Parameters
+        ----------
+        expression : string
+            The expression to evaluate.
+        user_dict : dict, optional
+            Additional variables for the expression.  The dictionary is
+            copied, the caller's object is not modified.
+        out_flavor : string, optional
+            Only consulted here for the constant expressions 'True' and
+            'False'; 'numpy' selects a NumPy array, anything else a carray.
+        kwargs : dict, optional
+            All remaining keyword arguments are passed to `bcolz.eval()`.
+
+        Returns
+        -------
+        out : carray or numpy.ndarray
+            The result of the evaluation.
+        """
+        is_string = isinstance(expression, bcolz.py2help._strtypes)
+        # work on a copy so that the caller's dictionary is never modified
+        user_dict = dict(kwargs.pop('user_dict', None) or {})
+        # TODO: clean this up/simplify once Blosc/bcolz#164 is resolved
+        if self._transformers and is_string:
+            # hand the variables over, otherwise they would be discarded
+            expression, user_dict = self.transform_query(expression,
+                                                         user_dict)
+        # TODO: clean this up/simplify once Blosc/bcolz#162 is resolved
+        if is_string and expression in ('True', 'False'):
+            out_flavor = kwargs.pop('out_flavor', None)
+            if out_flavor is None:
+                out_flavor = bcolz.defaults.eval_out_flavor
+            constant = np.array([expression == 'True'] * len(self),
+                                dtype=np.bool_)
+            if out_flavor == 'numpy':
+                return constant
+            return bcolz.carray(constant)
+
+        # make the columns available without overriding explicitly supplied
+        # variables or the ones added by the transformers
+        for name in self.cols:
+            user_dict.setdefault(name, self.cols[name])
+        return bcolz.eval(expression, user_dict=user_dict, **kwargs)
+
+    def where(self, expression, outcols=None, limit=None, skip=0):
+        """Call `bcolz.ctable.where()` after applying the query transformers.
+
+        If query transformers are configured and `expression` is a string,
+        it is evaluated through :meth:`eval` (which applies the
+        transformers) and the result is passed on in place of the string.
+        All other arguments are forwarded unchanged.
+        """
+        # if query transformers are defined, transform query
+        if self._transformers \
+                and isinstance(expression, bcolz.py2help._strtypes):
+            expression = self.eval(expression,
+                                   cparams=self.intermediary_cparams)
+        return super(ctable, self).where(expression, outcols=outcols,
+                                         limit=limit, skip=skip)
+
+    def __getitem__(self, key):
+        """Return the items selected by `key`.
+
+        If query transformers are configured and `key` is a string that is
+        not a column name, it is treated as a boolean expression: the
+        transformers are applied and the expression is evaluated.  Any
+        other `key` is handled by `bcolz.ctable.__getitem__()`.
+
+        Raises
+        ------
+        IndexError
+            If the expression does not evaluate to booleans.
+        """
+        # if query transformers are defined, transform query
+        if self._transformers \
+                and isinstance(key, bcolz.py2help._strtypes) \
+                and key not in self.names:
+            # key is not a column name, try to evaluate
+            query, user_dict = self.transform_query(key)
+            # TODO: clean this up/simplify once Blosc/bcolz#164 is resolved
+            user_dict.update({name: self.cols[name] for name in self.cols})
+            arr = bcolz.eval(query, user_dict=user_dict, depth=3,
+                             cparams=self.intermediary_cparams)
+            if arr.dtype.type != np.bool_:
+                raise IndexError(
+                    "`key` %s does not represent a boolean "
+                    "expression" % key)
+            # TODO: clean this up once Blosc/bcolz#162 is resolved
+            # only a scalar can be tested as a whole; comparing an array
+            # with `False` would not yield a single truth value
+            if isinstance(arr, np.bool_) and not arr:
+                dtype = np.dtype([(name, self.cols[name].dtype)
+                                  for name in self.names])
+                return np.empty(0, dtype=dtype).view(np.ndarray)
+            return self._where(arr)
+
+        return super(ctable, self).__getitem__(key)
+
+    ###
+    ### Extending bcolz.ctable
+    ###
+
+    @staticmethod
+    def _require_meta():
+        """Raise a `RuntimeError` if the optional module `meta` is missing."""
+        if not isinstance(meta, ModuleType):
+            raise RuntimeError(
+                'Query transformation requires the module `meta`.')
+
+    @property
+    def transformers(self):
+        """The list of :class:`QueryTransformer` instances that are applied
+        automatically to all query strings."""
+
+        return self._transformers
+
+    @transformers.setter
+    def transformers(self, value):
+        # enable query transformation if the required modules are installed
+        self._require_meta()
+        self._transformers = value
+
+    def transform_query(self, query, user_dict=None):
+        """transform_query(query, user_dict=None)
+
+        Applies the :class:`QueryTransformer` instances configured in
+        :attr:`self.transformers` to the `query`.
+
+        Parameters
+        ----------
+        query : string
+            A string forming a boolean expression, like
+            "(col1 == 'Example') & (col2 != 'Text')".
+        user_dict : dict, optional
+            Variables that are referenced in `query`.  The transformers may
+            add entries to this dictionary.  A new dictionary is created if
+            none is given.
+
+        Returns
+        -------
+        out : (string, dict)
+            A tuple containing the transformed query string and a dictionary
+            where the variables added by the transformer can be found by name.
+
+        Raises
+        ------
+        RuntimeError
+            If the module `meta` is not available.
+        """
+
+        if user_dict is None:
+            user_dict = {}
+
+        ast_tree = ast.parse(query)
+        # nothing to transform in an empty query; keep the return type
+        # consistent with the regular case
+        if not ast_tree.body:
+            return query, user_dict
+
+        self._require_meta()
+        for transformer in self._transformers:
+            ast_tree = transformer.apply(self, ast_tree, user_dict)
+
+        return meta.dump_python_source(ast_tree).strip(), user_dict
+
     def cache_valid(self, col):
         """
         Checks whether the column has a factorization that exists and is not older than the source
diff --git a/bquery/transformers.py b/bquery/transformers.py
new file mode 100644
index 0000000..354c95e
--- /dev/null
+++ b/bquery/transformers.py
@@ -0,0 +1,272 @@
+from ast import NodeTransformer
+from ast import (Eq, NotEq, In, NotIn, BitOr, BitAnd,
+                 Compare, BinOp,
+                 Name, Load, Str,
+                 )
+import ast
+import copy
+
+import bcolz
+
+__all__ = ['standard_transformers',
+           'QueryTransformer',
+           'InOperatorTransformer',
+           'CachedFactorOptimizer',
+           'TrivialBooleanExpressionsOptimizer',
+           ]
+
+
+class QueryTransformer(NodeTransformer):
+    """A :class:`ast.NodeTransformer` subclass that walks the abstract syntax tree
+    of the query and allows modification of nodes.
+
+    The user-provided dictionary of the variables in expression that is passed
+    to the `bcolz.eval()` function can be accessed and modified as
+    :attr:`self.user_dict`. The calling :class:`ctable` instance is available
+    as :attr:`self.ctable_`.
+
+    The `QueryTransformer` will walk the AST and use the return value of the
+    visitor methods to replace or remove the old node. If the return value of
+    the visitor method is ``None``, the node will be removed from its location,
+    otherwise it is replaced with the return value. The return value may be the
+    original node in which case no replacement takes place.
+
+    Keep in mind that if the node you're operating on has child nodes you must
+    either transform the child nodes yourself or call the :meth:`generic_visit`
+    method for the node first.
+
+    For nodes that were part of a collection of statements (that applies to all
+    statement nodes), the visitor may also return a list of nodes rather than
+    just a single node.
+
+    The visitor functions for the nodes are ``'visit_'`` + class name of the
+    node. So a `TryFinally` node visit function would be `visit_TryFinally`.
+    If no visitor function exists for a node (return value `None`) the
+    `generic_visit` visitor is used instead.
+
+    Usually you use the transformer like this::
+
+        node = QueryTransformer().apply(ctable_, node, user_dict)
+    """
+
+    def apply(self, ctable_, node, user_dict):
+        """Transform `node` and return the resulting node.
+
+        Parameters
+        ----------
+        ctable_ : ctable
+            The table the query is run against; stored as
+            :attr:`self.ctable_` for the visitor methods.
+        node : ast.AST
+            The root of the abstract syntax tree to transform.
+        user_dict : dict
+            The variables of the expression; stored as
+            :attr:`self.user_dict` for the visitor methods.
+        """
+        self.user_dict = user_dict
+        self.ctable_ = ctable_
+        return self.visit(node)
+
+
+class InOperatorTransformer(QueryTransformer):
+    """A :class:`QueryTransformer` that converts comparisons with `in` and
+    `not in` operators into expressions using `==` and `!=`.
+
+    Example:
+        `my_col in ['ABC', 'DEF']` is transformed into
+        `(my_col == 'ABC') | (my_col == 'DEF')`
+
+    Comparisons whose right-hand side is not a list, tuple or set literal
+    as well as chained comparisons are left untouched.
+
+    This is useful as Numexpr currently does not support `in` operators."""
+
+    def visit_Compare(self, node):
+        # first transform all child nodes if necessary
+        node = self.generic_visit(node)
+
+        # chained comparisons (`a in b in c`) cannot be rewritten safely
+        if len(node.ops) != 1 \
+                or not isinstance(node.ops[0], (In, NotIn)):
+            return node
+
+        # only literal collections provide the elements to compare against
+        container = node.comparators[0]
+        if not isinstance(container, (ast.List, ast.Tuple, ast.Set)):
+            return node
+        elements = container.elts
+
+        # replace `in` comparisons with empty comparison list
+        if len(elements) == 0:
+            if isinstance(node.ops[0], In):
+                return Name(id='False', ctx=Load())
+            else:
+                return Name(id='True', ctx=Load())
+
+        compare_op, binop_op = self.get_operators(node.ops[0])
+        # rewrite the first element in list using `==` / `!=` comparison
+        eq_expr = Compare(
+            left=node.left,
+            ops=[compare_op],
+            comparators=[elements[0]])
+        # join similar comparisons for all other elements using the
+        # appropriate binary operator, i.e. | or &
+        for element in elements[1:]:
+            eq_expr = BinOp(
+                left=eq_expr,
+                op=binop_op,
+                right=Compare(
+                    left=copy.copy(node.left),
+                    ops=[compare_op],
+                    comparators=[element]
+                )
+            )
+        return eq_expr
+
+    def get_operators(self, op):
+        """Return the comparison and the joining operator that replace `op`.
+
+        `in` maps to ``==`` joined by ``|``; everything else (i.e. `not in`)
+        maps to ``!=`` joined by ``&``.
+        """
+        if isinstance(op, In):
+            return Eq(), BitOr()
+        else:
+            return NotEq(), BitAnd()
+
+
+class CachedFactorOptimizer(QueryTransformer):
+    """A :class:`QueryTransformer` that converts comparisons containing
+    columns with cached factors into comparisons using the factor instead.
+
+    This potentially speeds up queries significantly:
+        - By detecting queries that will not return any values without
+          scanning the entire column.
+        - By evaluating the comparison on the integer typed factor rather than
+          a column of a datatype that is more costly to compare, e.g. String.
+
+    The `CachedFactorOptimizer` should be followed by the
+    :class:`TrivialBooleanExpressionsOptimizer` to obtain the full benefit."""
+
+    def visit_Compare(self, node):
+        # first transform all child nodes if necessary
+        node = self.generic_visit(node)
+
+        # check we have a simple comparison
+        if len(node.comparators) != 1 or len(node.ops) != 1:
+            return node
+
+        # TODO: we currently do not sort the values of the cached
+        # factors. Therefore we cannot optimize inequalities
+        if not isinstance(node.ops[0], (Eq, NotEq)):
+            return node
+
+        # col_name == 'value'
+        if isinstance(node.left, Name):
+            var = node.left
+            val = node.comparators[0]
+        # 'value' == col_name
+        elif isinstance(node.comparators[0], Name):
+            var = node.comparators[0]
+            val = node.left
+        # we can accelerate expressions that contain at least one column ref
+        else:
+            return node
+
+        # deal with strings and number nodes only; anything else (e.g. a
+        # second column or a nested expression) has no literal value
+        if isinstance(val, Str):
+            val_field = 's'
+        elif isinstance(val, ast.Num):
+            val_field = 'n'
+        else:
+            return node
+
+        col = var.id
+        if not self.ctable_.cache_valid(col):
+            return node
+
+        # find factor id for requested value
+        col_values_rootdir = self.ctable_[col].rootdir + '.values'
+        carray_values = bcolz.carray(rootdir=col_values_rootdir,
+                                     mode='r')
+        requested = getattr(val, val_field)
+        idx = None
+        for index, value in enumerate(carray_values.iter()):
+            if value == requested:
+                idx = index
+                break
+        # value not in cached factorisation
+        if idx is None:
+            if isinstance(node.ops[0], Eq):
+                return Name(id='False', ctx=Load())
+            else:
+                return Name(id='True', ctx=Load())
+
+        # found value in cached factorisation:
+        # rewrite the comparison expression
+        setattr(val, val_field, idx)
+        factor_name = 'bquery_factors_%s' % col
+        var.id = factor_name
+        # load the factor for later use
+        if factor_name not in self.user_dict:
+            col_factor_rootdir = self.ctable_[col].rootdir + '.factor'
+            self.user_dict[factor_name] = \
+                bcolz.carray(rootdir=col_factor_rootdir, mode='r')
+        return node
+
+
+class TrivialBooleanExpressionsOptimizer(QueryTransformer):
+    """A :class:`QueryTransformer` that simplifies boolean expression
+    containing subparts that are trivial boolean expressions.
+
+    Example:
+        `(my_col == 'ABC') & (False)` is transformed into
+        `False`
+
+    This speeds up queries, as expressions that can be logically determined
+    to never return any entries are not explicitly evaluated against the
+    database."""
+
+    def visit_BinOp(self, node):
+        # first transform all child nodes if necessary
+        node = self.generic_visit(node)
+
+        # only optimize & and | expressions
+        if not isinstance(node.op, (BitOr, BitAnd)):
+            return node
+
+        # find the operand that is a trivial boolean expression; a Name
+        # operand with any other id is a variable and must be kept
+        if self._is_trivial(node.left):
+            name_operand = node.left
+            other_operand = node.right
+        elif self._is_trivial(node.right):
+            name_operand = node.right
+            other_operand = node.left
+        # no trivial operand means there is nothing to simplify
+        else:
+            return node
+
+        # simplify comparisons containing trivial boolean expression
+        if isinstance(node.op, BitOr):
+            if name_operand.id == 'True':
+                return name_operand
+            else:
+                return other_operand
+        else:
+            if name_operand.id == 'False':
+                return name_operand
+            else:
+                return other_operand
+
+    @staticmethod
+    def _is_trivial(node):
+        """Return whether `node` is the name `True` or `False`."""
+        return isinstance(node, Name) and node.id in ('True', 'False')
+
+
+# provides a convenient short-cut for configuring a set of standard transformers
+standard_transformers = [InOperatorTransformer(),
+                         TrivialBooleanExpressionsOptimizer(),
+                         ]