Skip to content
This repository was archived by the owner on Jun 19, 2025. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 127 additions & 3 deletions bquery/ctable.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,133 @@
import os
from bquery.ctable_ext import \
SUM, COUNT, COUNT_NA, COUNT_DISTINCT, SORTED_COUNT_DISTINCT
import ast
from types import ModuleType
try:
import meta
except ImportError:
pass


class ctable(bcolz.ctable):
###
### Overriding bcolz.ctable methods
###

def __init__(self, *args, **kwargs):
    """Set up the query-transformation state and build the table.

    All positional and keyword arguments are forwarded unchanged to the
    parent ``bcolz.ctable`` constructor.
    """
    # compression parameters handed to bcolz when evaluating intermediate
    # query results; ``None`` until the user assigns something else
    self.intermediary_cparams = None
    # no query transformers are active by default
    self._transformers = []
    super(ctable, self).__init__(*args, **kwargs)

def eval(self, expression, **kwargs):
    """Evaluate `expression` against the columns of this table.

    String expressions are first rewritten by the configured query
    transformers (if any).  Every column of the table is made available
    to the expression by name.

    Parameters
    ----------
    expression : string or object
        The expression handed to `bcolz.eval()`.
    user_dict : dict, optional (keyword only)
        Extra variables for the expression.  The dictionary is copied, so
        the caller's object is never modified.  Entries given here take
        precedence over columns of the same name.
    out_flavor : string, optional (keyword only)
        Consulted for the constant expressions 'True' / 'False'; otherwise
        forwarded to `bcolz.eval()` like all remaining keyword arguments.

    Returns
    -------
    out : object
        The result of `bcolz.eval()`.  For the constant expressions
        'True' / 'False' a boolean array with one entry per row is
        returned: a numpy array if the out flavor is 'numpy', a carray
        otherwise.
    """
    # TODO: clean this up/simplify once Blosc/bcolz#164 is resolved
    # copy so that neither the transformers nor the column injection
    # below leak into a dictionary owned by the caller
    user_dict = dict(kwargs.pop('user_dict', None) or {})
    is_query_string = isinstance(expression, bcolz.py2help._strtypes)
    if self._transformers and is_query_string:
        # hand over the caller's variables so they survive transformation
        expression, user_dict = self.transform_query(expression, user_dict)

    # TODO: clean this up/simplify once Blosc/bcolz#162 is resolved
    # only strings can be trivial; comparing e.g. an array against the
    # constants would be evaluated element-wise
    if is_query_string and expression in ('True', 'False'):
        out_flavor = kwargs.pop('out_flavor', None)
        if out_flavor is None:
            out_flavor = bcolz.defaults.eval_out_flavor
        constant = [expression == 'True'] * len(self)
        # an explicit dtype keeps the result boolean for empty tables too
        if out_flavor == 'numpy':
            return np.array(constant, dtype=np.bool_)
        return bcolz.carray(constant, dtype=np.bool_)

    # expose every column by name, without overriding variables that the
    # caller or a transformer already put into the dictionary
    for name in self.cols:
        user_dict.setdefault(name, self.cols[name])
    return bcolz.eval(expression, user_dict=user_dict, **kwargs)

def where(self, expression, outcols=None, limit=None, skip=0,
          cparams=None):
    """Iterate over the rows for which `expression` holds.

    If query transformers are configured and `expression` is a string,
    the expression is evaluated through :meth:`eval` (which applies the
    transformers) and the resulting boolean array is handed to the parent
    implementation.  In all other cases `expression` is passed on as is.

    Parameters
    ----------
    expression : string or boolean array
        The filter to apply.
    outcols, limit, skip
        Forwarded unchanged to the parent ``where``.
    cparams : object, optional
        Compression parameters for the intermediate boolean array.
        Defaults to :attr:`intermediary_cparams`.

    Returns
    -------
    out : object
        Whatever the parent ``where`` returns.
    """
    if self._transformers \
            and isinstance(expression, bcolz.py2help._strtypes):
        if cparams is None:
            cparams = self.intermediary_cparams
        # eval() runs the transformers itself; transforming here as well
        # would rewrite the query twice
        expression = self.eval(expression, cparams=cparams)
    return super(ctable, self).where(expression, outcols=outcols,
                                     limit=limit, skip=skip)

def __getitem__(self, key):
    """Return the item(s) selected by `key`.

    If query transformers are configured and `key` is a string that is
    not a column name, it is treated as a boolean query: it is rewritten
    by the transformers, evaluated, and the matching rows are returned.
    Every other key is delegated to the parent implementation.

    Raises
    ------
    IndexError
        If the query string does not evaluate to a boolean result.
    """
    # if query transformers are defined, transform query
    if self._transformers \
            and isinstance(key, bcolz.py2help._strtypes) \
            and key not in self.names:
        # key is not a column name, try to evaluate
        query, user_dict = self.transform_query(key)
        # TODO: clean this up/simplify once Blosc/bcolz#164 is resolved
        user_dict.update({name: self.cols[name] for name in self.cols})
        # NOTE(review): depth=3 is kept from the original call; its effect
        # on bcolz's variable lookup should be confirmed
        arr = bcolz.eval(query, user_dict=user_dict, depth=3,
                         cparams=self.intermediary_cparams)
        if arr.dtype.type != np.bool_:
            raise IndexError(
                "`key` %s does not represent a boolean "
                "expression" % query)
        # TODO: clean this up once Blosc/bcolz#162 is resolved
        # a constant query yields a single boolean instead of one value
        # per row; test the dimensionality first, because comparing a
        # full array with `False` has no unambiguous truth value
        if np.ndim(arr) == 0:
            if arr:
                # every row matches
                return super(ctable, self).__getitem__(slice(None))
            dtype = np.dtype([(name, self.cols[name].dtype)
                              for name in self.names])
            return np.empty(0, dtype=dtype).view(np.ndarray)
        return self._where(arr)

    return super(ctable, self).__getitem__(key)

###
### Extending bcolz.ctable
###

@property
def transformers(self):
    """The list of :class:`QueryTransformer` instances that are applied
    automatically to all query strings."""

    return self._transformers

@transformers.setter
def transformers(self, value):
    """Replace the list of active query transformers.

    The transformers are stored in a new list, so that later changes to
    this table's transformers do not alter a list shared with other
    tables (such as a module-level default list).  ``None`` clears the
    transformers.

    Raises
    ------
    RuntimeError
        If the optional module `meta` could not be imported.
    """
    # query transformation is only possible if `meta` is installed; the
    # name is unbound when the optional import at the top of the file
    # failed
    try:
        meta_available = isinstance(meta, ModuleType)
    except NameError:
        meta_available = False
    if not meta_available:
        raise RuntimeError(
            'Query transformation requires the module `meta`.')
    self._transformers = [] if value is None else list(value)

def transform_query(self, query, user_dict=None):
    """transform_query(query, user_dict=None)

    Applies the :class:`QueryTransformer` instances configured in
    :attr:`self.transformers` to the `query`.

    Parameters
    ----------
    query : string
        A string forming a boolean expression, like
        "(col1 == 'Example') & (col2 != 'Text')".
    user_dict : dict, optional
        Dictionary of variables used by the query.  Transformers may add
        entries to it.  A new dictionary is created if omitted.

    Returns
    -------
    out : (string, dict)
        A tuple containing the transformed query string and a dictionary
        where the variables added by the transformer can be found by name.
    """

    if user_dict is None:
        user_dict = {}

    ast_tree = ast.parse(query)
    # nothing to transform (e.g. an empty query); still return a tuple,
    # as callers unpack the result
    if not ast_tree.body:
        return query, user_dict

    for transformer in self._transformers:
        ast_tree = transformer.apply(self, ast_tree, user_dict)

    return meta.dump_python_source(ast_tree).strip(), user_dict

def cache_valid(self, col):
"""
Checks whether the column has a factorization that exists and is not older than the source
Expand All @@ -32,7 +156,7 @@ def cache_valid(self, col):
else:
return False

def cache_factor(self, col_list, refresh=False):
def cache_factor(self, col_list, refresh=False, **kwargs):
"""
Existing todos here are: these should be hidden helper carrays
As in: not normal columns that you would normally see as a user
Expand Down Expand Up @@ -64,14 +188,14 @@ def cache_factor(self, col_list, refresh=False):

carray_factor = \
bcolz.carray([], dtype='int64', expectedlen=self.size,
rootdir=col_factor_rootdir, mode='w')
rootdir=col_factor_rootdir, mode='w', **kwargs)
_, values = \
ctable_ext.factorize(self[col], labels=carray_factor)
carray_factor.flush()

carray_values = \
bcolz.carray(values.values(), dtype=self[col].dtype,
rootdir=col_values_rootdir, mode='w')
rootdir=col_values_rootdir, mode='w', **kwargs)
carray_values.flush()

def unique(self, col_or_col_list):
Expand Down
227 changes: 227 additions & 0 deletions bquery/transformers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
from ast import NodeTransformer
from ast import (Eq, NotEq, In, NotIn, BitOr, BitAnd,
Compare, BinOp,
Name, Load, Str,
)
import ast
import copy

import bcolz

# names exported by a star-import of this module
__all__ = [
    'standard_transformers',
    'QueryTransformer',
    'InOperatorTransformer',
    'CachedFactorOptimizer',
    'TrivialBooleanExpressionsOptimizer',
]

class QueryTransformer(NodeTransformer):
    """Base class for transformers that rewrite the AST of a query.

    This is an :class:`ast.NodeTransformer` subclass: it walks the abstract
    syntax tree of the query and lets visitor methods modify the nodes.

    While a query is being transformed, the dictionary of variables that is
    later passed to `bcolz.eval()` can be read and modified through
    :attr:`self.user_dict`, and the calling :class:`ctable` instance is
    available as :attr:`self.ctable_`.

    The return value of a visitor method decides what happens to the
    visited node: ``None`` removes the node from its location, any other
    value replaces it.  Returning the original node leaves the tree
    unchanged.

    If the node you are operating on has child nodes, you must either
    transform the child nodes yourself or call :meth:`generic_visit` for
    the node first.

    For nodes that are part of a collection of statements (which applies to
    all statement nodes), the visitor may also return a list of nodes
    instead of a single node.

    Visitor methods are named ``'visit_'`` + the class name of the node,
    e.g. `visit_TryFinally` for a `TryFinally` node.  Nodes without a
    matching visitor method are handled by `generic_visit`.

    Typical usage::
        node = QueryTransformer().apply(ctable_, node, user_dict)
    """

    def apply(self, ctable_, node, user_dict):
        """Transform `node` and return the resulting node.

        `ctable_` and `user_dict` are stored on the instance so that the
        visitor methods can reach them during the walk.
        """
        self.ctable_, self.user_dict = ctable_, user_dict
        return self.visit(node)


class InOperatorTransformer(QueryTransformer):
    """A :class:`QueryTransformer` that converts comparisons with `in` and
    `not in` operators into expressions using `==` and `!=`.

    Example:
        `my_col in ['ABC', 'DEF']` is transformed into
        `(my_col == 'ABC') | (my_col == 'DEF')`

    An empty container yields the constant `False` (for `in`) or `True`
    (for `not in`).  Comparisons whose right-hand side is not a literal
    list, tuple or set, as well as chained comparisons, are left unchanged.

    This is useful as Numexpr currently does not support `in` operators."""

    def visit_Compare(self, node):
        """Rewrite a single `in` / `not in` comparison; return the node
        unchanged if it cannot be rewritten."""
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # chained comparisons (`a in b in c`) carry further operands that
        # would be lost by the rewrite below
        if len(node.ops) != 1 \
                or not isinstance(node.ops[0], (In, NotIn)):
            return node

        # only literal containers expose their elements in the AST
        container = node.comparators[0]
        if not isinstance(container, (ast.List, ast.Tuple, ast.Set)):
            return node

        # replace `in` comparisons with an empty comparison list
        if not container.elts:
            if isinstance(node.ops[0], In):
                return Name(id='False', ctx=Load())
            return Name(id='True', ctx=Load())

        compare_op, binop_op = self.get_operators(node.ops[0])
        # build one `==` / `!=` comparison per element and join them using
        # the appropriate binary operator, i.e. | or &
        joined = None
        for element in container.elts:
            comparison = Compare(
                # each comparison gets its own copy of the left operand, so
                # that later in-place changes do not affect its siblings
                left=copy.copy(node.left),
                ops=[compare_op],
                comparators=[element])
            if joined is None:
                joined = comparison
            else:
                joined = BinOp(left=joined, op=binop_op, right=comparison)
        return joined

    def get_operators(self, op):
        """Return the (comparison operator, joining operator) pair that
        replaces `op`: `==` joined by `|` for `in`, `!=` joined by `&`
        otherwise."""
        if isinstance(op, In):
            return Eq(), BitOr()
        return NotEq(), BitAnd()


class CachedFactorOptimizer(QueryTransformer):
    """A :class:`QueryTransformer` that converts comparisons containing
    columns with cached factors into comparisons using the factor instead.

    This potentially speeds up queries significantly:
        - By detecting queries that will not return any values without
          scanning the entire column.
        - By evaluating the comparison on the integer typed factor rather than
          a column of a datatype that is more costly to compare, e.g. String.

    The `CachedFactorOptimizer` should be followed by the
    :class:`TrivialBooleanExpressionsOptimizer` to obtain the full benefit."""

    def visit_Compare(self, node):
        """Rewrite `col == literal` / `col != literal` to use the cached
        factor of `col`; return the node unchanged if that is not
        possible."""
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # check we have a simple comparison
        if len(node.comparators) != 1 or len(node.ops) != 1:
            return node

        # TODO: we currently do not sort the values of the cached
        # factors. Therefore we cannot optimize inequalities
        if not isinstance(node.ops[0], (Eq, NotEq)):
            return node

        # col_name == 'value'
        if isinstance(node.left, Name):
            var, val = node.left, node.comparators[0]
            val_is_left = False
        # 'value' == col_name
        elif isinstance(node.comparators[0], Name):
            var, val = node.comparators[0], node.left
            val_is_left = True
        # we can accelerate expressions that contain at least one column ref
        else:
            return node

        # only string and number literals can be looked up in the cached
        # values; anything else (e.g. another column) is left untouched
        if isinstance(val, Str):
            wanted = val.s
        elif isinstance(val, ast.Num):
            wanted = val.n
        else:
            return node

        col = var.id
        if not self.ctable_.cache_valid(col):
            return node

        # find factor id for requested value
        col_rootdir = self.ctable_[col].rootdir
        carray_values = bcolz.carray(rootdir=col_rootdir + '.values',
                                     mode='r')
        idx = None
        for index, value in enumerate(carray_values.iter()):
            if value == wanted:
                idx = index
                break

        # value not in cached factorisation
        if idx is None:
            if isinstance(node.ops[0], Eq):
                return Name(id='False', ctx=Load())
            return Name(id='True', ctx=Load())

        # found value in cached factorisation:
        # rewrite the comparison expression to compare the factor with the
        # integer id, using a proper number node for the id
        factor_id = ast.Num(n=idx)
        if val_is_left:
            node.left = factor_id
        else:
            node.comparators[0] = factor_id
        factor_name = 'bquery_factors_%s' % col
        var.id = factor_name
        # load the factor for later use
        if factor_name not in self.user_dict:
            self.user_dict[factor_name] = \
                bcolz.carray(rootdir=col_rootdir + '.factor', mode='r')
        return node


class TrivialBooleanExpressionsOptimizer(QueryTransformer):
    """A :class:`QueryTransformer` that simplifies boolean expressions
    containing subparts that are trivial boolean expressions.

    Examples:
        `(my_col == 'ABC') & (False)` is transformed into `False`
        `(my_col == 'ABC') | (False)` is transformed into `my_col == 'ABC'`

    This speeds up queries: parts whose outcome can be determined logically
    are not explicitly evaluated against the database."""

    def visit_BinOp(self, node):
        """Simplify an `&` / `|` expression that has `True` or `False` as
        one of its operands; return any other node unchanged."""
        # first transform all child nodes if necessary
        node = self.generic_visit(node)

        # only optimize & and | expressions
        if not isinstance(node.op, (BitOr, BitAnd)):
            return node

        # the constant that decides the outcome on its own:
        # `x | True` is `True`, `x & False` is `False`
        absorbing = 'True' if isinstance(node.op, BitOr) else 'False'

        # check both operands, so that a constant on the right is also
        # found when the left operand is a plain variable
        for candidate, other_operand in ((node.left, node.right),
                                         (node.right, node.left)):
            if not isinstance(candidate, Name):
                continue
            # the Name operand is not a trivial boolean expression but a
            # variable
            if candidate.id not in ('True', 'False'):
                continue
            if candidate.id == absorbing:
                return candidate
            # the neutral constant (`x | False`, `x & True`) drops out
            return other_operand

        # no trivial boolean expression among the operands
        return node

# A ready-made list of standard transformers, provided as a convenient
# short-cut for configuration.
standard_transformers = [
    InOperatorTransformer(),
    TrivialBooleanExpressionsOptimizer(),
]