diff --git a/pydough/conversion/hybrid_expressions.py b/pydough/conversion/hybrid_expressions.py
index d28a78286..8b8f19843 100644
--- a/pydough/conversion/hybrid_expressions.py
+++ b/pydough/conversion/hybrid_expressions.py
@@ -14,10 +14,12 @@
     "HybridFunctionExpr",
     "HybridLiteralExpr",
     "HybridRefExpr",
+    "make_condition",
 ]
 
 import copy
 from abc import ABC, abstractmethod
+from collections.abc import Collection
 
 import pydough.pydough_operators as pydop
 from pydough.qdag import (
@@ -766,3 +768,30 @@ def expand_sided(self, shift: int) -> HybridExpr:
             self.typ,
             self.kwargs,
         )
+
+
+def make_condition(
+    expressions: Collection[HybridExpr], conjunction: bool
+) -> HybridExpr:
+    """
+    Converts a collection of expressions into a composite boolean expression,
+    either a conjunction or a disjunction. If the collection is empty, returns
+    a literal True expression.
+
+    Args:
+        `expressions`: the expressions to combine into a condition.
+        `conjunction`: whether to combine the expressions using AND (if True)
+        or OR (if False).
+
+    Returns:
+        The combined condition expression.
+    """
+    if not expressions:
+        return HybridLiteralExpr(Literal(True, BooleanType()))
+    if len(expressions) == 1:
+        return next(iter(expressions))
+    if conjunction:
+        return HybridFunctionExpr(
+            pydop.BAN, sorted(expressions, key=repr), BooleanType()
+        )
+    return HybridFunctionExpr(pydop.BOR, sorted(expressions, key=repr), BooleanType())
diff --git a/pydough/conversion/hybrid_filter_merger.py b/pydough/conversion/hybrid_filter_merger.py
new file mode 100644
index 000000000..9062ab84f
--- /dev/null
+++ b/pydough/conversion/hybrid_filter_merger.py
@@ -0,0 +1,560 @@
+"""
+Logic to merge multiple subtrees in the hybrid tree into one if they are the
+same except that one of them has more filters than the other and is only used
+in a COUNT aggregation, meaning the filters can be implemented by doing a SUM
+on the less-filtered subtree where the SUM argument is a predicate built from
+the additional filters.
+"""
+
+import copy
+from typing import TYPE_CHECKING
+
+import pydough.pydough_operators as pydop
+from pydough.qdag import Literal
+from pydough.types import BooleanType, NumericType
+
+from .hybrid_connection import ConnectionType
+from .hybrid_expressions import (
+    HybridChildRefExpr,
+    HybridExpr,
+    HybridFunctionExpr,
+    HybridLiteralExpr,
+    make_condition,
+)
+from .hybrid_operations import (
+    HybridCalculate,
+    HybridFilter,
+    HybridLimit,
+)
+from .hybrid_tree import HybridTree
+
+if TYPE_CHECKING:
+    from .hybrid_translator import HybridTranslator
+
+
+class HybridFilterMerger:
+    """
+    Transformation that merges child subtrees of a hybrid tree when they are
+    identical except that some of them have extra filters and are only used
+    in a COUNT aggregation, since the extra filters can be emulated as a SUM
+    over a predicate on the less-filtered subtree.
+    """
+
+    def __init__(self, translator: "HybridTranslator") -> None:
+        self.translator: HybridTranslator = translator
+
+    def merge_filters(self, tree: HybridTree) -> None:
+        """
+        The main entry point that runs the filter merging procedure on the
+        given tree with regard to its children, then recursively invokes the
+        same procedure on the rest of the tree.
+        """
+        # Keep a set of all children that are marked for certain deletion.
+        must_delete: set[int] = set()
+
+        # Run the main procedure on subtrees with multiple children.
+        if len(tree.children) > 1:
+            # Identify which children are aggregations whose only aggregate is
+            # a single COUNT(*) (including the ONLY_MATCH variant).
+            mergeable_children: set[int] = self.identify_mergeable_children(tree)
+
+            # Extract the set of filters in the bottom level of each child tree,
+            # only considering filters after a critical point (limits, windows).
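+            # For example (borrowing from this PR's tests), given children
+            #   n1=COUNT(customers.WHERE(MONOTONIC(500, account_balance, 600)))
+            #   n2=COUNT(customers.WHERE((market_segment == 'BUILDING')
+            #                            & MONOTONIC(500, account_balance, 600)))
+            # the extracted sets are {MONOTONIC(...)} for n1 versus
+            # {market_segment == 'BUILDING', MONOTONIC(...)} for n2, so n2's
+            # child can be folded into n1's as
+            # SUM(IFF(market_segment == 'BUILDING', 1, 0)).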
+            child_filters: list[set[HybridExpr]] = [
+                self.get_final_filters(child.subtree) for child in tree.children
+            ]
+
+            # Obtain a mapping from each child to the set of all other children
+            # in the tree that are isomorphic to it, ignoring filters after
+            # the critical point.
+            child_isomorphisms: list[set[int]] = self.get_child_isomorphisms(tree)
+
+            # Create a DAG mapping each mergeable child to another child that
+            # it is isomorphic to and whose filters are a strict subset of its
+            # own, if such a child exists, otherwise None. Path compression is
+            # used to ensure there is no daisy chain.
+            filter_dag: list[int | None] = self.make_filter_dag(
+                mergeable_children, child_filters, child_isomorphisms
+            )
+
+            # Create a secondary mapping to indicate pools of children that were
+            # not merged by the DAG because there was no child with a filter
+            # subset relationship, but are still isomorphic to one another.
+            # These are stored in the form of a pool of isomorphic children,
+            # where one member of the pool is the key and the rest are the
+            # value.
+            secondary_merges: dict[int, set[int]] = self.make_secondary_merges(
+                mergeable_children, child_isomorphisms, filter_dag
+            )
+
+            # Build up a dictionary indicating all COUNT(*) references in the
+            # tree that have been replaced with a SUM(cond) reference in a
+            # different child of the tree.
+            replacement_map: dict[HybridExpr, HybridExpr] = {}
+
+            # For each pair (source -> target) in the filter DAG, run the
+            # basic merging procedure.
+            for source_idx, target_idx in enumerate(filter_dag):
+                # Make sure the source maps to a target, as opposed to None.
+                if target_idx is None:
+                    continue
+                # Identify all the filters in the source that are not in the
+                # target, and vice versa. There should be at least 1 extra
+                # filter in the source, but no extra filters in the target
+                # since it is a subset relationship.
+                extra_source_filters: set[HybridExpr] = (
+                    child_filters[source_idx] - child_filters[target_idx]
+                )
+                extra_target_filters: set[HybridExpr] = (
+                    child_filters[target_idx] - child_filters[source_idx]
+                )
+                assert len(extra_source_filters) > 0 and len(extra_target_filters) == 0
+                # Run the merge subset filter procedure for this source and
+                # target, updating the replacement map and deletion set.
+                self.merge_subset_filters(
+                    tree,
+                    source_idx,
+                    target_idx,
+                    extra_source_filters,
+                    replacement_map,
+                    must_delete,
+                )
+
+            # For each (target <- source_pool), run the more advanced algorithm
+            # which combines multiple children with distinct sets of filters.
+            for target_idx, source_idxs in secondary_merges.items():
+                self.merge_partial_disjoint_filters(
+                    tree,
+                    target_idx,
+                    source_idxs,
+                    child_filters,
+                    replacement_map,
+                    must_delete,
+                )
+
+            # Replace all of the COUNT(*) terms in the current tree from a
+            # remapped child with the new SUM expression.
+            for operation in tree.pipeline:
+                operation.replace_expressions(replacement_map)
+
+        # Before moving on, we need to remove any dead children from the tree.
+        tree.remove_dead_children(must_delete)
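+        # (The merging loops above only mark children in `must_delete`; the
+        # actual removal is done in a single pass here so that child indices
+        # stay stable while the DAG and secondary pools are processed.)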
+
+        # Run the procedure recursively on the parent tree and the child
+        # subtrees.
+        if tree.parent is not None:
+            self.merge_filters(tree.parent)
+        for child in tree.children:
+            self.merge_filters(child.subtree)
+
+    def merge_subset_filters(
+        self,
+        tree: HybridTree,
+        source_idx: int,
+        target_idx: int,
+        extra_source_filters: set[HybridExpr],
+        replacement_map: dict[HybridExpr, HybridExpr],
+        must_delete: set[int],
+    ) -> None:
+        """
+        Run the merging procedure on a source and target child where the source
+        has a strict superset of filters compared to the target, and the same
+        underlying aggregation structure, meaning the source can be merged into
+        the target.
+
+        Args:
+            `tree`: The tree whose children are being merged.
+            `source_idx`: The index of the source child that is being merged.
+            `target_idx`: The index of the target child that is being merged
+            into.
+            `extra_source_filters`: The set of filters in the source child that
+            are not in the target child.
+            `replacement_map`: A mapping that must be updated to map any
+            references to the old source child to the new reference in the
+            target child that they should be remapped to.
+            `must_delete`: A set of child indices that must be updated to
+            include the source child index, since it will be merged into the
+            target and therefore removed.
+        """
+        # Build a new aggregation SUM(IFF(conj, 1, 0)) where conj is the
+        # conjunction of all the extra filters from the source subtree.
+        new_cond: HybridExpr = make_condition(extra_source_filters, True)
+        numeric_expr: HybridExpr = HybridFunctionExpr(
+            pydop.IFF,
+            [
+                new_cond,
+                HybridLiteralExpr(Literal(1, NumericType())),
+                HybridLiteralExpr(Literal(0, NumericType())),
+            ],
+            NumericType(),
+        )
+        sum_expr: HybridFunctionExpr = HybridFunctionExpr(
+            pydop.SUM,
+            [numeric_expr],
+            NumericType(),
+        )
+        # Insert the new aggregation into the target subtree, and update the
+        # replacement map to point from the old COUNT(*) reference in the source
+        # subtree to the new SUM expression reference in the target subtree.
+        agg_name: str = self.translator.gen_agg_name(tree.children[target_idx])
+        tree.children[target_idx].aggs[agg_name] = sum_expr
+        agg_ref: HybridExpr = HybridChildRefExpr(agg_name, target_idx, NumericType())
+        old_agg_ref = HybridChildRefExpr(
+            next(
+                name
+                for name, expr in tree.children[source_idx].aggs.items()
+                if repr(expr) == "COUNT()"
+            ),
+            source_idx,
+            NumericType(),
+        )
+        replacement_map[old_agg_ref] = agg_ref
+
+        # Update the min/max steps of the target subtree to indicate overlap
+        # with the source subtree.
+        tree.children[target_idx].max_steps = min(
+            tree.children[target_idx].max_steps,
+            tree.children[source_idx].max_steps,
+        )
+        tree.children[target_idx].min_steps = min(
+            tree.children[target_idx].min_steps,
+            tree.children[source_idx].min_steps,
+        )
+
+        # Add a new filter for the extra conditions from the source
+        # subtree if it was an ONLY_MATCH, checking whether the SUM
+        # is not zero, indicating that there was a match.
+        if (
+            tree.children[source_idx].connection_type
+            == ConnectionType.AGGREGATION_ONLY_MATCH
+        ):
+            tree.add_operation(
+                HybridFilter(
+                    tree.pipeline[-1],
+                    HybridFunctionExpr(
+                        pydop.NEQ,
+                        [agg_ref, HybridLiteralExpr(Literal(0, NumericType()))],
+                        BooleanType(),
+                    ),
+                )
+            )
+
+        # Finally, mark the source child for deletion since it has now been
+        # merged into the target child, unless they are the same child (see
+        # the special case in `merge_partial_disjoint_filters`).
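+        # (When `source_idx == target_idx`, the child's own COUNT(*) is being
+        # rewritten in place as a SUM, so the child must not be deleted.)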
+        if source_idx != target_idx:
+            must_delete.add(source_idx)
+
+    def merge_partial_disjoint_filters(
+        self,
+        tree: HybridTree,
+        target_idx: int,
+        source_idxs: set[int],
+        all_filters: list[set[HybridExpr]],
+        replacement_map: dict[HybridExpr, HybridExpr],
+        must_delete: set[int],
+    ) -> None:
+        """
+        Run the merging procedure on a pool of multiple source children that
+        are isomorphic to the target child, but where there is no subset
+        relationship. This is done by transforming the target child so its
+        filter condition is a disjunction of all the filters from the source
+        children, then turning the COUNT(*) call of each target/source child
+        into a SUM over the filters of that specific child that are not
+        shared by all of the other children.
+
+        Args:
+            `tree`: The tree whose children are being merged.
+            `target_idx`: The index of the target child that is being merged
+            into.
+            `source_idxs`: The set of indices of the source children that are
+            being merged into the target.
+            `all_filters`: A list of the sets of filters in each child subtree
+            after the critical point.
+            `replacement_map`: A mapping that must be updated to map any
+            references to the old source children to the new references in the
+            target child that they should be remapped to.
+            `must_delete`: A set of child indices that must be updated to
+            include the source child indices, since they will be merged into the
+            target and therefore removed.
+        """
+        # Identify any filters that are in all the children, since these can be
+        # ignored when creating the new conditions for the aggregations.
+        intersection = set.intersection(
+            *(all_filters[source_idx] for source_idx in source_idxs),
+            all_filters[target_idx],
+        )
+
+        # For each of the source children, merge it onto the target child as if
+        # it were a subset merge.
+        for source_idx in sorted(source_idxs):
+            extra_source_filters: set[HybridExpr] = (
+                all_filters[source_idx] - intersection
+            )
+            self.merge_subset_filters(
+                tree,
+                source_idx,
+                target_idx,
+                extra_source_filters,
+                replacement_map,
+                must_delete,
+            )
+
+        # Merge the target child onto itself using the extra filters it has, so
+        # that its own COUNT(*) is replaced with a SUM over the filters that it
+        # has that are not in all the other children.
+        extra_target_filters: set[HybridExpr] = all_filters[target_idx] - intersection
+        self.merge_subset_filters(
+            tree,
+            target_idx,
+            target_idx,
+            extra_target_filters,
+            replacement_map,
+            must_delete,
+        )
+
+        # Build up a list of the conjunctions for each source child, which will
+        # be used to create the new disjunctive condition for the target
+        # subtree.
+        source_conjunctions: list[HybridExpr] = []
+        for source_idx in sorted(source_idxs):
+            source_cond: HybridExpr = make_condition(all_filters[source_idx], True)
+            source_conjunctions.append(source_cond)
+
+        # Build a disjunction of the conjunctions from the source children.
+        new_disjunction: HybridExpr = make_condition(source_conjunctions, False)
+
+        # Now go back through the target subtree, find any existing filters
+        # after any window/limit, and OR each of them with the disjunction of
+        # source conjunctions.
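+        # This is sound because OR-ing the new disjunction D into each
+        # conjunct individually preserves the overall condition:
+        # AND_i(f_i | D) == (AND_i f_i) | D, so rows matching any source's
+        # full filter set are retained alongside the target's own rows.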
+        for operation in reversed(tree.children[target_idx].subtree.pipeline):
+            if isinstance(operation, HybridFilter):
+                if operation.condition.contains_window_functions():
+                    break
+                operation.condition = HybridFunctionExpr(
+                    pydop.BOR,
+                    [operation.condition, new_disjunction],
+                    BooleanType(),
+                )
+            elif isinstance(operation, HybridLimit):
+                break
+            elif isinstance(operation, HybridCalculate):
+                if any(
+                    expr.contains_window_functions()
+                    for expr in operation.new_expressions.values()
+                ):
+                    break
+
+    def identify_mergeable_children(self, tree: HybridTree) -> set[int]:
+        """
+        Identify the subset of child indices from a hybrid tree where the child
+        is an aggregation whose only aggregate is a single COUNT(*).
+
+        Args:
+            `tree`: The tree whose children we are checking.
+
+        Returns:
+            A set of the indices of the children that are only used by a COUNT
+            aggregation, including the ONLY_MATCH variant.
+        """
+        return {
+            idx
+            for idx, child in enumerate(tree.children)
+            if (
+                child.connection_type
+                in (ConnectionType.AGGREGATION, ConnectionType.AGGREGATION_ONLY_MATCH)
+                and {repr(v) for v in child.aggs.values()} == {"COUNT()"}
+            )
+        }
+
+    def get_final_filters(self, tree: HybridTree) -> set[HybridExpr]:
+        """
+        Identify the set of all filter conditions that appear in the current
+        tree's pipeline after any critical points (limits or window functions).
+
+        Args:
+            `tree`: The tree whose pipeline we are checking.
+
+        Returns:
+            The set of filters.
+        """
+        result: set[HybridExpr] = set()
+        for operation in reversed(tree.pipeline):
+            if isinstance(operation, HybridFilter):
+                if operation.condition.contains_window_functions():
+                    break
+                result.update(operation.condition.get_conjunction())
+            elif isinstance(operation, HybridLimit):
+                break
+            elif isinstance(operation, HybridCalculate):
+                if any(
+                    expr.contains_window_functions()
+                    for expr in operation.new_expressions.values()
+                ):
+                    break
+        return result
+
+    def get_child_isomorphisms(self, tree: HybridTree) -> list[set[int]]:
+        """
+        Return a data structure mapping each child index to the set of all
+        other child indices that have the same canonical form after stripping
+        away all filters after any critical points.
+
+        Args:
+            `tree`: The tree whose children we are checking.
+
+        Returns:
+            A list where the i'th element is the set of all other child indices
+            that are isomorphic to the i'th child after stripping away all
+            filters after any critical points.
+        """
+        # Extract the canonical forms
+        filter_stripped_forms: list[str] = [
+            self.get_filter_stripped_form(child.subtree) for child in tree.children
+        ]
+        result: list[set[int]] = []
+        for i, form in enumerate(filter_stripped_forms):
+            alternatives: set[int] = set()
+            for j, other_form in enumerate(filter_stripped_forms):
+                if i != j and form == other_form:
+                    alternatives.add(j)
+            result.append(alternatives)
+        return result
+
+    def get_filter_stripped_form(self, tree: HybridTree) -> str:
+        """
+        Create a canonical string representation of the tree structure for the
+        hybrid tree after stripping away all filters after any critical
+        points (limits or window functions). Also includes the join keys, so
+        as to ensure that the canonical form reflects the same join
+        conditions.
+
+        Args:
+            `tree`: The tree whose canonical form we are computing.
+
+        Returns:
+            The canonical form as a string.
+        """
+        # Make a clone of the tree
+        stripped_tree = copy.deepcopy(tree)
+
+        # Go backwards in the tree pipeline and remove all filters until
+        # reaching a window function or limit.
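+        # (Iterating over a reversed snapshot of enumerate() lets us pop by
+        # index without disturbing the positions of the entries that have not
+        # been visited yet.)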
+ for idx, operation in reversed(list(enumerate(stripped_tree.pipeline))): + if isinstance(operation, HybridFilter): + if operation.condition.contains_window_functions(): + break + stripped_tree.pipeline.pop(idx) + elif isinstance(operation, HybridLimit): + break + elif isinstance(operation, HybridCalculate): + if any( + expr.contains_window_functions() + for expr in operation.new_expressions.values() + ): + break + + # Return the string form of the transformed tree along with its + # join keys. + return repr(stripped_tree) + f" {stripped_tree.join_keys}" + + def make_filter_dag( + self, + mergeable_children: set[int], + child_filters: list[set[HybridExpr]], + child_isomorphisms: list[set[int]], + ) -> list[int | None]: + """ + Create a DAG mapping each child onto another child in the subtree such + that the source child is mergeable, the target child has a subset of the + filters of the source child, and the two children are isomorphic after + stripping away filters. If no such mapping exists for a child, it maps to + None. Path compression is used to ensure there are no daisy chains, so + that if A maps to B and B maps to C, then A will map directly to C. + + Args: + `mergeable_children`: The set of child indices that are mergeable. + `child_filters`: A list of the sets of filters in each child subtree + after the critical point. + `child_isomorphisms`: A list where the i'th element is the set of all + other child indices that are isomorphic to the i'th child after + stripping away all filters after any critical points. + + Returns: + A list where the i'th element is either None if there is no child + that the i'th child can be merged into, or the index of a child that + the i'th child can be merged into, meaning that the i'th child has a + strict superset of filters compared to that child, and they are + isomorphic after stripping away filters. + """ + # Build up the initial DAG as all-None, then fill in as connections + # are formed. + dag: list[int | None] = [None for _ in range(len(child_filters))] + + # Build initial edges from each mergeable child to another isomorphic + # child that is a subset of its filter list. + for idx in mergeable_children: + for other_idx in sorted(child_isomorphisms[idx]): + if child_filters[other_idx] < child_filters[idx]: + dag[idx] = other_idx + break + + # Collapse transitive edges with path compression. + for idx in range(len(dag)): + if dag[idx] is not None: + while True: + target_idx: int | None = dag[idx] + if target_idx is None or dag[target_idx] is None: + break + dag[idx] = dag[target_idx] + return dag + + def make_secondary_merges( + self, + mergeable_children: set[int], + child_isomorphisms: list[set[int]], + filter_dag: list[int | None], + ) -> dict[int, set[int]]: + """ + Form the datastructure for the secondary merges, which is a mapping + from a child index serving as a target, to a set of child indices + serving as a pool of sources to merge into it, where the target and + all sources must be mergeable, isomorphic, and not be used as a + source or sink in the DAG. + + Args: + `mergeable_children`: The set of child indices that are mergeable. + `child_isomorphisms`: A list where the i'th element is the set of + all other child indices that are isomorphic to the i'th child after + stripping away all filters after any critical points. 
+            `filter_dag`: A list where the i'th element is either None if there
+            is no child that the i'th child can be merged into, or the index of
+            a child that the i'th child can be merged into, meaning that the
+            i'th child has a strict superset of filters compared to that child,
+            and they are isomorphic after stripping away filters.
+
+        Returns:
+            A mapping from a child index serving as a target, to a set of child
+            indices serving as a pool of sources to merge into it, where the
+            target and all sources must be mergeable, isomorphic, and not be
+            used as a source or sink in the DAG.
+        """
+        secondary_merges: dict[int, set[int]] = {}
+
+        # Form secondary edges between island nodes that are not subsets of
+        # one another but where both of them are mergeable, and neither one is
+        # the sink of an edge yet, or has been used as a source yet.
+        existing_sinks: set[int | None] = set(filter_dag)
+        already_merged: set[int] = set()
+        for idx in mergeable_children:
+            for other_idx in sorted(child_isomorphisms[idx]):
+                if (
+                    other_idx in mergeable_children
+                    and filter_dag[idx] is None
+                    and filter_dag[other_idx] is None
+                    and idx not in existing_sinks
+                    and other_idx not in existing_sinks
+                    and other_idx not in secondary_merges
+                    and other_idx not in already_merged
+                ):
+                    secondary_merges[idx] = secondary_merges.get(idx, set())
+                    secondary_merges[idx].add(other_idx)
+                    already_merged.add(other_idx)
+
+        return secondary_merges
diff --git a/pydough/conversion/hybrid_translator.py b/pydough/conversion/hybrid_translator.py
index 64c56bad1..09b1d0ea7 100644
--- a/pydough/conversion/hybrid_translator.py
+++ b/pydough/conversion/hybrid_translator.py
@@ -65,6 +65,7 @@
     HybridSidedRefExpr,
     HybridWindowExpr,
 )
+from .hybrid_filter_merger import HybridFilterMerger
 from .hybrid_operations import (
     HybridCalculate,
     HybridCollectionAccess,
@@ -1701,6 +1702,19 @@ def run_hybrid_decorrelation(self, hybrid: "HybridTree") -> None:
         decorr.find_correlated_children(hybrid)
         decorr.decorrelate_hybrid_tree(hybrid)
 
+    def run_filter_merging(self, hybrid: "HybridTree") -> None:
+        """
+        Invokes the procedure to merge child subtrees of the hybrid tree that
+        are identical except for the filters they have, since the extra
+        filters can be emulated via a SUM on a predicate. The transformation
+        is done in-place.
+
+        Args:
+            `hybrid`: The hybrid tree to run filter merging on.
+        """
+        filter_merger: HybridFilterMerger = HybridFilterMerger(self)
+        filter_merger.merge_filters(hybrid)
+
     def convert_qdag_to_hybrid(self, node: PyDoughCollectionQDAG) -> HybridTree:
         """
         Convert a PyDough QDAG node to a hybrid tree, including any necessary
@@ -1725,10 +1739,11 @@ def convert_qdag_to_hybrid(self, node: PyDoughCollectionQDAG) -> HybridTree:
         self.run_correlation_extraction(hybrid)
         # 5. Run the de-correlation procedure.
         self.run_hybrid_decorrelation(hybrid)
-        # 6. Run any final rewrites, such as turning MEDIAN into an average
+        # 6. Run the filter-merging procedure, then re-run ejecting aggregate
+        #    inputs to clean up any new aggregates created by filter merging.
+        self.run_filter_merging(hybrid)
+        self.eject_aggregate_inputs(hybrid)
+        # 7. Run any final rewrites, such as turning MEDIAN into an average
         #    of the 1-2 median rows, that must happen after de-correlation.
         self.run_rewrites(hybrid)
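+        # (The explicit dead-child cleanup that used to happen here is no
+        # longer needed: `merge_filters` already prunes merged children via
+        # `remove_dead_children`.)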
-        # 7. Remove any dead children in the hybrid tree that are no longer
-        #    being used.
-        hybrid.remove_dead_children(set())
         return hybrid
diff --git a/pydough/conversion/hybrid_tree.py b/pydough/conversion/hybrid_tree.py
index aeea1bf38..87a043d59 100644
--- a/pydough/conversion/hybrid_tree.py
+++ b/pydough/conversion/hybrid_tree.py
@@ -976,7 +976,7 @@ def remove_dead_children(self, must_remove: set[int]) -> dict[int, int]:
                 continue
             if (
                 self.children[child_idx].connection_type.is_semi
-                and not self.children[child_idx].subtree.always_exists()
+                and not self.children[child_idx].get_always_exists()
             ) or self.children[child_idx].connection_type.is_anti:
                 children_to_delete.discard(child_idx)
 
diff --git a/pydough/conversion/relational_simplification.py b/pydough/conversion/relational_simplification.py
index 45a40cd16..7b46278e1 100644
--- a/pydough/conversion/relational_simplification.py
+++ b/pydough/conversion/relational_simplification.py
@@ -870,8 +870,8 @@ def simplify_function_call(
                     output_predicates.positive = True
                     output_expr = CallExpression(pydop.COUNT, expr.data_type, [])
 
-            # All of these operators are non-null or non-negative if their
-            # first argument is.
+            # All of these operators are non-null, non-negative, or positive if
+            # their first argument is.
             case (
                 pydop.SUM
                 | pydop.AVG
@@ -882,8 +882,28 @@ def simplify_function_call(
                 | pydop.QUANTILE
            ):
                output_predicates |= arg_predicates[0] & PredicateSet(
-                    not_null=True, not_negative=True
+                    not_null=True,
+                    not_negative=True,
+                    positive=True,
                )
+                if expr.op == pydop.SUM:
+                    if (
+                        isinstance(expr.inputs[0], CallExpression)
+                        and expr.inputs[0].op == pydop.IFF
+                    ):
+                        # SUM(IFF(cond, 1, 0)) -> SUM(cond)
+                        cond_arg: RelationalExpression = expr.inputs[0].inputs[0]
+                        first_arg: RelationalExpression = expr.inputs[0].inputs[1]
+                        second_arg: RelationalExpression = expr.inputs[0].inputs[2]
+                        if (
+                            isinstance(first_arg, LiteralExpression)
+                            and first_arg.value in (1, 1.0, True)
+                            and isinstance(second_arg, LiteralExpression)
+                            and second_arg.value in (0, 0.0, False)
+                        ):
+                            output_expr = CallExpression(
+                                pydop.SUM, expr.data_type, [cond_arg]
+                            )
 
             # INTEGER(x) -> x if x is a literal integer. Also simplify for
             # booleans.
@@ -1144,7 +1164,69 @@ def simplify_function_call(
                 for arg in expr.inputs
             ):
                 output_expr = LiteralExpression(True, expr.data_type)
-                output_predicates.not_negative = True
+            # If any two of the arguments are opposites, replace the entire
+            # AND expression with False. For example:
+            # (x >= y) & (x < y) -> False
+            for i in range(len(expr.inputs)):
+                for j in range(i + 1, len(expr.inputs)):
+                    if are_opposites(expr.inputs[i], expr.inputs[j]):
+                        output_expr = LiteralExpression(False, expr.data_type)
+
+            # If all the arguments are ORs that have overlap, create an OR
+            # between the overlapping part and the rest of the arguments
+            # AND-ed together.
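+            # (This is the distributive law: AND_i(C | A_i) == C | AND_i(A_i),
+            # where C is the shared part and A_i is the remainder of the i'th
+            # argument.)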
+            # For example: (x | y) & (x | z) -> x | (y & z)
+            if all(
+                isinstance(arg, CallExpression) and arg.op == pydop.BOR
+                for arg in expr.inputs
+            ):
+                common_args: set[RelationalExpression] = set.intersection(
+                    *[
+                        set(arg.inputs)
+                        for arg in expr.inputs
+                        if isinstance(arg, CallExpression)
+                    ]
+                )
+                if len(common_args) > 0:
+                    common_arg_expr: RelationalExpression
+                    if len(common_args) == 1:
+                        common_arg_expr = next(iter(common_args))
+                    else:
+                        common_arg_expr = CallExpression(
+                            pydop.BOR, expr.data_type, list(common_args)
+                        )
+                    # Collect the remainder of each argument: the OR of its
+                    # non-common terms. An argument with no remainder is
+                    # exactly the common part, so the whole AND collapses to
+                    # it by absorption.
+                    remainders: list[RelationalExpression] = []
+                    collapsed: bool = False
+                    for arg in expr.inputs:
+                        assert isinstance(arg, CallExpression)
+                        rest_args: list[RelationalExpression] = list(
+                            set(arg.inputs) - common_args
+                        )
+                        if len(rest_args) == 0:
+                            collapsed = True
+                            break
+                        elif len(rest_args) == 1:
+                            remainders.append(rest_args[0])
+                        else:
+                            remainders.append(
+                                CallExpression(pydop.BOR, expr.data_type, rest_args)
+                            )
+
+                    remainder_expr: RelationalExpression
+                    if collapsed:
+                        output_expr = common_arg_expr
+                    else:
+                        if len(remainders) == 1:
+                            remainder_expr = next(iter(remainders))
+                        else:
+                            remainder_expr = CallExpression(
+                                pydop.BAN, expr.data_type, remainders
+                            )
+                        output_expr = CallExpression(
+                            pydop.BOR,
+                            expr.data_type,
+                            [common_arg_expr, remainder_expr],
+                        )
+
+            output_predicates.not_negative = True
 
         # X | Y is True if any of the arguments are Truth-y literals, and False
         # if all of the arguments are False-y literals.
@@ -1160,6 +1242,70 @@ def simplify_function_call(
                 for arg in expr.inputs
             ):
                 output_expr = LiteralExpression(False, expr.data_type)
+            # If any two of the arguments are opposites, replace the entire
+            # OR expression with True. For example:
+            # (x >= y) | (x < y) -> True
+            for i in range(len(expr.inputs)):
+                for j in range(i + 1, len(expr.inputs)):
+                    if are_opposites(expr.inputs[i], expr.inputs[j]):
+                        output_expr = LiteralExpression(True, expr.data_type)
+                        output_predicates.positive = True
+                        break
+
+            # If all the arguments are ANDs that have overlap, create an AND
+            # between the overlapping part and the rest of the arguments
+            # OR-ed together.
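+            # (Dual of the factoring above, via the distributive law:
+            # OR_i(C & A_i) == C & OR_i(A_i).)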
+            # For example: (x & y) | (x & z) -> x & (y | z)
+            if all(
+                isinstance(arg, CallExpression) and arg.op == pydop.BAN
+                for arg in expr.inputs
+            ):
+                common_args = set.intersection(
+                    *[
+                        set(arg.inputs)
+                        for arg in expr.inputs
+                        if isinstance(arg, CallExpression)
+                    ]
+                )
+                if len(common_args) > 0:
+                    if len(common_args) == 1:
+                        common_arg_expr = next(iter(common_args))
+                    else:
+                        common_arg_expr = CallExpression(
+                            pydop.BAN, expr.data_type, list(common_args)
+                        )
+                    # Collect the remainder of each argument: the AND of its
+                    # non-common terms. An argument with no remainder is
+                    # exactly the common part, so the whole OR collapses to
+                    # it by absorption.
+                    remainders = []
+                    collapsed = False
+                    for arg in expr.inputs:
+                        assert isinstance(arg, CallExpression)
+                        rest_args = list(set(arg.inputs) - common_args)
+                        if len(rest_args) == 0:
+                            collapsed = True
+                            break
+                        elif len(rest_args) == 1:
+                            remainders.append(rest_args[0])
+                        else:
+                            remainders.append(
+                                CallExpression(pydop.BAN, expr.data_type, rest_args)
+                            )
+
+                    if collapsed:
+                        output_expr = common_arg_expr
+                    else:
+                        if len(remainders) == 1:
+                            remainder_expr = next(iter(remainders))
+                        else:
+                            remainder_expr = CallExpression(
+                                pydop.BOR, expr.data_type, remainders
+                            )
+                        output_expr = CallExpression(
+                            pydop.BAN,
+                            expr.data_type,
+                            [common_arg_expr, remainder_expr],
+                        )
+
+            output_predicates.not_negative = True
 
             # NOT(x) is True if x is a False-y literal, and False if x is a
@@ -1741,3 +1887,47 @@ def simplify_expressions(
     """
     simplifier: SimplificationVisitor = SimplificationVisitor(session)
     node.accept(simplifier)
+
+
+def are_opposites(expr1: RelationalExpression, expr2: RelationalExpression) -> bool:
+    """
+    Helper function to determine whether two expressions are opposites of each
+    other. For example, x >= y and x < y are opposites, as are x = y and
+    x != y.
+
+    Args:
+        `expr1`: The first expression to compare.
+        `expr2`: The second expression to compare.
+
+    Returns:
+        A boolean indicating whether the two expressions are opposites of each
+        other.
+    """
+    if not (isinstance(expr1, CallExpression) and isinstance(expr2, CallExpression)):
+        return False
+    match (expr1.op, expr2.op):
+        # e.g. (a = b) is the opposite of (a != b)
+        case (pydop.EQU, pydop.NEQ) | (pydop.NEQ, pydop.EQU):
+            return set(expr1.inputs) == set(expr2.inputs)
+
+        # e.g. (a < b) is the opposite of (a >= b)
+        case (
+            (pydop.LET, pydop.GEQ)
+            | (pydop.LEQ, pydop.GRT)
+            | (pydop.GEQ, pydop.LET)
+            | (pydop.GRT, pydop.LEQ)
+        ):
+            return expr1.inputs == expr2.inputs
+
+        # e.g. (a < b) is the opposite of (b <= a)
+        case (
+            (pydop.LET, pydop.LEQ)
+            | (pydop.LEQ, pydop.LET)
+            | (pydop.GEQ, pydop.GRT)
+            | (pydop.GRT, pydop.GEQ)
+        ):
+            return expr1.inputs == list(reversed(expr2.inputs))
+
+        # TODO: add more cases
+        case _:
+            return False
diff --git a/pydough/relational/relational_expressions/call_expression.py b/pydough/relational/relational_expressions/call_expression.py
index 4802404c0..758cfb8d7 100644
--- a/pydough/relational/relational_expressions/call_expression.py
+++ b/pydough/relational/relational_expressions/call_expression.py
@@ -48,7 +48,7 @@ def __init__(
         # If the operator is a commutative operation (AND, OR, EQUAL, NOT-EQUAL)
-        # first sort the inputs to normalize them.
+        # first sort the inputs, dropping any duplicates, to normalize them.
if self.op in (BAN, BOR, EQU, NEQ): - self._inputs.sort(key=repr) + self._inputs = sorted(set(self._inputs), key=repr) @property def op(self) -> PyDoughExpressionOperator: diff --git a/tests/conftest.py b/tests/conftest.py index bea0969f9..7f75ca8ed 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -683,7 +683,7 @@ def _impl(database_name: str) -> DatabaseContext: return _impl -S3_DATASETS = ["synthea", "world_development_indicators", "menu"] +S3_DATASETS = ["synthea", "world_development_indicators", "menu", "donor", "movielens"] """ Contains the name of all the custom datasets that will be used for testing. This includes the datasets from S3 and initialized with a .sql file. diff --git a/tests/test_masked_sf.py b/tests/test_masked_sf.py index 6ab47728f..6d288e06f 100644 --- a/tests/test_masked_sf.py +++ b/tests/test_masked_sf.py @@ -1264,19 +1264,18 @@ def test_pipeline_e2e_masked_sf( [ { "DRY_RUN", - "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'MONTH', 1, '__col__', 6]", "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", - "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970, 'EQUAL', 2, 'MONTH', 1, '__col__', 6]", "bodo/fsi/protected_customers/dob: ['AND', 3, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'DAY', 1, '__col__', 15]", "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'MONTH', 1, '__col__', 6]", "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + "bodo/fsi/protected_customers/dob: ['OR', 2, 'AND', 2, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'AND', 2, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", }, { - "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'MONTH', 1, '__col__', 6]", - "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", - "bodo/fsi/protected_customers/dob: ['AND', 2, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", - "bodo/fsi/protected_customers/dob: ['AND', 3, 'EQUAL', 2, 'DAY', 1, '__col__', 15, 'EQUAL', 2, 'MONTH', 1, '__col__', 6, 'EQUAL', 2, 'YEAR', 1, '__col__', 1970]", + "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'DAY', 1, '__col__', 15]", + "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'MONTH', 1, '__col__', 6]", + "bodo/fsi/protected_customers/dob: ['EQUAL', 2, 'YEAR', 1, '__col__', 1970]", }, ], id="fsi_customers_b", diff --git a/tests/test_masked_sqlite.py b/tests/test_masked_sqlite.py index e2383fc5e..0dea9ab83 100644 --- a/tests/test_masked_sqlite.py +++ b/tests/test_masked_sqlite.py @@ -1475,6 +1475,7 @@ def test_pipeline_e2e_cryptbank( "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['OR', 5, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o', 'CONTAINS', 2, '__col__', 'u']", "DRY_RUN", }, { @@ -1519,6 +1520,7 @@ def test_pipeline_e2e_cryptbank( 
"CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['OR', 5, 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'u', 'AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i', 'AND', 2, 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o', 'AND', 2, 'CONTAINS', 2, '__col__', 'o', 'CONTAINS', 2, '__col__', 'u']", "DRY_RUN", }, { @@ -1549,10 +1551,6 @@ def test_pipeline_e2e_cryptbank( + ")", [ { - "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", - "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'i']", - "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'o']", - "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'u']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", @@ -1561,8 +1559,8 @@ def test_pipeline_e2e_cryptbank( "DRY_RUN", }, { - "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", + "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", @@ -1594,6 +1592,7 @@ def test_pipeline_e2e_cryptbank( "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['OR', 4, 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'i', 'AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i', 'AND', 2, 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'u']", "DRY_RUN", }, { @@ -1624,16 +1623,24 @@ def test_pipeline_e2e_cryptbank( "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']", "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o']", "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o', 'CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'o']", + "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'o', 'CONTAINS', 2, '__col__', 'u']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'a']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']", "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']", + "CRBNK/CUSTOMERS/c_fname: ['OR', 3, 'AND', 3, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i', 'AND', 3, 'CONTAINS', 2, '__col__', 'e', 
'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o', 'AND', 3, 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o', 'CONTAINS', 2, '__col__', 'u']",
                 "DRY_RUN",
             },
             {
+                "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']",
+                "CRBNK/CUSTOMERS/c_fname: ['AND', 2, 'CONTAINS', 2, '__col__', 'o', 'CONTAINS', 2, '__col__', 'u']",
                 "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']",
                 "CRBNK/CUSTOMERS/c_fname: ['AND', 3, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i', 'CONTAINS', 2, '__col__', 'o']",
+                "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'e']",
                 "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'i']",
                 "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']",
                 "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'u']",
@@ -1665,6 +1672,7 @@ def test_pipeline_e2e_cryptbank(
                 "CRBNK/CUSTOMERS/c_fname: ['CONTAINS', 2, '__col__', 'o']",
                 "CRBNK/CUSTOMERS/c_fname: ['NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e']",
                 "CRBNK/CUSTOMERS/c_fname: ['NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']",
+                "CRBNK/CUSTOMERS/c_fname: ['OR', 2, 'AND', 2, 'CONTAINS', 2, '__col__', 'i', 'NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'a', 'CONTAINS', 2, '__col__', 'e', 'AND', 2, 'CONTAINS', 2, '__col__', 'o', 'NOT', 1, 'AND', 2, 'CONTAINS', 2, '__col__', 'e', 'CONTAINS', 2, '__col__', 'i']",
                 "DRY_RUN",
             },
             {
diff --git a/tests/test_metadata/masked_graphs.json b/tests/test_metadata/masked_graphs.json
index 7ff27c758..8f61d3508 100644
--- a/tests/test_metadata/masked_graphs.json
+++ b/tests/test_metadata/masked_graphs.json
@@ -384,7 +384,7 @@
             "original parent": "branches",
             "original property": "same_state_customers",
             "singular": false,
-            "always matches": true,
+            "always matches": false,
             "description": "All branches located in the same state as the customer",
             "synonyms": ["branches in same state", "local branches"]
         }
diff --git a/tests/test_pipeline_s3_datasets.py b/tests/test_pipeline_s3_datasets.py
index 9570029f2..e23be6a79 100644
--- a/tests/test_pipeline_s3_datasets.py
+++ b/tests/test_pipeline_s3_datasets.py
@@ -137,6 +137,60 @@
         ),
         id="menu_5556",
     ),
+    pytest.param(
+        PyDoughPandasTest(
+            """
+result = donor.CALCULATE(
+    total_sb_projects=COUNT(projects.WHERE(LOWER(school_city) == "santa barbara")),
+    suburban_sb_projects=COUNT(projects.WHERE(
+        (LOWER(school_city) == "santa barbara") &
+        (LOWER(school_metro) == "suburban")
+    ))
+).CALCULATE(
+    percentage_suburban=100.0 * suburban_sb_projects / total_sb_projects
+)
+            """,
+            "donor",
+            lambda: pd.DataFrame(
+                {
+                    "percentage_suburban": [30.303],
+                }
+            ),
+            "donor_3276",
+        ),
+        id="donor_3276",
+    ),
+    pytest.param(
+        PyDoughPandasTest(
+            """
+result = movielens.CALCULATE(
+    # Count all users who have at least one rating of 2
+    total_users_with_rating_2 = COUNT(users.WHERE(
+        HAS(submitted_ratings.WHERE(rating == 2))
+    )),
+    # Count female users who have at least one rating of 2
+    female_users_with_rating_2 = COUNT(users.WHERE(
+        HAS(submitted_ratings.WHERE(rating == 2)) & (LOWER(u_gender) == 'f')
+    ))
+).CALCULATE(
+    # Calculate the percentage, handling division by zero
+    percentage_of_female_users = IFF(
+        total_users_with_rating_2 > 0,
+        100.0 * female_users_with_rating_2 / total_users_with_rating_2,
0.0 + ) +) + """, + "movielens", + lambda: pd.DataFrame( + { + "percentage_of_female_users": [27.961], + } + ), + "movielens_2274", + ), + id="movielens_2274", + ), ], ) def s3_datasets_test_data(request) -> PyDoughPandasTest: diff --git a/tests/test_pipeline_tpch_custom.py b/tests/test_pipeline_tpch_custom.py index 0b3ad655f..0e39e896b 100644 --- a/tests/test_pipeline_tpch_custom.py +++ b/tests/test_pipeline_tpch_custom.py @@ -2690,6 +2690,632 @@ ), id="quarter_function_test", ), + pytest.param( + PyDoughPandasTest( + "result = TPCH.CALCULATE(" + " n1=COUNT(customers.WHERE(MONOTONIC(500, account_balance, 600))), " + " n2=COUNT(customers.WHERE((market_segment == 'BUILDING') & MONOTONIC(500, account_balance, 600))), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "n1": [1379], + "n2": [268], + } + ), + "count_multiple_filters_a", + skip_sql=True, + ), + id="count_multiple_filters_a", + ), + pytest.param( + PyDoughPandasTest( + "c1 = customers.WHERE(MONOTONIC(500, account_balance, 600))\n" + "c2 = customers.WHERE((market_segment == 'BUILDING') & MONOTONIC(500, account_balance, 600))\n" + "result = TPCH.CALCULATE(" + " n1=COUNT(c1), " + " n2=COUNT(c2), " + ").WHERE(HAS(c1))", + "TPCH", + lambda: pd.DataFrame( + { + "n1": [1379], + "n2": [268], + } + ), + "count_multiple_filters_b", + skip_sql=True, + ), + id="count_multiple_filters_b", + ), + pytest.param( + PyDoughPandasTest( + "c1 = customers.WHERE(MONOTONIC(500, account_balance, 600))\n" + "c2 = customers.WHERE((market_segment == 'BUILDING') & MONOTONIC(500, account_balance, 600))\n" + "result = TPCH.CALCULATE(" + " n1=COUNT(c1), " + " n2=COUNT(c2), " + ").WHERE(HAS(c2))", + "TPCH", + lambda: pd.DataFrame( + { + "n1": [1379], + "n2": [268], + } + ), + "count_multiple_filters_c", + skip_sql=True, + ), + id="count_multiple_filters_c", + ), + pytest.param( + PyDoughPandasTest( + "c1 = customers.WHERE(MONOTONIC(500, account_balance, 600))\n" + "c2 = customers.WHERE((market_segment == 'BUILDING') & MONOTONIC(500, account_balance, 600))\n" + "result = TPCH.CALCULATE(" + " n1=COUNT(c1), " + " n2=COUNT(c2), " + ").WHERE(HAS(c1) & HAS(c2))", + "TPCH", + lambda: pd.DataFrame( + { + "n1": [1379], + "n2": [268], + } + ), + "count_multiple_filters_d", + skip_sql=True, + ), + id="count_multiple_filters_d", + ), + pytest.param( + PyDoughPandasTest( + "result = TPCH.CALCULATE(" + " n1=COUNT(customers.WHERE(MONOTONIC(500, account_balance, 600))), " + " n2=COUNT(customers.WHERE(market_segment == 'BUILDING')), " + " n3=COUNT(customers.WHERE((market_segment == 'BUILDING') & MONOTONIC(500, account_balance, 600))), " + " n4=COUNT(customers.WHERE(MONOTONIC(500, account_balance, 600) & STARTSWITH(phone, '11'))), " + " n5=COUNT(customers.WHERE(STARTSWITH(phone, '11') & (market_segment == 'BUILDING'))), " + " n6=COUNT(customers.WHERE(MONOTONIC(500, account_balance, 600) & STARTSWITH(phone, '11') & (market_segment == 'BUILDING'))), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "n1": [1379], + "n2": [30142], + "n3": [268], + "n4": [54], + "n5": [1261], + "n6": [19], + } + ), + "count_multiple_filters_e", + skip_sql=True, + ), + id="count_multiple_filters_e", + ), + pytest.param( + PyDoughPandasTest( + "result = TPCH.CALCULATE(" + " n1=COUNT(customers), " + " n2=COUNT(customers.WHERE(market_segment == 'BUILDING')), " + " n3=COUNT(customers.WHERE(MONOTONIC(500, account_balance, 600))), " + " n4=COUNT(customers.WHERE(STARTSWITH(phone, '11'))), " + " n5=COUNT(customers.WHERE(STARTSWITH(phone, '11') & (market_segment == 'BUILDING'))), " + " 
n6=COUNT(customers.WHERE(STARTSWITH(phone, '11') & (market_segment == 'BUILDING') & MONOTONIC(500, account_balance, 600))), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "n1": [150000], + "n2": [30142], + "n3": [1379], + "n4": [5975], + "n5": [1261], + "n6": [19], + } + ), + "count_multiple_filters_f", + skip_sql=True, + ), + id="count_multiple_filters_f", + ), + pytest.param( + PyDoughPandasTest( + "c1 = customers.WHERE(PERCENTILE(by=account_balance.ASC()) == 100)\n" + "c2 = customers.WHERE(nation.name == 'GERMANY').WHERE(PERCENTILE(by=account_balance.ASC()) == 100)\n" + "c3 = customers.WHERE(nation.name == 'GERMANY')\n" + "c4 = customers.WHERE(nation.name == 'CHINA').WHERE(PERCENTILE(by=account_balance.ASC()) == 100)\n" + "c5 = customers.WHERE((PERCENTILE(by=account_balance.ASC()) == 100) & (nation.name == 'CHINA'))\n" + "c6 = customers.WHERE(nation.name == 'CHINA')\n" + "result = TPCH.CALCULATE(" + " n1=COUNT(c1), " + " n2=COUNT(c2), " + " n3=COUNT(c3), " + " n4=COUNT(c4), " + " n5=COUNT(c5), " + " n6=COUNT(c6), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "n1": [1500], + "n2": [59], + "n3": [5908], + "n4": [60], + "n5": [57], + "n6": [6024], + } + ), + "count_multiple_filters_g", + skip_sql=True, + ), + id="count_multiple_filters_g", + ), + pytest.param( + PyDoughPandasTest( + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(nations.customers.orders), " + " n3=COUNT(nations.customers.orders.WHERE(order_priority == '1-URGENT')), " + " n4=COUNT(nations.customers.orders.WHERE(order_priority == '2-HIGH')), " + " n5=COUNT(nations.customers.orders.WHERE(order_priority == '3-MEDIUM')), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [298994, 299103, 301740, 303286, 296877], + "n3": [59767, 59902, 60166, 60373, 60135], + "n4": [59511, 60232, 60246, 60901, 59201], + "n5": [59597, 59230, 60485, 60375, 59036], + } + ), + "count_multiple_filters_h", + skip_sql=True, + ), + id="count_multiple_filters_h", + ), + pytest.param( + PyDoughPandasTest( + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(nations.customers.orders), " + " n3=COUNT(nations.customers.orders.WHERE(order_priority == '1-URGENT')), " + " n4=COUNT(nations.customers.orders.WHERE(order_priority == '2-HIGH')), " + " n5=COUNT(nations.customers.orders.WHERE(order_priority == '3-MEDIUM')), " + ").WHERE(HAS(nations.customers.orders.WHERE(order_priority == '2-HIGH')))", + "TPCH", + lambda: pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [298994, 299103, 301740, 303286, 296877], + "n3": [59767, 59902, 60166, 60373, 60135], + "n4": [59511, 60232, 60246, 60901, 59201], + "n5": [59597, 59230, 60485, 60375, 59036], + } + ), + "count_multiple_filters_i", + skip_sql=True, + ), + id="count_multiple_filters_i", + ), + pytest.param( + PyDoughPandasTest( + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(nations.customers.orders.WHERE(order_priority == '1-URGENT')), " + " n3=COUNT(nations.customers.orders.WHERE(order_priority == '2-HIGH')), " + " n4=COUNT(nations.customers.orders.WHERE(order_priority == '3-MEDIUM')), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE 
EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [59767, 59902, 60166, 60373, 60135], + "n3": [59511, 60232, 60246, 60901, 59201], + "n4": [59597, 59230, 60485, 60375, 59036], + } + ), + "count_multiple_filters_j", + skip_sql=True, + ), + id="count_multiple_filters_j", + ), + pytest.param( + PyDoughPandasTest( + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(nations.customers.orders.WHERE(order_priority == '1-URGENT')), " + " n3=COUNT(nations.customers.orders.WHERE(order_priority == '2-HIGH')), " + " n4=COUNT(nations.customers.orders.WHERE(order_priority == '3-MEDIUM')), " + ").WHERE(HAS(nations.customers.orders.WHERE(order_priority == '1-URGENT')))", + "TPCH", + lambda: pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [59767, 59902, 60166, 60373, 60135], + "n3": [59511, 60232, 60246, 60901, 59201], + "n4": [59597, 59230, 60485, 60375, 59036], + } + ), + "count_multiple_filters_k", + skip_sql=True, + ), + id="count_multiple_filters_k", + ), + pytest.param( + PyDoughPandasTest( + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(nations.customers.orders.WHERE((order_priority == '1-URGENT') | (order_priority == '2-HIGH'))), " + " n3=COUNT(nations.customers.orders.WHERE((order_priority == '2-HIGH') | (order_priority == '3-MEDIUM'))), " + " n4=COUNT(nations.customers.orders.WHERE((order_priority == '3-MEDIUM') | (order_priority == '4-NOT SPECIFIED'))), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [119278, 120134, 120412, 121274, 119336], + "n3": [119108, 119462, 120731, 121276, 118237], + "n4": [119665, 119193, 121015, 121129, 117975], + } + ), + "count_multiple_filters_l", + skip_sql=True, + ), + id="count_multiple_filters_l", + ), + pytest.param( + PyDoughPandasTest( + "c1 = nations.customers.orders.WHERE((order_priority == '1-URGENT') | (order_priority == '2-HIGH'))\n" + "c2 = nations.customers.orders.WHERE((order_priority == '2-HIGH') | (order_priority == '3-MEDIUM'))\n" + "c3 = nations.customers.orders.WHERE((order_priority == '3-MEDIUM') | (order_priority == '4-NOT SPECIFIED'))\n" + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(c1), " + " n3=COUNT(c2), " + " n4=COUNT(c3), " + ").WHERE(HAS(c1))", + "TPCH", + lambda: pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [119278, 120134, 120412, 121274, 119336], + "n3": [119108, 119462, 120731, 121276, 118237], + "n4": [119665, 119193, 121015, 121129, 117975], + } + ), + "count_multiple_filters_m", + skip_sql=True, + ), + id="count_multiple_filters_m", + ), + pytest.param( + PyDoughPandasTest( + "c1 = nations.customers.orders.WHERE((order_priority == '1-URGENT') | (order_priority == '2-HIGH'))\n" + "c2 = nations.customers.orders.WHERE((order_priority == '2-HIGH') | (order_priority == '3-MEDIUM'))\n" + "c3 = nations.customers.orders.WHERE((order_priority == '3-MEDIUM') | (order_priority == '4-NOT SPECIFIED'))\n" + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(c1), " + " n3=COUNT(c2), " + " n4=COUNT(c3), " + ").WHERE(HAS(c1) & HAS(c2))", + "TPCH", + lambda: 
pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [119278, 120134, 120412, 121274, 119336], + "n3": [119108, 119462, 120731, 121276, 118237], + "n4": [119665, 119193, 121015, 121129, 117975], + } + ), + "count_multiple_filters_n", + skip_sql=True, + ), + id="count_multiple_filters_n", + ), + pytest.param( + PyDoughPandasTest( + "c1 = nations.customers.orders.WHERE((order_priority == '1-URGENT') | (order_priority == '2-HIGH'))\n" + "c2 = nations.customers.orders.WHERE((order_priority == '2-HIGH') | (order_priority == '3-MEDIUM'))\n" + "c3 = nations.customers.orders.WHERE((order_priority == '3-MEDIUM') | (order_priority == '4-NOT SPECIFIED'))\n" + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(c1), " + " n3=COUNT(c2), " + " n4=COUNT(c3), " + ").WHERE(HAS(c1) & HAS(c3))", + "TPCH", + lambda: pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [119278, 120134, 120412, 121274, 119336], + "n3": [119108, 119462, 120731, 121276, 118237], + "n4": [119665, 119193, 121015, 121129, 117975], + } + ), + "count_multiple_filters_o", + skip_sql=True, + ), + id="count_multiple_filters_o", + ), + pytest.param( + PyDoughPandasTest( + "c1 = nations.customers.orders.WHERE((order_priority == '1-URGENT') | (order_priority == '2-HIGH'))\n" + "c2 = nations.customers.orders.WHERE((order_priority == '2-HIGH') | (order_priority == '3-MEDIUM'))\n" + "c3 = nations.customers.orders.WHERE((order_priority == '3-MEDIUM') | (order_priority == '4-NOT SPECIFIED'))\n" + "result = regions.CALCULATE(" + " region_name=name, " + " n1=COUNT(nations.customers), " + " n2=COUNT(c1), " + " n3=COUNT(c2), " + " n4=COUNT(c3), " + ").WHERE(HAS(c1) & HAS(c2) & HAS(c3))", + "TPCH", + lambda: pd.DataFrame( + { + "region_name": [ + "AFRICA", + "AMERICA", + "ASIA", + "EUROPE", + "MIDDLE EAST", + ], + "n1": [29764, 29952, 30183, 30197, 29904], + "n2": [119278, 120134, 120412, 121274, 119336], + "n3": [119108, 119462, 120731, 121276, 118237], + "n4": [119665, 119193, 121015, 121129, 117975], + } + ), + "count_multiple_filters_p", + skip_sql=True, + ), + id="count_multiple_filters_p", + ), + pytest.param( + PyDoughPandasTest( + "c1 = orders.WHERE(order_priority == '1-URGENT')\n" + "c2 = orders.WHERE(order_priority == '2-HIGH')\n" + "c3 = orders.WHERE(order_priority == '3-MEDIUM')\n" + "result = customers.WHERE(ISIN(key, [2, 3, 4, 17, 23, 26, 380, 827])).CALCULATE(" + " customer_key=key, " + " n1=COUNT(c1), " + " n2=COUNT(c2), " + " n3=COUNT(c3), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "customer_key": [2, 3, 4, 17, 23, 26, 380, 827], + "n1": [4, 0, 3, 0, 2, 0, 2, 0], + "n2": [1, 0, 5, 0, 0, 1, 0, 4], + "n3": [0, 0, 7, 2, 3, 2, 0, 0], + } + ), + "count_multiple_filters_q", + skip_sql=True, + ), + id="count_multiple_filters_q", + ), + pytest.param( + PyDoughPandasTest( + "c1 = orders.WHERE(order_priority == '1-URGENT')\n" + "c2 = orders.WHERE(order_priority == '2-HIGH')\n" + "c3 = orders.WHERE(order_priority == '3-MEDIUM')\n" + "result = customers.WHERE(ISIN(key, [2, 3, 4, 17, 23, 26, 380, 827])).CALCULATE(" + " customer_key=key, " + " n1=COUNT(c1), " + " n2=COUNT(c2), " + " n3=COUNT(c3), " + ").WHERE(HAS(c1))", + "TPCH", + lambda: pd.DataFrame( + { + "customer_key": [2, 4, 23, 380], + "n1": [4, 3, 2, 2], + "n2": [1, 5, 0, 0], + "n3": [0, 7, 3, 0], + } + ), + 
"count_multiple_filters_r", + skip_sql=True, + ), + id="count_multiple_filters_r", + ), + pytest.param( + PyDoughPandasTest( + "c1 = orders.WHERE(order_priority == '1-URGENT')\n" + "c2 = orders.WHERE(order_priority == '2-HIGH')\n" + "c3 = orders.WHERE(order_priority == '3-MEDIUM')\n" + "result = customers.WHERE(ISIN(key, [2, 3, 4, 17, 23, 26, 380, 827])).CALCULATE(" + " customer_key=key, " + " n1=COUNT(c1), " + " n2=COUNT(c2), " + " n3=COUNT(c3), " + ").WHERE(HAS(c2))", + "TPCH", + lambda: pd.DataFrame( + { + "customer_key": [2, 4, 26, 827], + "n1": [4, 3, 0, 0], + "n2": [1, 5, 1, 4], + "n3": [0, 7, 2, 0], + } + ), + "count_multiple_filters_s", + skip_sql=True, + ), + id="count_multiple_filters_s", + ), + pytest.param( + PyDoughPandasTest( + "c1 = orders.WHERE(order_priority == '1-URGENT')\n" + "c2 = orders.WHERE(order_priority == '2-HIGH')\n" + "c3 = orders.WHERE(order_priority == '3-MEDIUM')\n" + "result = customers.WHERE(ISIN(key, [2, 3, 4, 17, 23, 26, 380, 827])).CALCULATE(" + " customer_key=key, " + " n1=COUNT(c1), " + " n2=COUNT(c2), " + " n3=COUNT(c3), " + ").WHERE(HAS(c3))", + "TPCH", + lambda: pd.DataFrame( + { + "customer_key": [4, 17, 23, 26], + "n1": [3, 0, 2, 0], + "n2": [5, 0, 0, 1], + "n3": [7, 2, 3, 2], + } + ), + "count_multiple_filters_t", + skip_sql=True, + ), + id="count_multiple_filters_t", + ), + pytest.param( + PyDoughPandasTest( + "c1 = orders.WHERE(order_priority == '1-URGENT')\n" + "c2 = orders.WHERE(order_priority == '2-HIGH')\n" + "c3 = orders.WHERE(order_priority == '3-MEDIUM')\n" + "result = customers.WHERE(ISIN(key, [2, 3, 4, 17, 23, 26, 380, 827])).CALCULATE(" + " customer_key=key, " + " n1=COUNT(c1), " + " n2=COUNT(c2), " + " n3=COUNT(c3), " + ").WHERE(HAS(c1) & HAS(c2) & HAS(c3))", + "TPCH", + lambda: pd.DataFrame( + { + "customer_key": [4], + "n1": [3], + "n2": [5], + "n3": [7], + } + ), + "count_multiple_filters_u", + skip_sql=True, + ), + id="count_multiple_filters_u", + ), + pytest.param( + PyDoughPandasTest( + "c1 = customers.WHERE(market_segment == 'BUILDING')\n" + "c2 = c1.WHERE(MONOTONIC(500, account_balance, 1000))\n" + "c3 = c1.WHERE(~MONOTONIC(500, account_balance, 1000))\n" + "result = TPCH.CALCULATE(" + " n2=COUNT(c2), " + " n3=COUNT(c3), " + ")", + "TPCH", + lambda: pd.DataFrame( + { + "n2": [1394], + "n3": [28748], + } + ), + "count_multiple_filters_v", + skip_sql=True, + ), + id="count_multiple_filters_v", + ), + pytest.param( + PyDoughPandasTest( + "c1 = customers.WHERE((market_segment == 'BUILDING') & STARTSWITH(phone, '30'))\n" + "c2 = customers.WHERE((market_segment == 'BUILDING') & STARTSWITH(phone, '31'))\n" + "c3 = customers.WHERE((market_segment == 'BUILDING') & STARTSWITH(phone, '32'))\n" + "c4 = customers.WHERE((market_segment == 'HOUSEHOLD') & STARTSWITH(phone, '30'))\n" + "c5 = customers.WHERE((market_segment == 'HOUSEHOLD') & STARTSWITH(phone, '31'))\n" + "c6 = customers.WHERE((market_segment == 'HOUSEHOLD') & STARTSWITH(phone, '32'))\n" + "result = TPCH.CALCULATE(" + " n1=COUNT(c1)," + " n2=COUNT(c2)," + " n3=COUNT(c3)," + " n4=COUNT(c4)," + " n5=COUNT(c5)," + " n6=COUNT(c6)," + ")", + "TPCH", + lambda: pd.DataFrame( + { + "n1": [1182], + "n2": [1230], + "n3": [1207], + "n4": [1206], + "n5": [1215], + "n6": [1265], + } + ), + "count_multiple_filters_w", + skip_sql=True, + ), + id="count_multiple_filters_w", + ), pytest.param( PyDoughPandasTest( order_quarter_test, diff --git a/tests/test_plan_refsols/aggregate_semi.txt b/tests/test_plan_refsols/aggregate_semi.txt index 6c99fbc08..6c4ccc09c 100644 --- 
a/tests/test_plan_refsols/aggregate_semi.txt +++ b/tests/test_plan_refsols/aggregate_semi.txt @@ -1,7 +1,7 @@ ROOT(columns=[('name', s_name), ('num_10parts', n_rows), ('avg_price_of_10parts', sum_p_retailprice / sum_expr), ('sum_price_of_10parts', DEFAULT_TO(sum_p_retailprice, 0:numeric))], orderings=[]) JOIN(condition=t0.s_suppkey == t1.ps_suppkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 's_name': t0.s_name, 'sum_expr': t1.sum_expr, 'sum_p_retailprice': t1.sum_p_retailprice}) SCAN(table=tpch.SUPPLIER, columns={'s_name': s_name, 's_suppkey': s_suppkey}) - AGGREGATE(keys={'ps_suppkey': ps_suppkey}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM(IFF(PRESENT(p_retailprice), 1:numeric, 0:numeric)), 'sum_p_retailprice': SUM(p_retailprice)}) + AGGREGATE(keys={'ps_suppkey': ps_suppkey}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM(PRESENT(p_retailprice)), 'sum_p_retailprice': SUM(p_retailprice)}) JOIN(condition=t0.ps_partkey == t1.p_partkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_ACCESS, columns={'p_retailprice': t1.p_retailprice, 'ps_suppkey': t0.ps_suppkey}) SCAN(table=tpch.PARTSUPP, columns={'ps_partkey': ps_partkey, 'ps_suppkey': ps_suppkey}) FILTER(condition=p_size == 10:numeric, columns={'p_partkey': p_partkey, 'p_retailprice': p_retailprice}) diff --git a/tests/test_plan_refsols/common_prefix_d.txt b/tests/test_plan_refsols/common_prefix_d.txt index 51618c64b..8dd101f81 100644 --- a/tests/test_plan_refsols/common_prefix_d.txt +++ b/tests/test_plan_refsols/common_prefix_d.txt @@ -1,23 +1,15 @@ -ROOT(columns=[('name', r_name), ('n_nations', n_rows), ('n_customers', sum_expr), ('n_suppliers', sum_n_rows), ('n_orders_94', DEFAULT_TO(sum_sum_expr, 0:numeric)), ('n_orders_95', DEFAULT_TO(sum_sum_expr_10, 0:numeric)), ('n_orders_96', DEFAULT_TO(sum_sum_n_rows, 0:numeric))], orderings=[(r_name):asc_first]) - JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_expr': t1.sum_n_rows, 'sum_n_rows': t1.sum_sum_n_rows_0, 'sum_sum_expr': t1.sum_sum_expr, 'sum_sum_expr_10': t1.sum_sum_expr_10, 'sum_sum_n_rows': t1.sum_sum_n_rows}) +ROOT(columns=[('name', r_name), ('n_nations', n_rows), ('n_customers', sum_sum_expr), ('n_suppliers', sum_n_rows), ('n_orders_94', DEFAULT_TO(sum_sum_sum_expr, 0:numeric)), ('n_orders_95', DEFAULT_TO(sum_sum_sum_expr_33, 0:numeric)), ('n_orders_96', DEFAULT_TO(sum_sum_sum_expr_34, 0:numeric))], orderings=[(r_name):asc_first]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_n_rows': t1.sum_n_rows, 'sum_sum_expr': t1.sum_sum_expr, 'sum_sum_sum_expr': t1.sum_sum_sum_expr, 'sum_sum_sum_expr_33': t1.sum_sum_sum_expr_33, 'sum_sum_sum_expr_34': t1.sum_sum_sum_expr_34}) SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) - AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_n_rows': SUM(n_rows), 'sum_sum_expr': SUM(sum_expr), 'sum_sum_expr_10': SUM(sum_expr_10), 'sum_sum_n_rows': SUM(sum_n_rows), 'sum_sum_n_rows_0': SUM(sum_n_rows_0)}) - JOIN(condition=t0.n_nationkey == t1.s_nationkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'n_rows': t0.n_rows, 'sum_expr': t0.sum_expr, 'sum_expr_10': 
t0.sum_expr_10, 'sum_n_rows': t0.sum_n_rows, 'sum_n_rows_0': t1.n_rows}) - JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_nationkey': t0.n_nationkey, 'n_regionkey': t0.n_regionkey, 'n_rows': t1.n_rows, 'sum_expr': t1.sum_expr_7, 'sum_expr_10': t1.sum_expr_10, 'sum_n_rows': t1.sum_n_rows}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_n_rows': SUM(n_rows), 'sum_sum_expr': SUM(sum_expr_16), 'sum_sum_sum_expr': SUM(sum_sum_expr), 'sum_sum_sum_expr_33': SUM(sum_sum_expr_33), 'sum_sum_sum_expr_34': SUM(sum_sum_expr_34)}) + JOIN(condition=t0.n_nationkey == t1.s_nationkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'n_rows': t1.n_rows, 'sum_expr_16': t0.n_rows, 'sum_sum_expr': t0.sum_sum_expr, 'sum_sum_expr_33': t0.sum_sum_expr_33, 'sum_sum_expr_34': t0.sum_sum_expr_34}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_nationkey': t0.n_nationkey, 'n_regionkey': t0.n_regionkey, 'n_rows': t1.n_rows, 'sum_sum_expr': t1.sum_sum_expr_35, 'sum_sum_expr_33': t1.sum_sum_expr_33, 'sum_sum_expr_34': t1.sum_sum_expr_34}) SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) - AGGREGATE(keys={'c_nationkey': c_nationkey}, aggregations={'n_rows': COUNT(), 'sum_expr_10': SUM(expr_10), 'sum_expr_7': SUM(expr_7), 'sum_n_rows': SUM(n_rows)}) - JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'c_nationkey': t0.c_nationkey, 'expr_10': t0.n_rows, 'expr_7': t0.expr_7, 'n_rows': t1.n_rows}) - JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t0.c_custkey, 'c_nationkey': t0.c_nationkey, 'expr_7': t0.n_rows, 'n_rows': t1.n_rows}) - JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t0.c_custkey, 'c_nationkey': t0.c_nationkey, 'n_rows': t1.n_rows}) - SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) - AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'n_rows': COUNT()}) - FILTER(condition=YEAR(o_orderdate) == 1994:numeric, columns={'o_custkey': o_custkey}) - SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderdate': o_orderdate}) - AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'n_rows': COUNT()}) - FILTER(condition=YEAR(o_orderdate) == 1995:numeric, columns={'o_custkey': o_custkey}) - SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderdate': o_orderdate}) - AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'n_rows': COUNT()}) - FILTER(condition=YEAR(o_orderdate) == 1996:numeric, columns={'o_custkey': o_custkey}) + AGGREGATE(keys={'c_nationkey': c_nationkey}, aggregations={'n_rows': COUNT(), 'sum_sum_expr_33': SUM(sum_expr), 'sum_sum_expr_34': SUM(sum_expr_34), 'sum_sum_expr_35': SUM(sum_expr_35)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'c_nationkey': t0.c_nationkey, 'sum_expr': t1.sum_expr, 'sum_expr_34': t1.sum_expr_34, 'sum_expr_35': t1.sum_expr_35}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + 
AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM((YEAR(o_orderdate) == 1995:numeric)), 'sum_expr_34': SUM((YEAR(o_orderdate) == 1996:numeric)), 'sum_expr_35': SUM((YEAR(o_orderdate) == 1994:numeric))}) + FILTER(condition=(YEAR(o_orderdate) == 1994:numeric) | (YEAR(o_orderdate) == 1995:numeric) | (YEAR(o_orderdate) == 1996:numeric), columns={'o_custkey': o_custkey, 'o_orderdate': o_orderdate}) SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderdate': o_orderdate}) AGGREGATE(keys={'s_nationkey': s_nationkey}, aggregations={'n_rows': COUNT()}) SCAN(table=tpch.SUPPLIER, columns={'s_nationkey': s_nationkey}) diff --git a/tests/test_plan_refsols/common_prefix_n.txt b/tests/test_plan_refsols/common_prefix_n.txt index f96bd3cf4..abda9c358 100644 --- a/tests/test_plan_refsols/common_prefix_n.txt +++ b/tests/test_plan_refsols/common_prefix_n.txt @@ -1,20 +1,18 @@ -ROOT(columns=[('key', l_orderkey), ('order_date', anything_o_orderdate), ('n_elements', DEFAULT_TO(n_rows, 0:numeric)), ('total_retail_price', DEFAULT_TO(sum_p_retailprice, 0:numeric)), ('n_unique_supplier_nations', DEFAULT_TO(ndistinct_n_name, 0:numeric)), ('max_supplier_balance', max_s_acctbal), ('n_small_parts', DEFAULT_TO(sum_n_rows, 0:numeric))], orderings=[(anything_o_orderdate):desc_last, (l_orderkey):asc_first], limit=5:numeric) - FILTER(condition=DEFAULT_TO(n_rows, 0:numeric) > DEFAULT_TO(ndistinct_n_name, 0:numeric), columns={'anything_o_orderdate': anything_o_orderdate, 'l_orderkey': l_orderkey, 'max_s_acctbal': max_s_acctbal, 'n_rows': n_rows, 'ndistinct_n_name': ndistinct_n_name, 'sum_n_rows': sum_n_rows, 'sum_p_retailprice': sum_p_retailprice}) - JOIN(condition=t0.l_orderkey == t1.l_orderkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'anything_o_orderdate': t0.anything_o_orderdate, 'l_orderkey': t0.l_orderkey, 'max_s_acctbal': t0.max_s_acctbal, 'n_rows': t0.n_rows, 'ndistinct_n_name': t1.ndistinct_n_name, 'sum_n_rows': t0.sum_n_rows, 'sum_p_retailprice': t0.sum_p_retailprice}) - AGGREGATE(keys={'l_orderkey': l_orderkey}, aggregations={'anything_o_orderdate': ANYTHING(o_orderdate), 'max_s_acctbal': MAX(s_acctbal), 'n_rows': COUNT(), 'sum_n_rows': SUM(n_rows), 'sum_p_retailprice': SUM(p_retailprice)}) - JOIN(condition=t0.o_orderkey == t1.l_orderkey, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'l_orderkey': t1.l_orderkey, 'n_rows': t1.n_rows, 'o_orderdate': t0.o_orderdate, 'p_retailprice': t1.p_retailprice, 's_acctbal': t1.s_acctbal}) +ROOT(columns=[('key', l_orderkey), ('order_date', anything_o_orderdate), ('n_elements', DEFAULT_TO(sum_sum_n_rows, 0:numeric)), ('total_retail_price', DEFAULT_TO(sum_sum_p_retailprice, 0:numeric)), ('n_unique_supplier_nations', DEFAULT_TO(ndistinct_n_name, 0:numeric)), ('max_supplier_balance', max_s_acctbal), ('n_small_parts', DEFAULT_TO(sum_sum_sum_expr, 0:numeric))], orderings=[(anything_o_orderdate):desc_last, (l_orderkey):asc_first], limit=5:numeric) + FILTER(condition=DEFAULT_TO(sum_sum_n_rows, 0:numeric) > DEFAULT_TO(ndistinct_n_name, 0:numeric), columns={'anything_o_orderdate': anything_o_orderdate, 'l_orderkey': l_orderkey, 'max_s_acctbal': max_s_acctbal, 'ndistinct_n_name': ndistinct_n_name, 'sum_sum_n_rows': sum_sum_n_rows, 'sum_sum_p_retailprice': sum_sum_p_retailprice, 'sum_sum_sum_expr': sum_sum_sum_expr}) + JOIN(condition=t0.l_orderkey == t1.l_orderkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'anything_o_orderdate': 
t0.anything_o_orderdate, 'l_orderkey': t0.l_orderkey, 'max_s_acctbal': t0.max_s_acctbal, 'ndistinct_n_name': t1.ndistinct_n_name, 'sum_sum_n_rows': t0.sum_sum_n_rows, 'sum_sum_p_retailprice': t0.sum_sum_p_retailprice, 'sum_sum_sum_expr': t0.sum_sum_sum_expr}) + AGGREGATE(keys={'l_orderkey': l_orderkey}, aggregations={'anything_o_orderdate': ANYTHING(o_orderdate), 'max_s_acctbal': MAX(s_acctbal), 'sum_sum_n_rows': SUM(sum_n_rows), 'sum_sum_p_retailprice': SUM(sum_p_retailprice), 'sum_sum_sum_expr': SUM(sum_sum_expr)}) + JOIN(condition=t0.o_orderkey == t1.l_orderkey, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'l_orderkey': t1.l_orderkey, 'o_orderdate': t0.o_orderdate, 's_acctbal': t1.s_acctbal, 'sum_n_rows': t1.sum_n_rows, 'sum_p_retailprice': t1.sum_p_retailprice, 'sum_sum_expr': t1.sum_sum_expr}) FILTER(condition=(YEAR(o_orderdate) == 1996:numeric) & ISIN(MONTH(o_orderdate), [10, 11, 12]:array[numeric]), columns={'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey}) SCAN(table=tpch.ORDERS, columns={'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey}) - JOIN(condition=t0.l_partkey == t1.p_partkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'n_rows': t1.n_rows, 'p_retailprice': t0.p_retailprice, 's_acctbal': t0.s_acctbal}) - JOIN(condition=t0.l_suppkey == t1.s_suppkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'l_partkey': t0.l_partkey, 'p_retailprice': t0.p_retailprice, 's_acctbal': t1.s_acctbal}) - JOIN(condition=t0.l_partkey == t1.p_partkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'l_partkey': t0.l_partkey, 'l_suppkey': t0.l_suppkey, 'p_retailprice': t1.p_retailprice}) - FILTER(condition=(MONTH(l_shipdate) == 11:numeric) & (YEAR(l_shipdate) == 1996:numeric), columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_suppkey': l_suppkey}) - SCAN(table=tpch.LINEITEM, columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_shipdate': l_shipdate, 'l_suppkey': l_suppkey}) - SCAN(table=tpch.PART, columns={'p_partkey': p_partkey, 'p_retailprice': p_retailprice}) - SCAN(table=tpch.SUPPLIER, columns={'s_acctbal': s_acctbal, 's_suppkey': s_suppkey}) - PROJECT(columns={'n_rows': 1:numeric, 'p_partkey': p_partkey}) - FILTER(condition=STARTSWITH(p_container, 'SM':string), columns={'p_partkey': p_partkey}) - SCAN(table=tpch.PART, columns={'p_container': p_container, 'p_partkey': p_partkey}) + JOIN(condition=t0.l_suppkey == t1.s_suppkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 's_acctbal': t1.s_acctbal, 'sum_n_rows': t0.sum_n_rows, 'sum_p_retailprice': t0.sum_p_retailprice, 'sum_sum_expr': t0.sum_sum_expr}) + AGGREGATE(keys={'l_orderkey': l_orderkey, 'l_suppkey': l_suppkey}, aggregations={'sum_n_rows': SUM(n_rows), 'sum_p_retailprice': SUM(p_retailprice), 'sum_sum_expr': SUM(STARTSWITH(p_container, 'SM':string))}) + JOIN(condition=t0.l_partkey == t1.p_partkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'l_suppkey': t0.l_suppkey, 'n_rows': t0.n_rows, 'p_container': t1.p_container, 'p_retailprice': t1.p_retailprice}) + AGGREGATE(keys={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_suppkey': l_suppkey}, aggregations={'n_rows': COUNT()}) + FILTER(condition=(MONTH(l_shipdate) == 11:numeric) & 
(YEAR(l_shipdate) == 1996:numeric), columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_suppkey': l_suppkey}) + SCAN(table=tpch.LINEITEM, columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_shipdate': l_shipdate, 'l_suppkey': l_suppkey}) + SCAN(table=tpch.PART, columns={'p_container': p_container, 'p_partkey': p_partkey, 'p_retailprice': p_retailprice}) + SCAN(table=tpch.SUPPLIER, columns={'s_acctbal': s_acctbal, 's_suppkey': s_suppkey}) AGGREGATE(keys={'l_orderkey': l_orderkey}, aggregations={'ndistinct_n_name': NDISTINCT(n_name)}) JOIN(condition=t0.s_nationkey == t1.n_nationkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'n_name': t1.n_name}) JOIN(condition=t0.l_suppkey == t1.s_suppkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 's_nationkey': t1.s_nationkey}) diff --git a/tests/test_plan_refsols/common_prefix_o.txt b/tests/test_plan_refsols/common_prefix_o.txt index ff154a980..d7801ab73 100644 --- a/tests/test_plan_refsols/common_prefix_o.txt +++ b/tests/test_plan_refsols/common_prefix_o.txt @@ -1,22 +1,18 @@ -ROOT(columns=[('key', o_orderkey), ('order_date', o_orderdate), ('n_elements', DEFAULT_TO(sum_sum_n_rows, 0:numeric)), ('total_retail_price', DEFAULT_TO(sum_sum_p_retailprice, 0:numeric)), ('n_unique_supplier_nations', DEFAULT_TO(ndistinct_n_name, 0:numeric)), ('max_supplier_balance', max_s_acctbal), ('n_small_parts', sum_sum_sum_n_rows)], orderings=[(o_orderdate):desc_last, (o_orderkey):asc_first], limit=5:numeric) - FILTER(condition=DEFAULT_TO(sum_sum_n_rows, 0:numeric) > DEFAULT_TO(ndistinct_n_name, 0:numeric), columns={'max_s_acctbal': max_s_acctbal, 'ndistinct_n_name': ndistinct_n_name, 'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey, 'sum_sum_n_rows': sum_sum_n_rows, 'sum_sum_p_retailprice': sum_sum_p_retailprice, 'sum_sum_sum_n_rows': sum_sum_sum_n_rows}) - JOIN(condition=t0.o_orderkey == t1.l_orderkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'max_s_acctbal': t0.max_s_acctbal, 'ndistinct_n_name': t1.ndistinct_n_name, 'o_orderdate': t0.o_orderdate, 'o_orderkey': t0.o_orderkey, 'sum_sum_n_rows': t0.sum_sum_n_rows, 'sum_sum_p_retailprice': t0.sum_sum_p_retailprice, 'sum_sum_sum_n_rows': t0.sum_sum_sum_n_rows}) - JOIN(condition=t0.o_orderkey == t1.l_orderkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'max_s_acctbal': t1.max_s_acctbal, 'o_orderdate': t0.o_orderdate, 'o_orderkey': t0.o_orderkey, 'sum_sum_n_rows': t1.sum_sum_n_rows, 'sum_sum_p_retailprice': t1.sum_sum_p_retailprice, 'sum_sum_sum_n_rows': t1.sum_sum_sum_n_rows}) +ROOT(columns=[('key', o_orderkey), ('order_date', o_orderdate), ('n_elements', DEFAULT_TO(sum_sum_n_rows, 0:numeric)), ('total_retail_price', DEFAULT_TO(sum_sum_p_retailprice, 0:numeric)), ('n_unique_supplier_nations', DEFAULT_TO(ndistinct_n_name, 0:numeric)), ('max_supplier_balance', max_s_acctbal), ('n_small_parts', sum_sum_sum_expr)], orderings=[(o_orderdate):desc_last, (o_orderkey):asc_first], limit=5:numeric) + FILTER(condition=DEFAULT_TO(sum_sum_n_rows, 0:numeric) > DEFAULT_TO(ndistinct_n_name, 0:numeric), columns={'max_s_acctbal': max_s_acctbal, 'ndistinct_n_name': ndistinct_n_name, 'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey, 'sum_sum_n_rows': sum_sum_n_rows, 'sum_sum_p_retailprice': sum_sum_p_retailprice, 'sum_sum_sum_expr': sum_sum_sum_expr}) + JOIN(condition=t0.o_orderkey == 
t1.l_orderkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'max_s_acctbal': t0.max_s_acctbal, 'ndistinct_n_name': t1.ndistinct_n_name, 'o_orderdate': t0.o_orderdate, 'o_orderkey': t0.o_orderkey, 'sum_sum_n_rows': t0.sum_sum_n_rows, 'sum_sum_p_retailprice': t0.sum_sum_p_retailprice, 'sum_sum_sum_expr': t0.sum_sum_sum_expr}) + JOIN(condition=t0.o_orderkey == t1.l_orderkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'max_s_acctbal': t1.max_s_acctbal, 'o_orderdate': t0.o_orderdate, 'o_orderkey': t0.o_orderkey, 'sum_sum_n_rows': t1.sum_sum_n_rows, 'sum_sum_p_retailprice': t1.sum_sum_p_retailprice, 'sum_sum_sum_expr': t1.sum_sum_sum_expr}) FILTER(condition=(YEAR(o_orderdate) == 1996:numeric) & ISIN(MONTH(o_orderdate), [10, 11, 12]:array[numeric]), columns={'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey}) SCAN(table=tpch.ORDERS, columns={'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey}) - FILTER(condition=sum_sum_sum_n_rows != 0:numeric, columns={'l_orderkey': l_orderkey, 'max_s_acctbal': max_s_acctbal, 'sum_sum_n_rows': sum_sum_n_rows, 'sum_sum_p_retailprice': sum_sum_p_retailprice, 'sum_sum_sum_n_rows': sum_sum_sum_n_rows}) - AGGREGATE(keys={'l_orderkey': l_orderkey}, aggregations={'max_s_acctbal': MAX(s_acctbal), 'sum_sum_n_rows': SUM(sum_n_rows), 'sum_sum_p_retailprice': SUM(sum_p_retailprice), 'sum_sum_sum_n_rows': SUM(sum_sum_n_rows)}) - JOIN(condition=t0.l_suppkey == t1.s_suppkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 's_acctbal': t1.s_acctbal, 'sum_n_rows': t0.sum_n_rows, 'sum_p_retailprice': t0.sum_p_retailprice, 'sum_sum_n_rows': t0.sum_sum_n_rows}) - AGGREGATE(keys={'l_orderkey': l_orderkey, 'l_suppkey': l_suppkey}, aggregations={'sum_n_rows': SUM(n_rows), 'sum_p_retailprice': SUM(p_retailprice), 'sum_sum_n_rows': SUM(sum_n_rows)}) - JOIN(condition=t0.l_partkey == t1.p_partkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'l_suppkey': t0.l_suppkey, 'n_rows': t0.n_rows, 'p_retailprice': t1.p_retailprice, 'sum_n_rows': t0.sum_n_rows}) - AGGREGATE(keys={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_suppkey': l_suppkey}, aggregations={'n_rows': COUNT(), 'sum_n_rows': SUM(n_rows)}) - JOIN(condition=t0.l_partkey == t1.p_partkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'l_partkey': t0.l_partkey, 'l_suppkey': t0.l_suppkey, 'n_rows': t1.n_rows}) - FILTER(condition=(MONTH(l_shipdate) == 11:numeric) & (YEAR(l_shipdate) == 1996:numeric), columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_suppkey': l_suppkey}) - SCAN(table=tpch.LINEITEM, columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_shipdate': l_shipdate, 'l_suppkey': l_suppkey}) - PROJECT(columns={'n_rows': 1:numeric, 'p_partkey': p_partkey}) - FILTER(condition=STARTSWITH(p_container, 'SM':string), columns={'p_partkey': p_partkey}) - SCAN(table=tpch.PART, columns={'p_container': p_container, 'p_partkey': p_partkey}) - SCAN(table=tpch.PART, columns={'p_partkey': p_partkey, 'p_retailprice': p_retailprice}) + FILTER(condition=sum_sum_sum_expr != 0:numeric, columns={'l_orderkey': l_orderkey, 'max_s_acctbal': max_s_acctbal, 'sum_sum_n_rows': sum_sum_n_rows, 'sum_sum_p_retailprice': sum_sum_p_retailprice, 'sum_sum_sum_expr': sum_sum_sum_expr}) + AGGREGATE(keys={'l_orderkey': l_orderkey}, 
aggregations={'max_s_acctbal': MAX(s_acctbal), 'sum_sum_n_rows': SUM(sum_n_rows), 'sum_sum_p_retailprice': SUM(sum_p_retailprice), 'sum_sum_sum_expr': SUM(sum_sum_expr)}) + JOIN(condition=t0.l_suppkey == t1.s_suppkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 's_acctbal': t1.s_acctbal, 'sum_n_rows': t0.sum_n_rows, 'sum_p_retailprice': t0.sum_p_retailprice, 'sum_sum_expr': t0.sum_sum_expr}) + AGGREGATE(keys={'l_orderkey': l_orderkey, 'l_suppkey': l_suppkey}, aggregations={'sum_n_rows': SUM(n_rows), 'sum_p_retailprice': SUM(p_retailprice), 'sum_sum_expr': SUM(STARTSWITH(p_container, 'SM':string))}) + JOIN(condition=t0.l_partkey == t1.p_partkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'l_suppkey': t0.l_suppkey, 'n_rows': t0.n_rows, 'p_container': t1.p_container, 'p_retailprice': t1.p_retailprice}) + AGGREGATE(keys={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_suppkey': l_suppkey}, aggregations={'n_rows': COUNT()}) + FILTER(condition=(MONTH(l_shipdate) == 11:numeric) & (YEAR(l_shipdate) == 1996:numeric), columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_suppkey': l_suppkey}) + SCAN(table=tpch.LINEITEM, columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_shipdate': l_shipdate, 'l_suppkey': l_suppkey}) + SCAN(table=tpch.PART, columns={'p_container': p_container, 'p_partkey': p_partkey, 'p_retailprice': p_retailprice}) SCAN(table=tpch.SUPPLIER, columns={'s_acctbal': s_acctbal, 's_suppkey': s_suppkey}) AGGREGATE(keys={'l_orderkey': l_orderkey}, aggregations={'ndistinct_n_name': NDISTINCT(n_name)}) JOIN(condition=t0.s_nationkey == t1.n_nationkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'l_orderkey': t0.l_orderkey, 'n_name': t1.n_name}) diff --git a/tests/test_plan_refsols/correl_14.txt b/tests/test_plan_refsols/correl_14.txt index 4f5c79e63..d7e76ff88 100644 --- a/tests/test_plan_refsols/correl_14.txt +++ b/tests/test_plan_refsols/correl_14.txt @@ -2,7 +2,7 @@ ROOT(columns=[('n', ndistinct_ps_suppkey)], orderings=[]) AGGREGATE(keys={}, aggregations={'ndistinct_ps_suppkey': NDISTINCT(ps_suppkey)}) JOIN(condition=(t1.p_retailprice < (t0.ps_supplycost * 1.5:numeric)) & (t1.p_retailprice < (t0.sum_p_retailprice / t0.sum_expr)) & (t0.ps_partkey == t1.p_partkey), type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_FILTER, columns={'ps_suppkey': t0.ps_suppkey}) JOIN(condition=t0.ps_suppkey == t1.ps_suppkey, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'ps_partkey': t1.ps_partkey, 'ps_suppkey': t0.ps_suppkey, 'ps_supplycost': t1.ps_supplycost, 'sum_expr': t0.sum_expr, 'sum_p_retailprice': t0.sum_p_retailprice}) - AGGREGATE(keys={'ps_suppkey': ps_suppkey}, aggregations={'sum_expr': SUM(IFF(PRESENT(p_retailprice), 1:numeric, 0:numeric)), 'sum_p_retailprice': SUM(p_retailprice)}) + AGGREGATE(keys={'ps_suppkey': ps_suppkey}, aggregations={'sum_expr': SUM(PRESENT(p_retailprice)), 'sum_p_retailprice': SUM(p_retailprice)}) JOIN(condition=t0.s_suppkey == t1.ps_suppkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_FILTER, columns={'p_retailprice': t1.p_retailprice, 'ps_suppkey': t1.ps_suppkey}) FILTER(condition=(s_acctbal < 1000:numeric) & (s_nationkey == 19:numeric), columns={'s_suppkey': s_suppkey}) SCAN(table=tpch.SUPPLIER, columns={'s_acctbal': s_acctbal, 's_nationkey': s_nationkey, 's_suppkey': s_suppkey}) diff 
--git a/tests/test_plan_refsols/correl_15.txt b/tests/test_plan_refsols/correl_15.txt index 8af64f748..6e609c1ac 100644 --- a/tests/test_plan_refsols/correl_15.txt +++ b/tests/test_plan_refsols/correl_15.txt @@ -2,7 +2,7 @@ ROOT(columns=[('n', ndistinct_ps_suppkey)], orderings=[]) AGGREGATE(keys={}, aggregations={'ndistinct_ps_suppkey': NDISTINCT(ps_suppkey)}) JOIN(condition=(t1.p_retailprice < (t0.anything_avg_p_retailprice * 0.85:numeric)) & (t1.p_retailprice < (t0.ps_supplycost * 1.5:numeric)) & (t1.p_retailprice < (t0.sum_p_retailprice / t0.sum_expr)) & (t0.ps_partkey == t1.p_partkey), type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_FILTER, columns={'ps_suppkey': t0.ps_suppkey}) JOIN(condition=t0.ps_suppkey == t1.ps_suppkey, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'anything_avg_p_retailprice': t0.anything_avg_p_retailprice, 'ps_partkey': t1.ps_partkey, 'ps_suppkey': t0.ps_suppkey, 'ps_supplycost': t1.ps_supplycost, 'sum_expr': t0.sum_expr, 'sum_p_retailprice': t0.sum_p_retailprice}) - AGGREGATE(keys={'ps_suppkey': ps_suppkey}, aggregations={'anything_avg_p_retailprice': ANYTHING(avg_p_retailprice), 'sum_expr': SUM(IFF(PRESENT(p_retailprice), 1:numeric, 0:numeric)), 'sum_p_retailprice': SUM(p_retailprice)}) + AGGREGATE(keys={'ps_suppkey': ps_suppkey}, aggregations={'anything_avg_p_retailprice': ANYTHING(avg_p_retailprice), 'sum_expr': SUM(PRESENT(p_retailprice)), 'sum_p_retailprice': SUM(p_retailprice)}) JOIN(condition=t0.s_suppkey == t1.ps_suppkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_FILTER, columns={'avg_p_retailprice': t0.avg_p_retailprice, 'p_retailprice': t1.p_retailprice, 'ps_suppkey': t1.ps_suppkey}) JOIN(condition=True:bool, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'avg_p_retailprice': t0.avg_p_retailprice, 's_suppkey': t1.s_suppkey}) AGGREGATE(keys={}, aggregations={'avg_p_retailprice': AVG(p_retailprice)}) diff --git a/tests/test_plan_refsols/correl_35.txt b/tests/test_plan_refsols/correl_35.txt index 27354d988..39bda4366 100644 --- a/tests/test_plan_refsols/correl_35.txt +++ b/tests/test_plan_refsols/correl_35.txt @@ -10,15 +10,14 @@ ROOT(columns=[('n', n_rows)], orderings=[]) SCAN(table=tpch.SUPPLIER, columns={'s_nationkey': s_nationkey, 's_suppkey': s_suppkey}) FILTER(condition=YEAR(o_orderdate) == 1998:numeric, columns={'o_custkey': o_custkey, 'o_orderkey': o_orderkey, 'o_orderpriority': o_orderpriority}) SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey, 'o_orderpriority': o_orderpriority}) - FILTER(condition=sum_n_rows != 0:numeric, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey, 'o_orderpriority': o_orderpriority, 'p_type': p_type}) - AGGREGATE(keys={'c_custkey': c_custkey, 'c_nationkey': c_nationkey, 'o_orderpriority': o_orderpriority, 'p_type': p_type}, aggregations={'sum_n_rows': SUM(n_rows)}) - JOIN(condition=t0.l_partkey == t1.p_partkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'c_custkey': t0.c_custkey, 'c_nationkey': t0.c_nationkey, 'n_rows': t0.n_rows, 'o_orderpriority': t0.o_orderpriority, 'p_type': t1.p_type}) - AGGREGATE(keys={'c_custkey': c_custkey, 'c_nationkey': c_nationkey, 'l_partkey': l_partkey, 'o_orderpriority': o_orderpriority}, aggregations={'n_rows': COUNT()}) - JOIN(condition=t0.o_orderkey == t1.l_orderkey, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, 
columns={'c_custkey': t0.c_custkey, 'c_nationkey': t0.c_nationkey, 'l_partkey': t1.l_partkey, 'o_orderpriority': t0.o_orderpriority}) - JOIN(condition=t0.c_custkey == t1.o_custkey, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t0.c_custkey, 'c_nationkey': t0.c_nationkey, 'o_orderkey': t1.o_orderkey, 'o_orderpriority': t1.o_orderpriority}) - SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) - FILTER(condition=YEAR(o_orderdate) == 1997:numeric, columns={'o_custkey': o_custkey, 'o_orderkey': o_orderkey, 'o_orderpriority': o_orderpriority}) - SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey, 'o_orderpriority': o_orderpriority}) - FILTER(condition=(YEAR(l_shipdate) == 1997:numeric) & ISIN(MONTH(l_shipdate), [1, 2, 3]:array[numeric]), columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey}) - SCAN(table=tpch.LINEITEM, columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_shipdate': l_shipdate}) - SCAN(table=tpch.PART, columns={'p_partkey': p_partkey, 'p_type': p_type}) + AGGREGATE(keys={'c_custkey': c_custkey, 'c_nationkey': c_nationkey, 'o_orderpriority': o_orderpriority, 'p_type': p_type}, aggregations={}) + JOIN(condition=t0.l_partkey == t1.p_partkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'c_custkey': t0.c_custkey, 'c_nationkey': t0.c_nationkey, 'o_orderpriority': t0.o_orderpriority, 'p_type': t1.p_type}) + AGGREGATE(keys={'c_custkey': c_custkey, 'c_nationkey': c_nationkey, 'l_partkey': l_partkey, 'o_orderpriority': o_orderpriority}, aggregations={}) + JOIN(condition=t0.o_orderkey == t1.l_orderkey, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'c_custkey': t0.c_custkey, 'c_nationkey': t0.c_nationkey, 'l_partkey': t1.l_partkey, 'o_orderpriority': t0.o_orderpriority}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t0.c_custkey, 'c_nationkey': t0.c_nationkey, 'o_orderkey': t1.o_orderkey, 'o_orderpriority': t1.o_orderpriority}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + FILTER(condition=YEAR(o_orderdate) == 1997:numeric, columns={'o_custkey': o_custkey, 'o_orderkey': o_orderkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderdate': o_orderdate, 'o_orderkey': o_orderkey, 'o_orderpriority': o_orderpriority}) + FILTER(condition=(YEAR(l_shipdate) == 1997:numeric) & ISIN(MONTH(l_shipdate), [1, 2, 3]:array[numeric]), columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey}) + SCAN(table=tpch.LINEITEM, columns={'l_orderkey': l_orderkey, 'l_partkey': l_partkey, 'l_shipdate': l_shipdate}) + SCAN(table=tpch.PART, columns={'p_partkey': p_partkey, 'p_type': p_type}) diff --git a/tests/test_plan_refsols/count_multiple_filters_a.txt b/tests/test_plan_refsols/count_multiple_filters_a.txt new file mode 100644 index 000000000..f8f20f9eb --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_a.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n1', n_rows), ('n2', sum_expr)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((c_mktsegment == 'BUILDING':string))}) + FILTER(condition=MONOTONIC(500:numeric, c_acctbal, 600:numeric), columns={'c_mktsegment': c_mktsegment}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 
'c_mktsegment': c_mktsegment}) diff --git a/tests/test_plan_refsols/count_multiple_filters_b.txt b/tests/test_plan_refsols/count_multiple_filters_b.txt new file mode 100644 index 000000000..f8f20f9eb --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_b.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n1', n_rows), ('n2', sum_expr)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((c_mktsegment == 'BUILDING':string))}) + FILTER(condition=MONOTONIC(500:numeric, c_acctbal, 600:numeric), columns={'c_mktsegment': c_mktsegment}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_mktsegment': c_mktsegment}) diff --git a/tests/test_plan_refsols/count_multiple_filters_c.txt b/tests/test_plan_refsols/count_multiple_filters_c.txt new file mode 100644 index 000000000..c5974c333 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_c.txt @@ -0,0 +1,5 @@ +ROOT(columns=[('n1', n_rows), ('n2', sum_expr)], orderings=[]) + FILTER(condition=sum_expr != 0:numeric, columns={'n_rows': n_rows, 'sum_expr': sum_expr}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((c_mktsegment == 'BUILDING':string))}) + FILTER(condition=MONOTONIC(500:numeric, c_acctbal, 600:numeric), columns={'c_mktsegment': c_mktsegment}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_mktsegment': c_mktsegment}) diff --git a/tests/test_plan_refsols/count_multiple_filters_d.txt b/tests/test_plan_refsols/count_multiple_filters_d.txt new file mode 100644 index 000000000..c5974c333 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_d.txt @@ -0,0 +1,5 @@ +ROOT(columns=[('n1', n_rows), ('n2', sum_expr)], orderings=[]) + FILTER(condition=sum_expr != 0:numeric, columns={'n_rows': n_rows, 'sum_expr': sum_expr}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((c_mktsegment == 'BUILDING':string))}) + FILTER(condition=MONOTONIC(500:numeric, c_acctbal, 600:numeric), columns={'c_mktsegment': c_mktsegment}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_mktsegment': c_mktsegment}) diff --git a/tests/test_plan_refsols/count_multiple_filters_e.txt b/tests/test_plan_refsols/count_multiple_filters_e.txt new file mode 100644 index 000000000..a1b771090 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_e.txt @@ -0,0 +1,8 @@ +ROOT(columns=[('n1', n_rows), ('n2', agg_1), ('n3', sum_expr), ('n4', sum_expr_11), ('n5', sum_expr_13), ('n6', sum_expr_12)], orderings=[]) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t1.n_rows, 'n_rows': t0.n_rows, 'sum_expr': t0.sum_expr, 'sum_expr_11': t0.sum_expr_11, 'sum_expr_12': t0.sum_expr_12, 'sum_expr_13': t1.sum_expr}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((c_mktsegment == 'BUILDING':string)), 'sum_expr_11': SUM(STARTSWITH(c_phone, '11':string)), 'sum_expr_12': SUM(((c_mktsegment == 'BUILDING':string) & STARTSWITH(c_phone, '11':string)))}) + FILTER(condition=MONOTONIC(500:numeric, c_acctbal, 600:numeric), columns={'c_mktsegment': c_mktsegment, 'c_phone': c_phone}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_mktsegment': c_mktsegment, 'c_phone': c_phone}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM(STARTSWITH(c_phone, '11':string))}) + FILTER(condition=c_mktsegment == 'BUILDING':string, columns={'c_phone': c_phone}) + SCAN(table=tpch.CUSTOMER, columns={'c_mktsegment': c_mktsegment, 'c_phone': c_phone}) diff 
--git a/tests/test_plan_refsols/count_multiple_filters_f.txt b/tests/test_plan_refsols/count_multiple_filters_f.txt new file mode 100644 index 000000000..05cf0f52c --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_f.txt @@ -0,0 +1,3 @@ +ROOT(columns=[('n1', n_rows), ('n2', sum_expr), ('n3', sum_expr_13), ('n4', sum_expr_14), ('n5', sum_expr_15), ('n6', sum_expr_11)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((c_mktsegment == 'BUILDING':string)), 'sum_expr_11': SUM(((c_mktsegment == 'BUILDING':string) & MONOTONIC(500:numeric, c_acctbal, 600:numeric) & STARTSWITH(c_phone, '11':string))), 'sum_expr_13': SUM(MONOTONIC(500:numeric, c_acctbal, 600:numeric)), 'sum_expr_14': SUM(STARTSWITH(c_phone, '11':string)), 'sum_expr_15': SUM(((c_mktsegment == 'BUILDING':string) & STARTSWITH(c_phone, '11':string)))}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_mktsegment': c_mktsegment, 'c_phone': c_phone}) diff --git a/tests/test_plan_refsols/count_multiple_filters_g.txt b/tests/test_plan_refsols/count_multiple_filters_g.txt new file mode 100644 index 000000000..dbfd3ccf9 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_g.txt @@ -0,0 +1,30 @@ +ROOT(columns=[('n1', n_rows), ('n2', agg_1), ('n3', sum_expr_9), ('n4', agg_3), ('n5', agg_4), ('n6', sum_expr)], orderings=[]) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_3': t0.agg_3, 'agg_4': t1.n_rows, 'n_rows': t0.n_rows, 'sum_expr': t0.sum_expr, 'sum_expr_9': t0.sum_expr_9}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_3': t1.n_rows, 'n_rows': t0.n_rows, 'sum_expr': t0.sum_expr, 'sum_expr_9': t0.sum_expr_9}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'n_rows': t0.n_rows, 'sum_expr': t1.sum_expr, 'sum_expr_9': t1.sum_expr_9}) + JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t1.n_rows, 'n_rows': t0.n_rows}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=100:numeric == PERCENTILE(args=[], partition=[], order=[(c_acctbal):asc_last]), columns={}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=100:numeric == PERCENTILE(args=[], partition=[], order=[(c_acctbal):asc_last]), columns={}) + JOIN(condition=t0.c_nationkey == t1.n_nationkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_ACCESS, columns={'c_acctbal': t0.c_acctbal}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_nationkey': c_nationkey}) + FILTER(condition=n_name == 'GERMANY':string, columns={'n_nationkey': n_nationkey}) + SCAN(table=tpch.NATION, columns={'n_name': n_name, 'n_nationkey': n_nationkey}) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM((n_name == 'CHINA':string)), 'sum_expr_9': SUM((n_name == 'GERMANY':string))}) + JOIN(condition=t0.c_nationkey == t1.n_nationkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_ACCESS, columns={'n_name': t1.n_name}) + SCAN(table=tpch.CUSTOMER, columns={'c_nationkey': c_nationkey}) + FILTER(condition=(n_name == 'CHINA':string) | (n_name == 'GERMANY':string), columns={'n_name': n_name, 'n_nationkey': n_nationkey}) + SCAN(table=tpch.NATION, columns={'n_name': 
n_name, 'n_nationkey': n_nationkey}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=100:numeric == PERCENTILE(args=[], partition=[], order=[(c_acctbal):asc_last]), columns={}) + JOIN(condition=t0.c_nationkey == t1.n_nationkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_ACCESS, columns={'c_acctbal': t0.c_acctbal}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_nationkey': c_nationkey}) + FILTER(condition=n_name == 'CHINA':string, columns={'n_nationkey': n_nationkey}) + SCAN(table=tpch.NATION, columns={'n_name': n_name, 'n_nationkey': n_nationkey}) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) + FILTER(condition=(n_name == 'CHINA':string) & (100:numeric == PERCENTILE(args=[], partition=[], order=[(c_acctbal):asc_last])), columns={}) + JOIN(condition=t0.c_nationkey == t1.n_nationkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'c_acctbal': t0.c_acctbal, 'n_name': t1.n_name}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_nationkey': c_nationkey}) + SCAN(table=tpch.NATION, columns={'n_name': n_name, 'n_nationkey': n_nationkey}) diff --git a/tests/test_plan_refsols/count_multiple_filters_h.txt b/tests/test_plan_refsols/count_multiple_filters_h.txt new file mode 100644 index 000000000..61843c13f --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_h.txt @@ -0,0 +1,10 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', DEFAULT_TO(sum_n_rows, 0:numeric)), ('n3', DEFAULT_TO(sum_sum_expr, 0:numeric)), ('n4', DEFAULT_TO(sum_sum_expr_21, 0:numeric)), ('n5', DEFAULT_TO(sum_sum_expr_22, 0:numeric))], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_n_rows': t1.sum_n_rows, 'sum_sum_expr': t1.sum_sum_expr, 'sum_sum_expr_21': t1.sum_sum_expr_21, 'sum_sum_expr_22': t1.sum_sum_expr_22}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_n_rows': SUM(n_rows), 'sum_sum_expr': SUM(sum_expr), 'sum_sum_expr_21': SUM(sum_expr_21), 'sum_sum_expr_22': SUM(sum_expr_22)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'n_rows': t1.n_rows, 'sum_expr': t1.sum_expr, 'sum_expr_21': t1.sum_expr_21, 'sum_expr_22': t1.sum_expr_22}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((o_orderpriority == '1-URGENT':string)), 'sum_expr_21': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_22': SUM((o_orderpriority == '3-MEDIUM':string))}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_i.txt b/tests/test_plan_refsols/count_multiple_filters_i.txt new file mode 100644 index 000000000..cead6f2e7 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_i.txt @@ -0,0 
+1,11 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', DEFAULT_TO(sum_n_rows, 0:numeric)), ('n3', DEFAULT_TO(sum_sum_expr, 0:numeric)), ('n4', sum_sum_expr_21), ('n5', DEFAULT_TO(sum_sum_expr_22, 0:numeric))], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_n_rows': t1.sum_n_rows, 'sum_sum_expr': t1.sum_sum_expr, 'sum_sum_expr_21': t1.sum_sum_expr_21, 'sum_sum_expr_22': t1.sum_sum_expr_22}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + FILTER(condition=sum_sum_expr_21 != 0:numeric, columns={'n_regionkey': n_regionkey, 'n_rows': n_rows, 'sum_n_rows': sum_n_rows, 'sum_sum_expr': sum_sum_expr, 'sum_sum_expr_21': sum_sum_expr_21, 'sum_sum_expr_22': sum_sum_expr_22}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_n_rows': SUM(n_rows), 'sum_sum_expr': SUM(sum_expr), 'sum_sum_expr_21': SUM(sum_expr_21), 'sum_sum_expr_22': SUM(sum_expr_22)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'n_rows': t1.n_rows, 'sum_expr': t1.sum_expr, 'sum_expr_21': t1.sum_expr_21, 'sum_expr_22': t1.sum_expr_22}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((o_orderpriority == '1-URGENT':string)), 'sum_expr_21': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_22': SUM((o_orderpriority == '3-MEDIUM':string))}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_j.txt b/tests/test_plan_refsols/count_multiple_filters_j.txt new file mode 100644 index 000000000..4be13d8be --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_j.txt @@ -0,0 +1,11 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', DEFAULT_TO(sum_sum_expr, 0:numeric)), ('n3', DEFAULT_TO(sum_sum_expr_16, 0:numeric)), ('n4', DEFAULT_TO(sum_sum_expr_17, 0:numeric))], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_sum_expr': t1.sum_sum_expr_18, 'sum_sum_expr_16': t1.sum_sum_expr_16, 'sum_sum_expr_17': t1.sum_sum_expr_17}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_sum_expr_16': SUM(sum_expr), 'sum_sum_expr_17': SUM(sum_expr_17), 'sum_sum_expr_18': SUM(sum_expr_18)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'sum_expr': t1.sum_expr, 'sum_expr_17': t1.sum_expr_17, 'sum_expr_18': t1.sum_expr_18}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': 
t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_17': SUM((o_orderpriority == '3-MEDIUM':string)), 'sum_expr_18': SUM((o_orderpriority == '1-URGENT':string))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_k.txt b/tests/test_plan_refsols/count_multiple_filters_k.txt new file mode 100644 index 000000000..7db6ea09b --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_k.txt @@ -0,0 +1,12 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', sum_sum_expr_18), ('n3', DEFAULT_TO(sum_sum_expr, 0:numeric)), ('n4', DEFAULT_TO(sum_sum_expr_17, 0:numeric))], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_sum_expr': t1.sum_sum_expr, 'sum_sum_expr_17': t1.sum_sum_expr_17, 'sum_sum_expr_18': t1.sum_sum_expr_18}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + FILTER(condition=sum_sum_expr_18 != 0:numeric, columns={'n_regionkey': n_regionkey, 'n_rows': n_rows, 'sum_sum_expr': sum_sum_expr_16, 'sum_sum_expr_17': sum_sum_expr_17, 'sum_sum_expr_18': sum_sum_expr_18}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_sum_expr_16': SUM(sum_expr), 'sum_sum_expr_17': SUM(sum_expr_17), 'sum_sum_expr_18': SUM(sum_expr_18)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'sum_expr': t1.sum_expr, 'sum_expr_17': t1.sum_expr_17, 'sum_expr_18': t1.sum_expr_18}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_17': SUM((o_orderpriority == '3-MEDIUM':string)), 'sum_expr_18': SUM((o_orderpriority == '1-URGENT':string))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_l.txt b/tests/test_plan_refsols/count_multiple_filters_l.txt new file mode 100644 index 000000000..6444b26fb --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_l.txt @@ -0,0 +1,11 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', DEFAULT_TO(sum_sum_expr, 0:numeric)), ('n3', DEFAULT_TO(sum_sum_expr_16, 0:numeric)), ('n4', 
DEFAULT_TO(sum_sum_expr_17, 0:numeric))], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_sum_expr': t1.sum_sum_expr_18, 'sum_sum_expr_16': t1.sum_sum_expr_16, 'sum_sum_expr_17': t1.sum_sum_expr_17}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_sum_expr_16': SUM(sum_expr), 'sum_sum_expr_17': SUM(sum_expr_17), 'sum_sum_expr_18': SUM(sum_expr_18)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'sum_expr': t1.sum_expr, 'sum_expr_17': t1.sum_expr_17, 'sum_expr_18': t1.sum_expr_18}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM(((o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string))), 'sum_expr_17': SUM(((o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string))), 'sum_expr_18': SUM(((o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string)))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_m.txt b/tests/test_plan_refsols/count_multiple_filters_m.txt new file mode 100644 index 000000000..884c0e791 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_m.txt @@ -0,0 +1,12 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', sum_sum_expr_18), ('n3', DEFAULT_TO(sum_sum_expr, 0:numeric)), ('n4', DEFAULT_TO(sum_sum_expr_17, 0:numeric))], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_sum_expr': t1.sum_sum_expr, 'sum_sum_expr_17': t1.sum_sum_expr_17, 'sum_sum_expr_18': t1.sum_sum_expr_18}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + FILTER(condition=sum_sum_expr_18 != 0:numeric, columns={'n_regionkey': n_regionkey, 'n_rows': n_rows, 'sum_sum_expr': sum_sum_expr_16, 'sum_sum_expr_17': sum_sum_expr_17, 'sum_sum_expr_18': sum_sum_expr_18}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_sum_expr_16': SUM(sum_expr), 'sum_sum_expr_17': SUM(sum_expr_17), 'sum_sum_expr_18': SUM(sum_expr_18)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'sum_expr': t1.sum_expr, 'sum_expr_17': t1.sum_expr_17, 'sum_expr_18': t1.sum_expr_18}) + 
JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM(((o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string))), 'sum_expr_17': SUM(((o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string))), 'sum_expr_18': SUM(((o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string)))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_n.txt b/tests/test_plan_refsols/count_multiple_filters_n.txt new file mode 100644 index 000000000..cabdbbe66 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_n.txt @@ -0,0 +1,12 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', sum_sum_expr_18), ('n3', sum_sum_expr), ('n4', DEFAULT_TO(sum_sum_expr_17, 0:numeric))], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_sum_expr': t1.sum_sum_expr, 'sum_sum_expr_17': t1.sum_sum_expr_17, 'sum_sum_expr_18': t1.sum_sum_expr_18}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + FILTER(condition=(sum_sum_expr_16 != 0:numeric) & (sum_sum_expr_18 != 0:numeric), columns={'n_regionkey': n_regionkey, 'n_rows': n_rows, 'sum_sum_expr': sum_sum_expr_16, 'sum_sum_expr_17': sum_sum_expr_17, 'sum_sum_expr_18': sum_sum_expr_18}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_sum_expr_16': SUM(sum_expr), 'sum_sum_expr_17': SUM(sum_expr_17), 'sum_sum_expr_18': SUM(sum_expr_18)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'sum_expr': t1.sum_expr, 'sum_expr_17': t1.sum_expr_17, 'sum_expr_18': t1.sum_expr_18}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM(((o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string))), 'sum_expr_17': SUM(((o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string))), 'sum_expr_18': SUM(((o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string)))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '2-HIGH':string) | 
(o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_o.txt b/tests/test_plan_refsols/count_multiple_filters_o.txt new file mode 100644 index 000000000..e405bf10a --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_o.txt @@ -0,0 +1,12 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', sum_sum_expr_18), ('n3', DEFAULT_TO(sum_sum_expr, 0:numeric)), ('n4', sum_sum_expr_17)], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_sum_expr': t1.sum_sum_expr, 'sum_sum_expr_17': t1.sum_sum_expr_17, 'sum_sum_expr_18': t1.sum_sum_expr_18}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + FILTER(condition=(sum_sum_expr_17 != 0:numeric) & (sum_sum_expr_18 != 0:numeric), columns={'n_regionkey': n_regionkey, 'n_rows': n_rows, 'sum_sum_expr': sum_sum_expr_16, 'sum_sum_expr_17': sum_sum_expr_17, 'sum_sum_expr_18': sum_sum_expr_18}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_sum_expr_16': SUM(sum_expr), 'sum_sum_expr_17': SUM(sum_expr_17), 'sum_sum_expr_18': SUM(sum_expr_18)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'sum_expr': t1.sum_expr, 'sum_expr_17': t1.sum_expr_17, 'sum_expr_18': t1.sum_expr_18}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM(((o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string))), 'sum_expr_17': SUM(((o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string))), 'sum_expr_18': SUM(((o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string)))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_p.txt b/tests/test_plan_refsols/count_multiple_filters_p.txt new file mode 100644 index 000000000..286ea2ea5 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_p.txt @@ -0,0 +1,12 @@ +ROOT(columns=[('region_name', r_name), ('n1', n_rows), ('n2', sum_sum_expr_18), ('n3', sum_sum_expr), ('n4', sum_sum_expr_17)], orderings=[]) + JOIN(condition=t0.r_regionkey == t1.n_regionkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 'r_name': t0.r_name, 'sum_sum_expr': 
t1.sum_sum_expr, 'sum_sum_expr_17': t1.sum_sum_expr_17, 'sum_sum_expr_18': t1.sum_sum_expr_18}) + SCAN(table=tpch.REGION, columns={'r_name': r_name, 'r_regionkey': r_regionkey}) + FILTER(condition=(sum_sum_expr_16 != 0:numeric) & (sum_sum_expr_17 != 0:numeric) & (sum_sum_expr_18 != 0:numeric), columns={'n_regionkey': n_regionkey, 'n_rows': n_rows, 'sum_sum_expr': sum_sum_expr_16, 'sum_sum_expr_17': sum_sum_expr_17, 'sum_sum_expr_18': sum_sum_expr_18}) + AGGREGATE(keys={'n_regionkey': n_regionkey}, aggregations={'n_rows': COUNT(), 'sum_sum_expr_16': SUM(sum_expr), 'sum_sum_expr_17': SUM(sum_expr_17), 'sum_sum_expr_18': SUM(sum_expr_18)}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_regionkey': t0.n_regionkey, 'sum_expr': t1.sum_expr, 'sum_expr_17': t1.sum_expr_17, 'sum_expr_18': t1.sum_expr_18}) + JOIN(condition=t0.n_nationkey == t1.c_nationkey, type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=SINGULAR_ACCESS, columns={'c_custkey': t1.c_custkey, 'n_regionkey': t0.n_regionkey}) + SCAN(table=tpch.NATION, columns={'n_nationkey': n_nationkey, 'n_regionkey': n_regionkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey, 'c_nationkey': c_nationkey}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM(((o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string))), 'sum_expr_17': SUM(((o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string))), 'sum_expr_18': SUM(((o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string)))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '3-MEDIUM':string) | (o_orderpriority == '4-NOT SPECIFIED':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_q.txt b/tests/test_plan_refsols/count_multiple_filters_q.txt new file mode 100644 index 000000000..45cc47cfd --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_q.txt @@ -0,0 +1,7 @@ +ROOT(columns=[('customer_key', c_custkey), ('n1', DEFAULT_TO(sum_expr_8, 0:numeric)), ('n2', DEFAULT_TO(sum_expr, 0:numeric)), ('n3', DEFAULT_TO(sum_expr_7, 0:numeric))], orderings=[]) + AGGREGATE(keys={'c_custkey': c_custkey}, aggregations={'sum_expr': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_7': SUM((o_orderpriority == '3-MEDIUM':string)), 'sum_expr_8': SUM((o_orderpriority == '1-URGENT':string))}) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=LEFT, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'c_custkey': t0.c_custkey, 'o_orderpriority': t1.o_orderpriority}) + FILTER(condition=ISIN(c_custkey, [2, 3, 4, 17, 23, 26, 380, 827]:array[unknown]), columns={'c_custkey': c_custkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_r.txt b/tests/test_plan_refsols/count_multiple_filters_r.txt 
new file mode 100644 index 000000000..ebf5e0ba1 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_r.txt @@ -0,0 +1,8 @@ +ROOT(columns=[('customer_key', c_custkey), ('n1', sum_expr_8), ('n2', sum_expr), ('n3', sum_expr_7)], orderings=[]) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'c_custkey': t0.c_custkey, 'sum_expr': t1.sum_expr, 'sum_expr_7': t1.sum_expr_7, 'sum_expr_8': t1.sum_expr_8}) + FILTER(condition=ISIN(c_custkey, [2, 3, 4, 17, 23, 26, 380, 827]:array[unknown]), columns={'c_custkey': c_custkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey}) + FILTER(condition=sum_expr_8 != 0:numeric, columns={'o_custkey': o_custkey, 'sum_expr': sum_expr, 'sum_expr_7': sum_expr_7, 'sum_expr_8': sum_expr_8}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_7': SUM((o_orderpriority == '3-MEDIUM':string)), 'sum_expr_8': SUM((o_orderpriority == '1-URGENT':string))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_s.txt b/tests/test_plan_refsols/count_multiple_filters_s.txt new file mode 100644 index 000000000..c0c15134c --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_s.txt @@ -0,0 +1,8 @@ +ROOT(columns=[('customer_key', c_custkey), ('n1', sum_expr_8), ('n2', sum_expr), ('n3', sum_expr_7)], orderings=[]) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'c_custkey': t0.c_custkey, 'sum_expr': t1.sum_expr, 'sum_expr_7': t1.sum_expr_7, 'sum_expr_8': t1.sum_expr_8}) + FILTER(condition=ISIN(c_custkey, [2, 3, 4, 17, 23, 26, 380, 827]:array[unknown]), columns={'c_custkey': c_custkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey}) + FILTER(condition=sum_expr != 0:numeric, columns={'o_custkey': o_custkey, 'sum_expr': sum_expr, 'sum_expr_7': sum_expr_7, 'sum_expr_8': sum_expr_8}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_7': SUM((o_orderpriority == '3-MEDIUM':string)), 'sum_expr_8': SUM((o_orderpriority == '1-URGENT':string))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_t.txt b/tests/test_plan_refsols/count_multiple_filters_t.txt new file mode 100644 index 000000000..8dde513f8 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_t.txt @@ -0,0 +1,8 @@ +ROOT(columns=[('customer_key', c_custkey), ('n1', sum_expr_8), ('n2', sum_expr), ('n3', sum_expr_7)], orderings=[]) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'c_custkey': t0.c_custkey, 'sum_expr': t1.sum_expr, 'sum_expr_7': t1.sum_expr_7, 'sum_expr_8': t1.sum_expr_8}) + FILTER(condition=ISIN(c_custkey, [2, 3, 4, 17, 23, 26, 380, 
827]:array[unknown]), columns={'c_custkey': c_custkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey}) + FILTER(condition=sum_expr_7 != 0:numeric, columns={'o_custkey': o_custkey, 'sum_expr': sum_expr, 'sum_expr_7': sum_expr_7, 'sum_expr_8': sum_expr_8}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_7': SUM((o_orderpriority == '3-MEDIUM':string)), 'sum_expr_8': SUM((o_orderpriority == '1-URGENT':string))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_u.txt b/tests/test_plan_refsols/count_multiple_filters_u.txt new file mode 100644 index 000000000..35b408288 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_u.txt @@ -0,0 +1,8 @@ +ROOT(columns=[('customer_key', c_custkey), ('n1', sum_expr_8), ('n2', sum_expr), ('n3', sum_expr_7)], orderings=[]) + JOIN(condition=t0.c_custkey == t1.o_custkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'c_custkey': t0.c_custkey, 'sum_expr': t1.sum_expr, 'sum_expr_7': t1.sum_expr_7, 'sum_expr_8': t1.sum_expr_8}) + FILTER(condition=ISIN(c_custkey, [2, 3, 4, 17, 23, 26, 380, 827]:array[unknown]), columns={'c_custkey': c_custkey}) + SCAN(table=tpch.CUSTOMER, columns={'c_custkey': c_custkey}) + FILTER(condition=(sum_expr != 0:numeric) & (sum_expr_7 != 0:numeric) & (sum_expr_8 != 0:numeric), columns={'o_custkey': o_custkey, 'sum_expr': sum_expr, 'sum_expr_7': sum_expr_7, 'sum_expr_8': sum_expr_8}) + AGGREGATE(keys={'o_custkey': o_custkey}, aggregations={'sum_expr': SUM((o_orderpriority == '2-HIGH':string)), 'sum_expr_7': SUM((o_orderpriority == '3-MEDIUM':string)), 'sum_expr_8': SUM((o_orderpriority == '1-URGENT':string))}) + FILTER(condition=(o_orderpriority == '1-URGENT':string) | (o_orderpriority == '2-HIGH':string) | (o_orderpriority == '3-MEDIUM':string), columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) + SCAN(table=tpch.ORDERS, columns={'o_custkey': o_custkey, 'o_orderpriority': o_orderpriority}) diff --git a/tests/test_plan_refsols/count_multiple_filters_v.txt b/tests/test_plan_refsols/count_multiple_filters_v.txt new file mode 100644 index 000000000..a0bd8dced --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_v.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('n2', sum_expr), ('n3', sum_expr_4)], orderings=[]) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM(MONOTONIC(500:numeric, c_acctbal, 1000:numeric)), 'sum_expr_4': SUM(NOT(MONOTONIC(500:numeric, c_acctbal, 1000:numeric)))}) + FILTER(condition=(((c_mktsegment == 'BUILDING':string) & NOT(MONOTONIC(500:numeric, c_acctbal, 1000:numeric))) | (c_mktsegment == 'BUILDING':string)) & (((c_mktsegment == 'BUILDING':string) & NOT(MONOTONIC(500:numeric, c_acctbal, 1000:numeric))) | MONOTONIC(500:numeric, c_acctbal, 1000:numeric)), columns={'c_acctbal': c_acctbal}) + SCAN(table=tpch.CUSTOMER, columns={'c_acctbal': c_acctbal, 'c_mktsegment': c_mktsegment}) diff --git a/tests/test_plan_refsols/count_multiple_filters_w.txt b/tests/test_plan_refsols/count_multiple_filters_w.txt new file mode 100644 index 000000000..7dc0c76f7 --- /dev/null +++ b/tests/test_plan_refsols/count_multiple_filters_w.txt @@ -0,0 +1,4 @@ 
+ROOT(columns=[('n1', sum_expr), ('n2', sum_expr_14), ('n3', sum_expr_15), ('n4', sum_expr_16), ('n5', sum_expr_17), ('n6', sum_expr_12)], orderings=[]) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM(((c_mktsegment == 'BUILDING':string) & STARTSWITH(c_phone, '30':string))), 'sum_expr_12': SUM(((c_mktsegment == 'HOUSEHOLD':string) & STARTSWITH(c_phone, '32':string))), 'sum_expr_14': SUM(((c_mktsegment == 'BUILDING':string) & STARTSWITH(c_phone, '31':string))), 'sum_expr_15': SUM(((c_mktsegment == 'BUILDING':string) & STARTSWITH(c_phone, '32':string))), 'sum_expr_16': SUM(((c_mktsegment == 'HOUSEHOLD':string) & STARTSWITH(c_phone, '30':string))), 'sum_expr_17': SUM(((c_mktsegment == 'HOUSEHOLD':string) & STARTSWITH(c_phone, '31':string)))}) + FILTER(condition=((c_mktsegment == 'BUILDING':string) & STARTSWITH(c_phone, '30':string)) | ((c_mktsegment == 'BUILDING':string) & STARTSWITH(c_phone, '31':string)) | ((c_mktsegment == 'BUILDING':string) & STARTSWITH(c_phone, '32':string)) | ((c_mktsegment == 'HOUSEHOLD':string) & STARTSWITH(c_phone, '30':string)) | ((c_mktsegment == 'HOUSEHOLD':string) & STARTSWITH(c_phone, '31':string)) | ((c_mktsegment == 'HOUSEHOLD':string) & STARTSWITH(c_phone, '32':string)), columns={'c_mktsegment': c_mktsegment, 'c_phone': c_phone}) + SCAN(table=tpch.CUSTOMER, columns={'c_mktsegment': c_mktsegment, 'c_phone': c_phone}) diff --git a/tests/test_plan_refsols/cryptbank_general_join_01_raw.txt b/tests/test_plan_refsols/cryptbank_general_join_01_raw.txt index 55924e1c2..a8ac454a8 100644 --- a/tests/test_plan_refsols/cryptbank_general_join_01_raw.txt +++ b/tests/test_plan_refsols/cryptbank_general_join_01_raw.txt @@ -6,7 +6,7 @@ ROOT(columns=[('branch_key', b_key), ('n_local_cust', n_rows), ('n_local_cust_lo SCAN(table=CRBNK.CUSTOMERS, columns={'c_addr': c_addr, 'c_key': c_key}) AGGREGATE(keys={'b_key': b_key, 'unmask_c_key': UNMASK::((42 - ([c_key])))}, aggregations={'n_rows': COUNT()}) JOIN(condition=(UNMASK::((42 - ([t0.c_key]))) == t1.a_custkey) & (t0.b_key == t1.a_branchkey), type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'b_key': t0.b_key, 'c_key': t0.c_key}) - JOIN(condition=SLICE(UNMASK::(SUBSTRING([t1.c_addr], -1) || SUBSTRING([t1.c_addr], 1, LENGTH([t1.c_addr]) - 1)), -8:numeric, -6:numeric, None:unknown) == SLICE(t0.b_addr, -8:numeric, -6:numeric, None:unknown), type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'b_key': t0.b_key, 'c_key': t1.c_key}) + JOIN(condition=SLICE(UNMASK::(SUBSTRING([t1.c_addr], -1) || SUBSTRING([t1.c_addr], 1, LENGTH([t1.c_addr]) - 1)), -8:numeric, -6:numeric, None:unknown) == SLICE(t0.b_addr, -8:numeric, -6:numeric, None:unknown), type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'b_key': t0.b_key, 'c_key': t1.c_key}) SCAN(table=CRBNK.BRANCHES, columns={'b_addr': b_addr, 'b_key': b_key}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_addr': c_addr, 'c_key': c_key}) SCAN(table=CRBNK.ACCOUNTS, columns={'a_branchkey': a_branchkey, 'a_custkey': a_custkey}) diff --git a/tests/test_plan_refsols/cryptbank_general_join_01_rewrite.txt b/tests/test_plan_refsols/cryptbank_general_join_01_rewrite.txt index 55924e1c2..a8ac454a8 100644 --- a/tests/test_plan_refsols/cryptbank_general_join_01_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_general_join_01_rewrite.txt @@ -6,7 +6,7 @@ ROOT(columns=[('branch_key', b_key), ('n_local_cust', n_rows), ('n_local_cust_lo SCAN(table=CRBNK.CUSTOMERS, columns={'c_addr': c_addr, 'c_key': c_key}) 
AGGREGATE(keys={'b_key': b_key, 'unmask_c_key': UNMASK::((42 - ([c_key])))}, aggregations={'n_rows': COUNT()}) JOIN(condition=(UNMASK::((42 - ([t0.c_key]))) == t1.a_custkey) & (t0.b_key == t1.a_branchkey), type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=SINGULAR_FILTER, columns={'b_key': t0.b_key, 'c_key': t0.c_key}) - JOIN(condition=SLICE(UNMASK::(SUBSTRING([t1.c_addr], -1) || SUBSTRING([t1.c_addr], 1, LENGTH([t1.c_addr]) - 1)), -8:numeric, -6:numeric, None:unknown) == SLICE(t0.b_addr, -8:numeric, -6:numeric, None:unknown), type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'b_key': t0.b_key, 'c_key': t1.c_key}) + JOIN(condition=SLICE(UNMASK::(SUBSTRING([t1.c_addr], -1) || SUBSTRING([t1.c_addr], 1, LENGTH([t1.c_addr]) - 1)), -8:numeric, -6:numeric, None:unknown) == SLICE(t0.b_addr, -8:numeric, -6:numeric, None:unknown), type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=PLURAL_FILTER, columns={'b_key': t0.b_key, 'c_key': t1.c_key}) SCAN(table=CRBNK.BRANCHES, columns={'b_addr': b_addr, 'b_key': b_key}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_addr': c_addr, 'c_key': c_key}) SCAN(table=CRBNK.ACCOUNTS, columns={'a_branchkey': a_branchkey, 'a_custkey': a_custkey}) diff --git a/tests/test_plan_refsols/cryptbank_general_join_02_raw.txt b/tests/test_plan_refsols/cryptbank_general_join_02_raw.txt index c2845023f..9c13fbeef 100644 --- a/tests/test_plan_refsols/cryptbank_general_join_02_raw.txt +++ b/tests/test_plan_refsols/cryptbank_general_join_02_raw.txt @@ -2,6 +2,6 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) JOIN(condition=(UNMASK::((42 - ([t1.c_key]))) == t0.a_custkey) & (t0.a_branchkey == t1.b_key), type=SEMI, columns={}) SCAN(table=CRBNK.ACCOUNTS, columns={'a_branchkey': a_branchkey, 'a_custkey': a_custkey}) - JOIN(condition=SLICE(UNMASK::(SUBSTRING([t0.c_addr], -1) || SUBSTRING([t0.c_addr], 1, LENGTH([t0.c_addr]) - 1)), -8:numeric, -6:numeric, None:unknown) == SLICE(t1.b_addr, -8:numeric, -6:numeric, None:unknown), type=INNER, cardinality=PLURAL_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'b_key': t1.b_key, 'c_key': t0.c_key}) + JOIN(condition=SLICE(UNMASK::(SUBSTRING([t0.c_addr], -1) || SUBSTRING([t0.c_addr], 1, LENGTH([t0.c_addr]) - 1)), -8:numeric, -6:numeric, None:unknown) == SLICE(t1.b_addr, -8:numeric, -6:numeric, None:unknown), type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=PLURAL_ACCESS, columns={'b_key': t1.b_key, 'c_key': t0.c_key}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_addr': c_addr, 'c_key': c_key}) SCAN(table=CRBNK.BRANCHES, columns={'b_addr': b_addr, 'b_key': b_key}) diff --git a/tests/test_plan_refsols/cryptbank_general_join_02_rewrite.txt b/tests/test_plan_refsols/cryptbank_general_join_02_rewrite.txt index c2845023f..9c13fbeef 100644 --- a/tests/test_plan_refsols/cryptbank_general_join_02_rewrite.txt +++ b/tests/test_plan_refsols/cryptbank_general_join_02_rewrite.txt @@ -2,6 +2,6 @@ ROOT(columns=[('n', n_rows)], orderings=[]) AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) JOIN(condition=(UNMASK::((42 - ([t1.c_key]))) == t0.a_custkey) & (t0.a_branchkey == t1.b_key), type=SEMI, columns={}) SCAN(table=CRBNK.ACCOUNTS, columns={'a_branchkey': a_branchkey, 'a_custkey': a_custkey}) - JOIN(condition=SLICE(UNMASK::(SUBSTRING([t0.c_addr], -1) || SUBSTRING([t0.c_addr], 1, LENGTH([t0.c_addr]) - 1)), -8:numeric, -6:numeric, None:unknown) == SLICE(t1.b_addr, -8:numeric, -6:numeric, None:unknown), type=INNER, cardinality=PLURAL_ACCESS, 
reverse_cardinality=PLURAL_ACCESS, columns={'b_key': t1.b_key, 'c_key': t0.c_key}) + JOIN(condition=SLICE(UNMASK::(SUBSTRING([t0.c_addr], -1) || SUBSTRING([t0.c_addr], 1, LENGTH([t0.c_addr]) - 1)), -8:numeric, -6:numeric, None:unknown) == SLICE(t1.b_addr, -8:numeric, -6:numeric, None:unknown), type=INNER, cardinality=PLURAL_FILTER, reverse_cardinality=PLURAL_ACCESS, columns={'b_key': t1.b_key, 'c_key': t0.c_key}) SCAN(table=CRBNK.CUSTOMERS, columns={'c_addr': c_addr, 'c_key': c_key}) SCAN(table=CRBNK.BRANCHES, columns={'b_addr': b_addr, 'b_key': b_key}) diff --git a/tests/test_plan_refsols/donor_3276.txt b/tests/test_plan_refsols/donor_3276.txt new file mode 100644 index 000000000..9c51eab5c --- /dev/null +++ b/tests/test_plan_refsols/donor_3276.txt @@ -0,0 +1,4 @@ +ROOT(columns=[('percentage_suburban', (100.0:numeric * suburban_sb_projects) / n_rows)], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'suburban_sb_projects': SUM((LOWER(school_metro) == 'suburban':string))}) + FILTER(condition=LOWER(school_city) == 'santa barbara':string, columns={'school_metro': school_metro}) + SCAN(table=main.projects, columns={'school_city': school_city, 'school_metro': school_metro}) diff --git a/tests/test_plan_refsols/movielens_2274.txt b/tests/test_plan_refsols/movielens_2274.txt new file mode 100644 index 000000000..7e1099d9d --- /dev/null +++ b/tests/test_plan_refsols/movielens_2274.txt @@ -0,0 +1,6 @@ +ROOT(columns=[('percentage_of_female_users', IFF((n_rows > 0:numeric), ((100.0:numeric * sum_expr) / n_rows), 0.0:numeric))], orderings=[]) + AGGREGATE(keys={}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM((LOWER(u_gender) == 'f':string))}) + JOIN(condition=t0.userid == t1.userid, type=SEMI, columns={'u_gender': t0.u_gender}) + SCAN(table=main.users, columns={'u_gender': u_gender, 'userid': userid}) + FILTER(condition=rating == 2:numeric, columns={'userid': userid}) + SCAN(table=main.u2base, columns={'rating': rating, 'userid': userid}) diff --git a/tests/test_plan_refsols/retail_transactions_ts_raw.txt b/tests/test_plan_refsols/retail_transactions_ts_raw.txt index 3d045b55a..3a5e18ca3 100644 --- a/tests/test_plan_refsols/retail_transactions_ts_raw.txt +++ b/tests/test_plan_refsols/retail_transactions_ts_raw.txt @@ -1,20 +1,4 @@ -ROOT(columns=[('n1', n_rows), ('n2', agg_1), ('n3', agg_2), ('n4', agg_3), ('n5', agg_4)], orderings=[]) - JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t0.agg_2, 'agg_3': t0.agg_3, 'agg_4': t1.n_rows, 'n_rows': t0.n_rows}) - JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t0.agg_2, 'agg_3': t1.n_rows, 'n_rows': t0.n_rows}) - JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t1.n_rows, 'n_rows': t0.n_rows}) - JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t1.n_rows, 'n_rows': t0.n_rows}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=(DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 1:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 7:numeric), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - 
FILTER(condition=(DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 2:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 7:numeric), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=(DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 1:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 8:numeric), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=(DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 2:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 8:numeric), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 4:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 3:numeric)) | ((HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 3:numeric) & (MINUTE(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == SECOND(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))))), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) +ROOT(columns=[('n1', sum_expr), ('n2', sum_expr_10), ('n3', sum_expr_11), ('n4', sum_expr_12), ('n5', sum_expr_13)], orderings=[]) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM(((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 1:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 7:numeric))), 'sum_expr_10': SUM(((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 2:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 7:numeric))), 'sum_expr_11': SUM(((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 1:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 8:numeric))), 'sum_expr_12': SUM(((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 2:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 8:numeric))), 'sum_expr_13': SUM((((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 4:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 3:numeric)) | ((HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 3:numeric) & (MINUTE(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == SECOND(UNMASK::(PTY_UNPROTECT_TS([transaction_date])))))))}) + FILTER(condition=((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 4:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 3:numeric)) | ((HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) < 3:numeric) & (MINUTE(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == SECOND(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))))) | ((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 1:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 7:numeric)) | ((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 1:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 8:numeric)) | ((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 2:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 7:numeric)) | ((DAY(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 2:numeric) & (HOUR(UNMASK::(PTY_UNPROTECT_TS([transaction_date]))) == 8:numeric)), columns={'transaction_date': transaction_date}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': 
transaction_date}) diff --git a/tests/test_plan_refsols/retail_transactions_ts_rewrite.txt b/tests/test_plan_refsols/retail_transactions_ts_rewrite.txt index 2f111a2ba..d8eeeee22 100644 --- a/tests/test_plan_refsols/retail_transactions_ts_rewrite.txt +++ b/tests/test_plan_refsols/retail_transactions_ts_rewrite.txt @@ -1,20 +1,4 @@ -ROOT(columns=[('n1', n_rows), ('n2', agg_1), ('n3', agg_2), ('n4', agg_3), ('n5', agg_4)], orderings=[]) - JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t0.agg_2, 'agg_3': t0.agg_3, 'agg_4': t1.n_rows, 'n_rows': t0.n_rows}) - JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t0.agg_2, 'agg_3': t1.n_rows, 'n_rows': t0.n_rows}) - JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t0.agg_1, 'agg_2': t1.n_rows, 'n_rows': t0.n_rows}) - JOIN(condition=True:bool, type=INNER, cardinality=SINGULAR_ACCESS, reverse_cardinality=PLURAL_ACCESS, columns={'agg_1': t1.n_rows, 'n_rows': t0.n_rows}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ISIN(transaction_date, ['2178-03-20 07:19:29', '2825-09-23 07:37:08']:array[unknown]), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ISIN(transaction_date, ['1010-12-08 07:23:35', '2328-01-19 07:33:25', '1577-03-20 07:41:29', '1345-03-06 07:41:47', '0937-05-21 07:27:48', '2176-01-07 07:07:03']:array[unknown]), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ISIN(transaction_date, ['3120-07-22 08:30:44', '1890-02-18 08:21:13', '1890-02-18 08:46:51']:array[unknown]), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ISIN(transaction_date, ['2052-11-18 08:24:33', '2052-11-18 08:32:00', '1577-03-20 08:03:51', '1577-03-20 08:32:17', '2550-01-17 08:56:44', '1551-03-04 08:36:08']:array[unknown]), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) - AGGREGATE(keys={}, aggregations={'n_rows': COUNT()}) - FILTER(condition=ISIN(transaction_date, ['1752-07-20 01:18:18', '1880-04-06 00:47:47', '2956-09-24 00:03:03', '1868-06-13 01:22:22', '0780-03-23 01:14:14', '1598-04-24 01:11:11', '0763-04-15 00:16:16', '2780-03-19 01:32:32']:array[unknown]) | ISIN(transaction_date, ['2268-07-06 01:50:11', '3056-08-07 01:18:26', '3120-07-22 02:43:20', '1010-12-08 01:47:15', '1440-10-15 02:26:30', '3054-12-02 00:51:55', '3031-02-17 00:54:21', '1539-02-23 00:49:34', '2418-09-09 01:12:48', '2418-09-09 02:09:31', '2551-01-12 00:34:57', '3141-01-25 02:24:01', '2328-01-19 01:20:40', '1577-03-20 00:27:19', '1608-08-20 00:10:55', '1608-08-20 01:12:55', '1608-08-20 02:14:47', '2825-09-23 02:31:19', '1286-12-21 00:21:24', '1286-12-21 01:25:46', '3300-07-12 00:15:35', '2059-07-23 01:56:15', '2955-06-27 00:48:34', '2955-06-27 01:24:43', '0937-05-21 00:40:43', '0930-11-28 02:44:19', '1605-10-12 00:58:57', '0781-08-29 02:28:10', '2374-09-21 00:21:42', '2374-09-21 02:10:55', '3022-05-13 01:56:21', '3088-03-30 01:09:15', '3088-03-30 02:38:56', '1757-01-16 00:20:29', '3287-10-20 01:17:31', 
'2555-09-08 00:40:20', '2555-09-08 01:20:22', '2555-09-08 02:36:58', '2176-01-07 02:50:08', '2282-06-16 00:21:35', '2595-05-23 01:32:01', '3237-05-26 01:19:24', '3237-05-26 01:52:49', '2780-03-19 01:32:32', '2780-03-19 02:33:01']:array[unknown]), columns={}) - SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) +ROOT(columns=[('n1', sum_expr), ('n2', sum_expr_10), ('n3', sum_expr_11), ('n4', sum_expr_12), ('n5', sum_expr_13)], orderings=[]) + AGGREGATE(keys={}, aggregations={'sum_expr': SUM(ISIN(transaction_date, ['2178-03-20 07:19:29', '2825-09-23 07:37:08']:array[unknown])), 'sum_expr_10': SUM(ISIN(transaction_date, ['1010-12-08 07:23:35', '2328-01-19 07:33:25', '1577-03-20 07:41:29', '1345-03-06 07:41:47', '0937-05-21 07:27:48', '2176-01-07 07:07:03']:array[unknown])), 'sum_expr_11': SUM(ISIN(transaction_date, ['3120-07-22 08:30:44', '1890-02-18 08:21:13', '1890-02-18 08:46:51']:array[unknown])), 'sum_expr_12': SUM(ISIN(transaction_date, ['2052-11-18 08:24:33', '2052-11-18 08:32:00', '1577-03-20 08:03:51', '1577-03-20 08:32:17', '2550-01-17 08:56:44', '1551-03-04 08:36:08']:array[unknown])), 'sum_expr_13': SUM((ISIN(transaction_date, ['1752-07-20 01:18:18', '1880-04-06 00:47:47', '2956-09-24 00:03:03', '1868-06-13 01:22:22', '0780-03-23 01:14:14', '1598-04-24 01:11:11', '0763-04-15 00:16:16', '2780-03-19 01:32:32']:array[unknown]) | ISIN(transaction_date, ['2268-07-06 01:50:11', '3056-08-07 01:18:26', '3120-07-22 02:43:20', '1010-12-08 01:47:15', '1440-10-15 02:26:30', '3054-12-02 00:51:55', '3031-02-17 00:54:21', '1539-02-23 00:49:34', '2418-09-09 01:12:48', '2418-09-09 02:09:31', '2551-01-12 00:34:57', '3141-01-25 02:24:01', '2328-01-19 01:20:40', '1577-03-20 00:27:19', '1608-08-20 00:10:55', '1608-08-20 01:12:55', '1608-08-20 02:14:47', '2825-09-23 02:31:19', '1286-12-21 00:21:24', '1286-12-21 01:25:46', '3300-07-12 00:15:35', '2059-07-23 01:56:15', '2955-06-27 00:48:34', '2955-06-27 01:24:43', '0937-05-21 00:40:43', '0930-11-28 02:44:19', '1605-10-12 00:58:57', '0781-08-29 02:28:10', '2374-09-21 00:21:42', '2374-09-21 02:10:55', '3022-05-13 01:56:21', '3088-03-30 01:09:15', '3088-03-30 02:38:56', '1757-01-16 00:20:29', '3287-10-20 01:17:31', '2555-09-08 00:40:20', '2555-09-08 01:20:22', '2555-09-08 02:36:58', '2176-01-07 02:50:08', '2282-06-16 00:21:35', '2595-05-23 01:32:01', '3237-05-26 01:19:24', '3237-05-26 01:52:49', '2780-03-19 01:32:32', '2780-03-19 02:33:01']:array[unknown])))}) + FILTER(condition=ISIN(transaction_date, ['1010-12-08 07:23:35', '2328-01-19 07:33:25', '1577-03-20 07:41:29', '1345-03-06 07:41:47', '0937-05-21 07:27:48', '2176-01-07 07:07:03']:array[unknown]) | ISIN(transaction_date, ['1752-07-20 01:18:18', '1880-04-06 00:47:47', '2956-09-24 00:03:03', '1868-06-13 01:22:22', '0780-03-23 01:14:14', '1598-04-24 01:11:11', '0763-04-15 00:16:16', '2780-03-19 01:32:32']:array[unknown]) | ISIN(transaction_date, ['2052-11-18 08:24:33', '2052-11-18 08:32:00', '1577-03-20 08:03:51', '1577-03-20 08:32:17', '2550-01-17 08:56:44', '1551-03-04 08:36:08']:array[unknown]) | ISIN(transaction_date, ['2178-03-20 07:19:29', '2825-09-23 07:37:08']:array[unknown]) | ISIN(transaction_date, ['2268-07-06 01:50:11', '3056-08-07 01:18:26', '3120-07-22 02:43:20', '1010-12-08 01:47:15', '1440-10-15 02:26:30', '3054-12-02 00:51:55', '3031-02-17 00:54:21', '1539-02-23 00:49:34', '2418-09-09 01:12:48', '2418-09-09 02:09:31', '2551-01-12 00:34:57', '3141-01-25 02:24:01', '2328-01-19 01:20:40', '1577-03-20 00:27:19', '1608-08-20 00:10:55', '1608-08-20 01:12:55', 
'1608-08-20 02:14:47', '2825-09-23 02:31:19', '1286-12-21 00:21:24', '1286-12-21 01:25:46', '3300-07-12 00:15:35', '2059-07-23 01:56:15', '2955-06-27 00:48:34', '2955-06-27 01:24:43', '0937-05-21 00:40:43', '0930-11-28 02:44:19', '1605-10-12 00:58:57', '0781-08-29 02:28:10', '2374-09-21 00:21:42', '2374-09-21 02:10:55', '3022-05-13 01:56:21', '3088-03-30 01:09:15', '3088-03-30 02:38:56', '1757-01-16 00:20:29', '3287-10-20 01:17:31', '2555-09-08 00:40:20', '2555-09-08 01:20:22', '2555-09-08 02:36:58', '2176-01-07 02:50:08', '2282-06-16 00:21:35', '2595-05-23 01:32:01', '3237-05-26 01:19:24', '3237-05-26 01:52:49', '2780-03-19 01:32:32', '2780-03-19 02:33:01']:array[unknown]) | ISIN(transaction_date, ['3120-07-22 08:30:44', '1890-02-18 08:21:13', '1890-02-18 08:46:51']:array[unknown]), columns={'transaction_date': transaction_date}) + SCAN(table=bodo.retail.transactions, columns={'transaction_date': transaction_date}) diff --git a/tests/test_plan_refsols/semi_aggregate.txt b/tests/test_plan_refsols/semi_aggregate.txt index 6c99fbc08..6c4ccc09c 100644 --- a/tests/test_plan_refsols/semi_aggregate.txt +++ b/tests/test_plan_refsols/semi_aggregate.txt @@ -1,7 +1,7 @@ ROOT(columns=[('name', s_name), ('num_10parts', n_rows), ('avg_price_of_10parts', sum_p_retailprice / sum_expr), ('sum_price_of_10parts', DEFAULT_TO(sum_p_retailprice, 0:numeric))], orderings=[]) JOIN(condition=t0.s_suppkey == t1.ps_suppkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=SINGULAR_ACCESS, columns={'n_rows': t1.n_rows, 's_name': t0.s_name, 'sum_expr': t1.sum_expr, 'sum_p_retailprice': t1.sum_p_retailprice}) SCAN(table=tpch.SUPPLIER, columns={'s_name': s_name, 's_suppkey': s_suppkey}) - AGGREGATE(keys={'ps_suppkey': ps_suppkey}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM(IFF(PRESENT(p_retailprice), 1:numeric, 0:numeric)), 'sum_p_retailprice': SUM(p_retailprice)}) + AGGREGATE(keys={'ps_suppkey': ps_suppkey}, aggregations={'n_rows': COUNT(), 'sum_expr': SUM(PRESENT(p_retailprice)), 'sum_p_retailprice': SUM(p_retailprice)}) JOIN(condition=t0.ps_partkey == t1.p_partkey, type=INNER, cardinality=SINGULAR_FILTER, reverse_cardinality=PLURAL_ACCESS, columns={'p_retailprice': t1.p_retailprice, 'ps_suppkey': t0.ps_suppkey}) SCAN(table=tpch.PARTSUPP, columns={'ps_partkey': ps_partkey, 'ps_suppkey': ps_suppkey}) FILTER(condition=p_size == 10:numeric, columns={'p_partkey': p_partkey, 'p_retailprice': p_retailprice}) diff --git a/tests/test_pydough_functions/defog_test_functions.py b/tests/test_pydough_functions/defog_test_functions.py index 8a2770086..a737977b2 100644 --- a/tests/test_pydough_functions/defog_test_functions.py +++ b/tests/test_pydough_functions/defog_test_functions.py @@ -2967,8 +2967,8 @@ def impl_defog_restaurants_gen14(): non-vegan food in San Francisco? Match food_type case insensitively """ sf_restaurants = restaurants.WHERE(LOWER(city_name) == "san francisco") - n_vegan = SUM(LOWER(sf_restaurants.food_type) == "vegan") - n_non_vegan = SUM(LOWER(sf_restaurants.food_type) != "vegan") + n_vegan = COUNT(sf_restaurants.WHERE(LOWER(food_type) == "vegan")) + n_non_vegan = COUNT(sf_restaurants.WHERE(LOWER(food_type) != "vegan")) return Restaurants.CALCULATE( ratio=(n_vegan / KEEP_IF(n_non_vegan, n_non_vegan != 0)) ) @@ -2983,7 +2983,7 @@ def impl_defog_restaurants_gen15(): Los Angeles? 
""" la_restaurants = restaurants.WHERE(LOWER(city_name) == "los angeles") - n_la_italian = SUM(LOWER(la_restaurants.food_type) == "italian") + n_la_italian = COUNT(la_restaurants.WHERE(LOWER(food_type) == "italian")) n_la = COUNT(la_restaurants) return Restaurants.CALCULATE(ratio=(n_la_italian / KEEP_IF(n_la, n_la != 0))) diff --git a/tests/test_sql_refsols/correl_14_sqlite.sql b/tests/test_sql_refsols/correl_14_sqlite.sql index cc4c44913..2340e279e 100644 --- a/tests/test_sql_refsols/correl_14_sqlite.sql +++ b/tests/test_sql_refsols/correl_14_sqlite.sql @@ -1,7 +1,7 @@ WITH _s4 AS ( SELECT partsupp.ps_suppkey, - SUM(IIF(NOT part.p_retailprice IS NULL, 1, 0)) AS sum_expr, + SUM(NOT part.p_retailprice IS NULL) AS sum_expr, SUM(part.p_retailprice) AS sum_p_retailprice FROM tpch.supplier AS supplier JOIN tpch.partsupp AS partsupp diff --git a/tests/test_sql_refsols/correl_15_sqlite.sql b/tests/test_sql_refsols/correl_15_sqlite.sql index e93da2ce7..8fbf54a41 100644 --- a/tests/test_sql_refsols/correl_15_sqlite.sql +++ b/tests/test_sql_refsols/correl_15_sqlite.sql @@ -6,7 +6,7 @@ WITH _s0 AS ( SELECT partsupp.ps_suppkey, MAX(_s0.avg_p_retailprice) AS anything_avg_p_retailprice, - SUM(IIF(NOT part.p_retailprice IS NULL, 1, 0)) AS sum_expr, + SUM(NOT part.p_retailprice IS NULL) AS sum_expr, SUM(part.p_retailprice) AS sum_p_retailprice FROM _s0 AS _s0 JOIN tpch.supplier AS supplier diff --git a/tests/test_sql_refsols/correl_31_sqlite.sql b/tests/test_sql_refsols/correl_31_sqlite.sql index 8376c2ada..69dbc030b 100644 --- a/tests/test_sql_refsols/correl_31_sqlite.sql +++ b/tests/test_sql_refsols/correl_31_sqlite.sql @@ -22,7 +22,7 @@ WITH _t1 AS ( 1 - lineitem.l_discount ) ELSE NULL - END AS expr_2 + END AS expr_7 FROM tpch.nation AS nation JOIN tpch.region AS region ON nation.n_regionkey = region.r_regionkey AND region.r_name = 'EUROPE' @@ -46,7 +46,7 @@ SELECT AVG(l_extendedprice * ( 1 - l_discount )) AS mean_rev, - AVG(expr_2) AS median_rev + AVG(expr_7) AS median_rev FROM _t1 GROUP BY n_nationkey diff --git a/tests/test_sql_refsols/correl_35_sqlite.sql b/tests/test_sql_refsols/correl_35_sqlite.sql index 16cab7b6a..fb6b735d7 100644 --- a/tests/test_sql_refsols/correl_35_sqlite.sql +++ b/tests/test_sql_refsols/correl_35_sqlite.sql @@ -4,12 +4,11 @@ WITH _s1 AS ( p_type FROM tpch.part ), _s10 AS ( - SELECT + SELECT DISTINCT customer.c_custkey, customer.c_nationkey, lineitem.l_partkey, - orders.o_orderpriority, - COUNT(*) AS n_rows + orders.o_orderpriority FROM tpch.customer AS customer JOIN tpch.orders AS orders ON CAST(STRFTIME('%Y', orders.o_orderdate) AS INTEGER) = 1997 @@ -18,26 +17,15 @@ WITH _s1 AS ( ON CAST(STRFTIME('%Y', lineitem.l_shipdate) AS INTEGER) = 1997 AND CAST(STRFTIME('%m', lineitem.l_shipdate) AS INTEGER) IN (1, 2, 3) AND lineitem.l_orderkey = orders.o_orderkey - GROUP BY - 1, - 2, - 3, - 4 -), _t3 AS ( - SELECT +), _s13 AS ( + SELECT DISTINCT _s10.c_custkey, _s10.c_nationkey, _s10.o_orderpriority, - _s11.p_type, - SUM(_s10.n_rows) AS sum_n_rows + _s11.p_type FROM _s10 AS _s10 JOIN _s1 AS _s11 ON _s10.l_partkey = _s11.p_partkey - GROUP BY - 1, - 2, - 3, - 4 ) SELECT COUNT(*) AS n @@ -49,11 +37,10 @@ JOIN tpch.supplier AS supplier JOIN tpch.orders AS orders ON CAST(STRFTIME('%Y', orders.o_orderdate) AS INTEGER) = 1998 AND lineitem.l_orderkey = orders.o_orderkey -JOIN _t3 AS _t3 - ON _s1.p_type = _t3.p_type - AND _t3.c_custkey = orders.o_custkey - AND _t3.c_nationkey = supplier.s_nationkey - AND _t3.o_orderpriority = orders.o_orderpriority - AND _t3.sum_n_rows <> 0 +JOIN _s13 AS 
_s13 + ON _s1.p_type = _s13.p_type + AND _s13.c_custkey = orders.o_custkey + AND _s13.c_nationkey = supplier.s_nationkey + AND _s13.o_orderpriority = orders.o_orderpriority WHERE CAST(STRFTIME('%Y', lineitem.l_shipdate) AS INTEGER) = 1998 diff --git a/tests/test_sql_refsols/defog_restaurants_gen11_ansi.sql b/tests/test_sql_refsols/defog_restaurants_gen11_ansi.sql index 832005452..cde69334c 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen11_ansi.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen11_ansi.sql @@ -1,15 +1,3 @@ -WITH _s0 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant - WHERE - rating > 4.5 -), _s1 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant -) SELECT - _s0.n_rows / _s1.n_rows AS ratio -FROM _s0 AS _s0 -CROSS JOIN _s1 AS _s1 + SUM(rating > 4.5) / COUNT(*) AS ratio +FROM main.restaurant diff --git a/tests/test_sql_refsols/defog_restaurants_gen11_mysql.sql b/tests/test_sql_refsols/defog_restaurants_gen11_mysql.sql index 832005452..cde69334c 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen11_mysql.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen11_mysql.sql @@ -1,15 +1,3 @@ -WITH _s0 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant - WHERE - rating > 4.5 -), _s1 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant -) SELECT - _s0.n_rows / _s1.n_rows AS ratio -FROM _s0 AS _s0 -CROSS JOIN _s1 AS _s1 + SUM(rating > 4.5) / COUNT(*) AS ratio +FROM main.restaurant diff --git a/tests/test_sql_refsols/defog_restaurants_gen11_postgres.sql b/tests/test_sql_refsols/defog_restaurants_gen11_postgres.sql index bf0e704bf..048ba9823 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen11_postgres.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen11_postgres.sql @@ -1,15 +1,3 @@ -WITH _s0 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant - WHERE - rating > 4.5 -), _s1 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant -) SELECT - CAST(_s0.n_rows AS DOUBLE PRECISION) / _s1.n_rows AS ratio -FROM _s0 AS _s0 -CROSS JOIN _s1 AS _s1 + CAST(SUM(CASE WHEN rating > 4.5 THEN 1 ELSE 0 END) AS DOUBLE PRECISION) / COUNT(*) AS ratio +FROM main.restaurant diff --git a/tests/test_sql_refsols/defog_restaurants_gen11_snowflake.sql b/tests/test_sql_refsols/defog_restaurants_gen11_snowflake.sql index 832005452..f3e20bfa6 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen11_snowflake.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen11_snowflake.sql @@ -1,15 +1,3 @@ -WITH _s0 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant - WHERE - rating > 4.5 -), _s1 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant -) SELECT - _s0.n_rows / _s1.n_rows AS ratio -FROM _s0 AS _s0 -CROSS JOIN _s1 AS _s1 + COUNT_IF(rating > 4.5) / COUNT(*) AS ratio +FROM main.restaurant diff --git a/tests/test_sql_refsols/defog_restaurants_gen11_sqlite.sql b/tests/test_sql_refsols/defog_restaurants_gen11_sqlite.sql index 4b5cce5d6..121904d46 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen11_sqlite.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen11_sqlite.sql @@ -1,15 +1,3 @@ -WITH _s0 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant - WHERE - rating > 4.5 -), _s1 AS ( - SELECT - COUNT(*) AS n_rows - FROM main.restaurant -) SELECT - CAST(_s0.n_rows AS REAL) / _s1.n_rows AS ratio -FROM _s0 AS _s0 -CROSS JOIN _s1 AS _s1 + CAST(SUM(rating > 4.5) AS REAL) / COUNT(*) AS ratio +FROM main.restaurant diff --git a/tests/test_sql_refsols/defog_restaurants_gen14_ansi.sql 
b/tests/test_sql_refsols/defog_restaurants_gen14_ansi.sql index d9975b8d2..5e4386913 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen14_ansi.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen14_ansi.sql @@ -1,5 +1,8 @@ SELECT - COALESCE(SUM(LOWER(food_type) = 'vegan'), 0) / NULLIF(SUM(LOWER(food_type) <> 'vegan'), 0) AS ratio + SUM(LOWER(food_type) = 'vegan') / NULLIF(SUM(LOWER(food_type) <> 'vegan'), 0) AS ratio FROM main.restaurant WHERE LOWER(city_name) = 'san francisco' + AND ( + LOWER(food_type) <> 'vegan' OR LOWER(food_type) = 'vegan' + ) diff --git a/tests/test_sql_refsols/defog_restaurants_gen14_mysql.sql b/tests/test_sql_refsols/defog_restaurants_gen14_mysql.sql index d9975b8d2..5e4386913 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen14_mysql.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen14_mysql.sql @@ -1,5 +1,8 @@ SELECT - COALESCE(SUM(LOWER(food_type) = 'vegan'), 0) / NULLIF(SUM(LOWER(food_type) <> 'vegan'), 0) AS ratio + SUM(LOWER(food_type) = 'vegan') / NULLIF(SUM(LOWER(food_type) <> 'vegan'), 0) AS ratio FROM main.restaurant WHERE LOWER(city_name) = 'san francisco' + AND ( + LOWER(food_type) <> 'vegan' OR LOWER(food_type) = 'vegan' + ) diff --git a/tests/test_sql_refsols/defog_restaurants_gen14_postgres.sql b/tests/test_sql_refsols/defog_restaurants_gen14_postgres.sql index 74d49c9a3..b8b8a7938 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen14_postgres.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen14_postgres.sql @@ -1,5 +1,8 @@ SELECT - CAST(COALESCE(SUM(CASE WHEN LOWER(food_type) = 'vegan' THEN 1 ELSE 0 END), 0) AS DOUBLE PRECISION) / NULLIF(SUM(CASE WHEN LOWER(food_type) <> 'vegan' THEN 1 ELSE 0 END), 0) AS ratio + CAST(SUM(CASE WHEN LOWER(food_type) = 'vegan' THEN 1 ELSE 0 END) AS DOUBLE PRECISION) / NULLIF(SUM(CASE WHEN LOWER(food_type) <> 'vegan' THEN 1 ELSE 0 END), 0) AS ratio FROM main.restaurant WHERE LOWER(city_name) = 'san francisco' + AND ( + LOWER(food_type) <> 'vegan' OR LOWER(food_type) = 'vegan' + ) diff --git a/tests/test_sql_refsols/defog_restaurants_gen14_snowflake.sql b/tests/test_sql_refsols/defog_restaurants_gen14_snowflake.sql index 9f437c5c6..11850beb4 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen14_snowflake.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen14_snowflake.sql @@ -3,3 +3,6 @@ SELECT FROM main.restaurant WHERE LOWER(city_name) = 'san francisco' + AND ( + LOWER(food_type) <> 'vegan' OR LOWER(food_type) = 'vegan' + ) diff --git a/tests/test_sql_refsols/defog_restaurants_gen14_sqlite.sql b/tests/test_sql_refsols/defog_restaurants_gen14_sqlite.sql index 06f824271..543bb54af 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen14_sqlite.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen14_sqlite.sql @@ -1,5 +1,5 @@ SELECT - CAST(COALESCE(SUM(LOWER(food_type) = 'vegan'), 0) AS REAL) / NULLIF(SUM(LOWER(food_type) <> 'vegan'), 0) AS ratio + CAST(SUM(LOWER(food_type) = 'vegan') AS REAL) / NULLIF(SUM(LOWER(food_type) <> 'vegan'), 0) AS ratio FROM main.restaurant WHERE - LOWER(city_name) = 'san francisco' + LOWER(city_name) = 'san francisco' AND LOWER(food_type) <> 'vegan' diff --git a/tests/test_sql_refsols/defog_restaurants_gen15_ansi.sql b/tests/test_sql_refsols/defog_restaurants_gen15_ansi.sql index c15bdb8d1..ee1edcfa8 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen15_ansi.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen15_ansi.sql @@ -1,5 +1,5 @@ SELECT - COALESCE(SUM(LOWER(food_type) = 'italian'), 0) / NULLIF(COUNT(*), 0) AS ratio + SUM(LOWER(food_type) 
= 'italian') / NULLIF(COUNT(*), 0) AS ratio FROM main.restaurant WHERE LOWER(city_name) = 'los angeles' diff --git a/tests/test_sql_refsols/defog_restaurants_gen15_mysql.sql b/tests/test_sql_refsols/defog_restaurants_gen15_mysql.sql index c15bdb8d1..ee1edcfa8 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen15_mysql.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen15_mysql.sql @@ -1,5 +1,5 @@ SELECT - COALESCE(SUM(LOWER(food_type) = 'italian'), 0) / NULLIF(COUNT(*), 0) AS ratio + SUM(LOWER(food_type) = 'italian') / NULLIF(COUNT(*), 0) AS ratio FROM main.restaurant WHERE LOWER(city_name) = 'los angeles' diff --git a/tests/test_sql_refsols/defog_restaurants_gen15_postgres.sql b/tests/test_sql_refsols/defog_restaurants_gen15_postgres.sql index 5016a56a6..aaadabc97 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen15_postgres.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen15_postgres.sql @@ -1,5 +1,5 @@ SELECT - CAST(COALESCE(SUM(CASE WHEN LOWER(food_type) = 'italian' THEN 1 ELSE 0 END), 0) AS DOUBLE PRECISION) / NULLIF(COUNT(*), 0) AS ratio + CAST(SUM(CASE WHEN LOWER(food_type) = 'italian' THEN 1 ELSE 0 END) AS DOUBLE PRECISION) / NULLIF(COUNT(*), 0) AS ratio FROM main.restaurant WHERE LOWER(city_name) = 'los angeles' diff --git a/tests/test_sql_refsols/defog_restaurants_gen15_sqlite.sql b/tests/test_sql_refsols/defog_restaurants_gen15_sqlite.sql index c5dcfc7fb..ca869e50a 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen15_sqlite.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen15_sqlite.sql @@ -1,5 +1,5 @@ SELECT - CAST(COALESCE(SUM(LOWER(food_type) = 'italian'), 0) AS REAL) / NULLIF(COUNT(*), 0) AS ratio + CAST(SUM(LOWER(food_type) = 'italian') AS REAL) / NULLIF(COUNT(*), 0) AS ratio FROM main.restaurant WHERE LOWER(city_name) = 'los angeles' diff --git a/tests/test_sql_refsols/defog_restaurants_gen8_ansi.sql b/tests/test_sql_refsols/defog_restaurants_gen8_ansi.sql index cf572383b..3f8169add 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen8_ansi.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen8_ansi.sql @@ -12,7 +12,7 @@ WITH _s1 AS ( ), _s7 AS ( SELECT _s3.region, - SUM(CASE WHEN NOT restaurant.rating IS NULL THEN 1 ELSE 0 END) AS sum_expr, + SUM(NOT restaurant.rating IS NULL) AS sum_expr, SUM(restaurant.rating) AS sum_rating FROM main.location AS location LEFT JOIN _s1 AS _s3 diff --git a/tests/test_sql_refsols/defog_restaurants_gen8_mysql.sql b/tests/test_sql_refsols/defog_restaurants_gen8_mysql.sql index d3c25d5a9..48afc6331 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen8_mysql.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen8_mysql.sql @@ -12,7 +12,7 @@ WITH _s1 AS ( ), _s7 AS ( SELECT _s3.region, - SUM(CASE WHEN NOT restaurant.rating IS NULL THEN 1 ELSE 0 END) AS sum_expr, + SUM(NOT restaurant.rating IS NULL) AS sum_expr, SUM(restaurant.rating) AS sum_rating FROM main.location AS location LEFT JOIN _s1 AS _s3 diff --git a/tests/test_sql_refsols/defog_restaurants_gen8_snowflake.sql b/tests/test_sql_refsols/defog_restaurants_gen8_snowflake.sql index de01bfd58..9041e014f 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen8_snowflake.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen8_snowflake.sql @@ -12,7 +12,7 @@ WITH _s1 AS ( ), _s7 AS ( SELECT _s3.region, - SUM(IFF(NOT restaurant.rating IS NULL, 1, 0)) AS sum_expr, + COUNT_IF(NOT restaurant.rating IS NULL) AS sum_expr, SUM(restaurant.rating) AS sum_rating FROM main.location AS location LEFT JOIN _s1 AS _s3 diff --git 
a/tests/test_sql_refsols/defog_restaurants_gen8_sqlite.sql b/tests/test_sql_refsols/defog_restaurants_gen8_sqlite.sql index 2ea3ca06a..f10e4066b 100644 --- a/tests/test_sql_refsols/defog_restaurants_gen8_sqlite.sql +++ b/tests/test_sql_refsols/defog_restaurants_gen8_sqlite.sql @@ -12,7 +12,7 @@ WITH _s1 AS ( ), _s7 AS ( SELECT _s3.region, - SUM(IIF(NOT restaurant.rating IS NULL, 1, 0)) AS sum_expr, + SUM(NOT restaurant.rating IS NULL) AS sum_expr, SUM(restaurant.rating) AS sum_rating FROM main.location AS location LEFT JOIN _s1 AS _s3 diff --git a/tests/test_sql_refsols/donor_3276_ansi.sql b/tests/test_sql_refsols/donor_3276_ansi.sql new file mode 100644 index 000000000..c180985e8 --- /dev/null +++ b/tests/test_sql_refsols/donor_3276_ansi.sql @@ -0,0 +1,7 @@ +SELECT + ( + 100.0 * SUM(LOWER(school_metro) = 'suburban') + ) / COUNT(*) AS percentage_suburban +FROM main.projects +WHERE + LOWER(school_city) = 'santa barbara' diff --git a/tests/test_sql_refsols/donor_3276_mysql.sql b/tests/test_sql_refsols/donor_3276_mysql.sql new file mode 100644 index 000000000..c180985e8 --- /dev/null +++ b/tests/test_sql_refsols/donor_3276_mysql.sql @@ -0,0 +1,7 @@ +SELECT + ( + 100.0 * SUM(LOWER(school_metro) = 'suburban') + ) / COUNT(*) AS percentage_suburban +FROM main.projects +WHERE + LOWER(school_city) = 'santa barbara' diff --git a/tests/test_sql_refsols/donor_3276_postgres.sql b/tests/test_sql_refsols/donor_3276_postgres.sql new file mode 100644 index 000000000..65e0b08d0 --- /dev/null +++ b/tests/test_sql_refsols/donor_3276_postgres.sql @@ -0,0 +1,7 @@ +SELECT + ( + 100.0 * SUM(CASE WHEN LOWER(school_metro) = 'suburban' THEN 1 ELSE 0 END) + ) / COUNT(*) AS percentage_suburban +FROM main.projects +WHERE + LOWER(school_city) = 'santa barbara' diff --git a/tests/test_sql_refsols/donor_3276_snowflake.sql b/tests/test_sql_refsols/donor_3276_snowflake.sql new file mode 100644 index 000000000..bd9c447e5 --- /dev/null +++ b/tests/test_sql_refsols/donor_3276_snowflake.sql @@ -0,0 +1,7 @@ +SELECT + ( + 100.0 * COUNT_IF(LOWER(school_metro) = 'suburban') + ) / COUNT(*) AS percentage_suburban +FROM main.projects +WHERE + LOWER(school_city) = 'santa barbara' diff --git a/tests/test_sql_refsols/donor_3276_sqlite.sql b/tests/test_sql_refsols/donor_3276_sqlite.sql new file mode 100644 index 000000000..2f303e9d6 --- /dev/null +++ b/tests/test_sql_refsols/donor_3276_sqlite.sql @@ -0,0 +1,7 @@ +SELECT + CAST(( + 100.0 * SUM(LOWER(school_metro) = 'suburban') + ) AS REAL) / COUNT(*) AS percentage_suburban +FROM main.projects +WHERE + LOWER(school_city) = 'santa barbara' diff --git a/tests/test_sql_refsols/movielens_2274_ansi.sql b/tests/test_sql_refsols/movielens_2274_ansi.sql new file mode 100644 index 000000000..24541f689 --- /dev/null +++ b/tests/test_sql_refsols/movielens_2274_ansi.sql @@ -0,0 +1,11 @@ +SELECT + CASE + WHEN COUNT(*) > 0 + THEN ( + 100.0 * SUM(LOWER(users.u_gender) = 'f') + ) / COUNT(*) + ELSE 0.0 + END AS percentage_of_female_users +FROM main.users AS users +JOIN main.u2base AS u2base + ON u2base.rating = 2 AND u2base.userid = users.userid diff --git a/tests/test_sql_refsols/movielens_2274_mysql.sql b/tests/test_sql_refsols/movielens_2274_mysql.sql new file mode 100644 index 000000000..4dcbd4e83 --- /dev/null +++ b/tests/test_sql_refsols/movielens_2274_mysql.sql @@ -0,0 +1,22 @@ +WITH _u_0 AS ( + SELECT + userid AS _u_1 + FROM main.u2base + WHERE + rating = 2 + GROUP BY + 1 +) +SELECT + CASE + WHEN COUNT(*) > 0 + THEN ( + 100.0 * SUM(LOWER(users.u_gender) = 'f') + ) / COUNT(*) + ELSE 0.0 + END AS 
percentage_of_female_users +FROM main.users AS users +LEFT JOIN _u_0 AS _u_0 + ON _u_0._u_1 = users.userid +WHERE + NOT _u_0._u_1 IS NULL diff --git a/tests/test_sql_refsols/movielens_2274_postgres.sql b/tests/test_sql_refsols/movielens_2274_postgres.sql new file mode 100644 index 000000000..7d41e8e1d --- /dev/null +++ b/tests/test_sql_refsols/movielens_2274_postgres.sql @@ -0,0 +1,22 @@ +WITH _u_0 AS ( + SELECT + userid AS _u_1 + FROM main.u2base + WHERE + rating = 2 + GROUP BY + 1 +) +SELECT + CASE + WHEN COUNT(*) > 0 + THEN ( + 100.0 * SUM(CASE WHEN LOWER(users.u_gender) = 'f' THEN 1 ELSE 0 END) + ) / COUNT(*) + ELSE 0.0 + END AS percentage_of_female_users +FROM main.users AS users +LEFT JOIN _u_0 AS _u_0 + ON _u_0._u_1 = users.userid +WHERE + NOT _u_0._u_1 IS NULL diff --git a/tests/test_sql_refsols/movielens_2274_snowflake.sql b/tests/test_sql_refsols/movielens_2274_snowflake.sql new file mode 100644 index 000000000..a76f7e516 --- /dev/null +++ b/tests/test_sql_refsols/movielens_2274_snowflake.sql @@ -0,0 +1,18 @@ +WITH _u_0 AS ( + SELECT + userid AS _u_1 + FROM main.u2base + WHERE + rating = 2 + GROUP BY + 1 +) +SELECT + IFF(COUNT(*) > 0, ( + 100.0 * COUNT_IF(LOWER(users.u_gender) = 'f') + ) / COUNT(*), 0.0) AS percentage_of_female_users +FROM main.users AS users +LEFT JOIN _u_0 AS _u_0 + ON _u_0._u_1 = users.userid +WHERE + NOT _u_0._u_1 IS NULL diff --git a/tests/test_sql_refsols/movielens_2274_sqlite.sql b/tests/test_sql_refsols/movielens_2274_sqlite.sql new file mode 100644 index 000000000..01d32927d --- /dev/null +++ b/tests/test_sql_refsols/movielens_2274_sqlite.sql @@ -0,0 +1,22 @@ +WITH _u_0 AS ( + SELECT + userid AS _u_1 + FROM main.u2base + WHERE + rating = 2 + GROUP BY + 1 +) +SELECT + IIF( + COUNT(*) > 0, + CAST(( + 100.0 * SUM(LOWER(users.u_gender) = 'f') + ) AS REAL) / COUNT(*), + 0.0 + ) AS percentage_of_female_users +FROM main.users AS users +LEFT JOIN _u_0 AS _u_0 + ON _u_0._u_1 = users.userid +WHERE + NOT _u_0._u_1 IS NULL diff --git a/tests/test_sql_refsols/retail_transactions_ts_raw_snowflake.sql b/tests/test_sql_refsols/retail_transactions_ts_raw_snowflake.sql index f4a984e71..4a8507776 100644 --- a/tests/test_sql_refsols/retail_transactions_ts_raw_snowflake.sql +++ b/tests/test_sql_refsols/retail_transactions_ts_raw_snowflake.sql @@ -1,50 +1,50 @@ -WITH _s0 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE +SELECT + COUNT_IF( DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 1 AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 7 -), _s1 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE + ) AS n1, + COUNT_IF( DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 2 AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 7 -), _s3 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE + ) AS n2, + COUNT_IF( DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 1 AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 8 -), _s5 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE + ) AS n3, + COUNT_IF( DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 2 AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 8 -), _s7 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE + ) AS n4, + COUNT_IF( ( DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) < 4 OR MINUTE(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = SECOND(CAST(PTY_UNPROTECT_TS(transaction_date) AS 
TIMESTAMP)) ) AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) < 3 -) -SELECT - _s0.n_rows AS n1, - _s1.n_rows AS n2, - _s3.n_rows AS n3, - _s5.n_rows AS n4, - _s7.n_rows AS n5 -FROM _s0 AS _s0 -CROSS JOIN _s1 AS _s1 -CROSS JOIN _s3 AS _s3 -CROSS JOIN _s5 AS _s5 -CROSS JOIN _s7 AS _s7 + ) AS n5 +FROM bodo.retail.transactions +WHERE + ( + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) < 4 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) < 3 + ) + OR ( + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 1 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 7 + ) + OR ( + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 1 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 8 + ) + OR ( + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 2 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 7 + ) + OR ( + DAY(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 2 + AND HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = 8 + ) + OR ( + HOUR(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) < 3 + AND MINUTE(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) = SECOND(CAST(PTY_UNPROTECT_TS(transaction_date) AS TIMESTAMP)) + ) diff --git a/tests/test_sql_refsols/retail_transactions_ts_rewrite_snowflake.sql b/tests/test_sql_refsols/retail_transactions_ts_rewrite_snowflake.sql index b8cd69ee9..6eeb2a4cf 100644 --- a/tests/test_sql_refsols/retail_transactions_ts_rewrite_snowflake.sql +++ b/tests/test_sql_refsols/retail_transactions_ts_rewrite_snowflake.sql @@ -1,43 +1,23 @@ -WITH _s0 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE - transaction_date IN ('2178-03-20 07:19:29', '2825-09-23 07:37:08') -), _s1 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE +SELECT + COUNT_IF(transaction_date IN ('2178-03-20 07:19:29', '2825-09-23 07:37:08')) AS n1, + COUNT_IF( transaction_date IN ('1010-12-08 07:23:35', '2328-01-19 07:33:25', '1577-03-20 07:41:29', '1345-03-06 07:41:47', '0937-05-21 07:27:48', '2176-01-07 07:07:03') -), _s3 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE + ) AS n2, + COUNT_IF( transaction_date IN ('3120-07-22 08:30:44', '1890-02-18 08:21:13', '1890-02-18 08:46:51') -), _s5 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE + ) AS n3, + COUNT_IF( transaction_date IN ('2052-11-18 08:24:33', '2052-11-18 08:32:00', '1577-03-20 08:03:51', '1577-03-20 08:32:17', '2550-01-17 08:56:44', '1551-03-04 08:36:08') -), _s7 AS ( - SELECT - COUNT(*) AS n_rows - FROM bodo.retail.transactions - WHERE + ) AS n4, + COUNT_IF( transaction_date IN ('1752-07-20 01:18:18', '1880-04-06 00:47:47', '2956-09-24 00:03:03', '1868-06-13 01:22:22', '0780-03-23 01:14:14', '1598-04-24 01:11:11', '0763-04-15 00:16:16', '2780-03-19 01:32:32') OR transaction_date IN ('2268-07-06 01:50:11', '3056-08-07 01:18:26', '3120-07-22 02:43:20', '1010-12-08 01:47:15', '1440-10-15 02:26:30', '3054-12-02 00:51:55', '3031-02-17 00:54:21', '1539-02-23 00:49:34', '2418-09-09 01:12:48', '2418-09-09 02:09:31', '2551-01-12 00:34:57', '3141-01-25 02:24:01', '2328-01-19 01:20:40', '1577-03-20 00:27:19', '1608-08-20 00:10:55', '1608-08-20 01:12:55', '1608-08-20 02:14:47', '2825-09-23 02:31:19', '1286-12-21 00:21:24', '1286-12-21 01:25:46', '3300-07-12 00:15:35', '2059-07-23 01:56:15', '2955-06-27 00:48:34', '2955-06-27 01:24:43', '0937-05-21 00:40:43', '0930-11-28 02:44:19', '1605-10-12 00:58:57', 
'0781-08-29 02:28:10', '2374-09-21 00:21:42', '2374-09-21 02:10:55', '3022-05-13 01:56:21', '3088-03-30 01:09:15', '3088-03-30 02:38:56', '1757-01-16 00:20:29', '3287-10-20 01:17:31', '2555-09-08 00:40:20', '2555-09-08 01:20:22', '2555-09-08 02:36:58', '2176-01-07 02:50:08', '2282-06-16 00:21:35', '2595-05-23 01:32:01', '3237-05-26 01:19:24', '3237-05-26 01:52:49', '2780-03-19 01:32:32', '2780-03-19 02:33:01') -) -SELECT - _s0.n_rows AS n1, - _s1.n_rows AS n2, - _s3.n_rows AS n3, - _s5.n_rows AS n4, - _s7.n_rows AS n5 -FROM _s0 AS _s0 -CROSS JOIN _s1 AS _s1 -CROSS JOIN _s3 AS _s3 -CROSS JOIN _s5 AS _s5 -CROSS JOIN _s7 AS _s7 + ) AS n5 +FROM bodo.retail.transactions +WHERE + transaction_date IN ('1010-12-08 07:23:35', '2328-01-19 07:33:25', '1577-03-20 07:41:29', '1345-03-06 07:41:47', '0937-05-21 07:27:48', '2176-01-07 07:07:03') + OR transaction_date IN ('1752-07-20 01:18:18', '1880-04-06 00:47:47', '2956-09-24 00:03:03', '1868-06-13 01:22:22', '0780-03-23 01:14:14', '1598-04-24 01:11:11', '0763-04-15 00:16:16', '2780-03-19 01:32:32') + OR transaction_date IN ('2052-11-18 08:24:33', '2052-11-18 08:32:00', '1577-03-20 08:03:51', '1577-03-20 08:32:17', '2550-01-17 08:56:44', '1551-03-04 08:36:08') + OR transaction_date IN ('2178-03-20 07:19:29', '2825-09-23 07:37:08') + OR transaction_date IN ('2268-07-06 01:50:11', '3056-08-07 01:18:26', '3120-07-22 02:43:20', '1010-12-08 01:47:15', '1440-10-15 02:26:30', '3054-12-02 00:51:55', '3031-02-17 00:54:21', '1539-02-23 00:49:34', '2418-09-09 01:12:48', '2418-09-09 02:09:31', '2551-01-12 00:34:57', '3141-01-25 02:24:01', '2328-01-19 01:20:40', '1577-03-20 00:27:19', '1608-08-20 00:10:55', '1608-08-20 01:12:55', '1608-08-20 02:14:47', '2825-09-23 02:31:19', '1286-12-21 00:21:24', '1286-12-21 01:25:46', '3300-07-12 00:15:35', '2059-07-23 01:56:15', '2955-06-27 00:48:34', '2955-06-27 01:24:43', '0937-05-21 00:40:43', '0930-11-28 02:44:19', '1605-10-12 00:58:57', '0781-08-29 02:28:10', '2374-09-21 00:21:42', '2374-09-21 02:10:55', '3022-05-13 01:56:21', '3088-03-30 01:09:15', '3088-03-30 02:38:56', '1757-01-16 00:20:29', '3287-10-20 01:17:31', '2555-09-08 00:40:20', '2555-09-08 01:20:22', '2555-09-08 02:36:58', '2176-01-07 02:50:08', '2282-06-16 00:21:35', '2595-05-23 01:32:01', '3237-05-26 01:19:24', '3237-05-26 01:52:49', '2780-03-19 01:32:32', '2780-03-19 02:33:01') + OR transaction_date IN ('3120-07-22 08:30:44', '1890-02-18 08:21:13', '1890-02-18 08:46:51')