Skip to content

Bug: Queries fail when column names contain hyphens #43

@tcjordao

Description

@tcjordao

The function _random_query() in singling_out_evaluator.py generates queries using pandas.DataFrame.query(). However, if the column name contains a hyphen (e.g., "capital-gain"), df.query() raises a NameError like:

Query capital-gain == 5000 failed with name 'capital' is not defined.

Cause

pandas.query() interprets column names with hyphens as arithmetic expressions unless they are escaped with backticks (e.g., `capital-gain`).

Suggestion: modify "_random_query", "_query_expression", "_query_from_record" and "univariate_singling_out_queries".

  1. #############################

def _random_query(unique_values: Dict[str, List[Any]], cols: List[str]) -> str:
"""Generate a random query using given columns."""
query = []

for col in sorted(cols):
    values = unique_values[col]
    val = rng.choice(values)

    # Escapes the column name with backticks for safe use in df.query()
    col_expr = f"`{col}`"

    if pd.isna(val):
        expression = f"{_random_operator('boolean')}{col_expr}.isna()"
    elif is_bool_dtype(values):
        expression = f"{_random_operator('boolean')}{col_expr}"
    elif isinstance(values, pd.CategoricalDtype):
        expression = f"{col_expr} {_random_operator('categorical')} {val}"
    elif is_numeric_dtype(values):
        expression = f"{col_expr} {_random_operator('numerical')} {val}"
    elif isinstance(val, str):
        expression = f"{col_expr} {_random_operator('categorical')} '{_escape_quotes(val)}'"
    else:
        expression = f"{col_expr} {_random_operator('categorical')} '{val}'"

    query.append(expression)

return " & ".join(query)
  1. #############################

def _query_expression(col: str, val: Any, dtype: np.dtype) -> str:
"""Generate type-aware query expression."""
# Escapes the column name with backticks for safe use in df.query()
col_escaped = f"{col}"
query: str = ""

if pd.api.types.is_datetime64_any_dtype(dtype):
    query = f"{col_escaped} == '{val}'"
elif isinstance(val, str):
    query = f"{col_escaped} == '{_escape_quotes(val)}'"
else:
    query = f"{col_escaped} == {val}"

return query
  1. ###################################

def _query_from_record(record: pd.Series, dtypes: pd.Series, columns: List[str], medians: Optional[pd.Series]) -> str:
"""Construct a query from the attributes in a record."""
query = []

for col in sorted(columns):
    # Escapes the column name with backticks for safe use in df.query()
    col_escaped = f"`{col}`"
    item = "" # Initializes item

    if pd.isna(record[col]):
        item = f"{col_escaped}.isna()" # Now directly in the string to avoid the extra ==
    elif is_bool_dtype(dtypes[col]):
        item = f"{col_escaped} == {record[col]}"
    elif is_numeric_dtype(dtypes[col]):
        if medians is None:
            operator = rng.choice([">=", "<="])
        else:
            if record[col] > medians[col]:
                operator = ">="
            else:
                operator = "<="
        item = f"{col_escaped} {operator} {record[col]}"
    elif isinstance(dtypes[col], pd.CategoricalDtype) and is_numeric_dtype(dtypes[col].categories.dtype):
        item = f"{col_escaped} == {record[col]}"
    else:
        if isinstance(record[col], str):
            item = f"{col_escaped} == '{_escape_quotes(record[col])}'"
        else:
            item = f"{col_escaped} == '{record[col]}'" # Using single quotes for consistency with strings

    query.append(item) # No need to concatenate col with item anymore, item is already the complete expression

return " & ".join(query)
  1. ##############################

def univariate_singling_out_queries(df: pd.DataFrame, n_queries: int) -> List[str]:
"""Generate singling out queries from rare attributes.

Parameters
----------
df: pd.DataFrame
        Input dataframe from which queries will be generated.
n_queries: int
    Number of queries to generate.

Returns
-------
List[str]
        The singling out queries.

"""
queries = []

for col in sorted(df.columns):
    col_escaped = f"`{col}`" #Escapes the column name with backticks for safe use in df.query()

    if df[col].isna().sum() == 1:
        queries.append(f"{col_escaped}.isna()")

    if pd.api.types.is_numeric_dtype(df.dtypes[col]):
        values = df[col].dropna().sort_values()

        if len(values) > 0:
            queries.extend([f"{col_escaped} <= {values.iloc[0]}", f"{col_escaped} >= {values.iloc[-1]}"])

    counts = df[col].value_counts()
    rare_values = counts[counts == 1]

    if len(rare_values) > 0:
        # _query_expression already handles escaping, but it's good to make sure the passed col is the original one.
        queries.extend([_query_expression(col=col, val=val, dtype=df.dtypes[col]) for val in rare_values.index])

rng.shuffle(queries)

so_queries = UniqueSinglingOutQueries()

for query in queries:
    so_queries.check_and_append(query, df=df)

    if len(so_queries) == n_queries:
        break

return so_queries.queries

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions