-
Notifications
You must be signed in to change notification settings - Fork 26
Description
The function _random_query() in singling_out_evaluator.py generates queries using pandas.DataFrame.query(). However, if the column name contains a hyphen (e.g., "capital-gain"), df.query() raises a NameError like:
Query capital-gain == 5000 failed with name 'capital' is not defined.
Cause
pandas.query() interprets column names with hyphens as arithmetic expressions unless they are escaped with backticks (e.g., `capital-gain`).
Suggestion: modify "_random_query", "_query_expression", "_query_from_record" and "univariate_singling_out_queries".
- #############################
def _random_query(unique_values: Dict[str, List[Any]], cols: List[str]) -> str:
"""Generate a random query using given columns."""
query = []
for col in sorted(cols):
values = unique_values[col]
val = rng.choice(values)
# Escapes the column name with backticks for safe use in df.query()
col_expr = f"`{col}`"
if pd.isna(val):
expression = f"{_random_operator('boolean')}{col_expr}.isna()"
elif is_bool_dtype(values):
expression = f"{_random_operator('boolean')}{col_expr}"
elif isinstance(values, pd.CategoricalDtype):
expression = f"{col_expr} {_random_operator('categorical')} {val}"
elif is_numeric_dtype(values):
expression = f"{col_expr} {_random_operator('numerical')} {val}"
elif isinstance(val, str):
expression = f"{col_expr} {_random_operator('categorical')} '{_escape_quotes(val)}'"
else:
expression = f"{col_expr} {_random_operator('categorical')} '{val}'"
query.append(expression)
return " & ".join(query)
- #############################
def _query_expression(col: str, val: Any, dtype: np.dtype) -> str:
"""Generate type-aware query expression."""
# Escapes the column name with backticks for safe use in df.query()
col_escaped = f"{col}"
query: str = ""
if pd.api.types.is_datetime64_any_dtype(dtype):
query = f"{col_escaped} == '{val}'"
elif isinstance(val, str):
query = f"{col_escaped} == '{_escape_quotes(val)}'"
else:
query = f"{col_escaped} == {val}"
return query
- ###################################
def _query_from_record(record: pd.Series, dtypes: pd.Series, columns: List[str], medians: Optional[pd.Series]) -> str:
"""Construct a query from the attributes in a record."""
query = []
for col in sorted(columns):
# Escapes the column name with backticks for safe use in df.query()
col_escaped = f"`{col}`"
item = "" # Initializes item
if pd.isna(record[col]):
item = f"{col_escaped}.isna()" # Now directly in the string to avoid the extra ==
elif is_bool_dtype(dtypes[col]):
item = f"{col_escaped} == {record[col]}"
elif is_numeric_dtype(dtypes[col]):
if medians is None:
operator = rng.choice([">=", "<="])
else:
if record[col] > medians[col]:
operator = ">="
else:
operator = "<="
item = f"{col_escaped} {operator} {record[col]}"
elif isinstance(dtypes[col], pd.CategoricalDtype) and is_numeric_dtype(dtypes[col].categories.dtype):
item = f"{col_escaped} == {record[col]}"
else:
if isinstance(record[col], str):
item = f"{col_escaped} == '{_escape_quotes(record[col])}'"
else:
item = f"{col_escaped} == '{record[col]}'" # Using single quotes for consistency with strings
query.append(item) # No need to concatenate col with item anymore, item is already the complete expression
return " & ".join(query)
- ##############################
def univariate_singling_out_queries(df: pd.DataFrame, n_queries: int) -> List[str]:
"""Generate singling out queries from rare attributes.
Parameters
----------
df: pd.DataFrame
Input dataframe from which queries will be generated.
n_queries: int
Number of queries to generate.
Returns
-------
List[str]
The singling out queries.
"""
queries = []
for col in sorted(df.columns):
col_escaped = f"`{col}`" #Escapes the column name with backticks for safe use in df.query()
if df[col].isna().sum() == 1:
queries.append(f"{col_escaped}.isna()")
if pd.api.types.is_numeric_dtype(df.dtypes[col]):
values = df[col].dropna().sort_values()
if len(values) > 0:
queries.extend([f"{col_escaped} <= {values.iloc[0]}", f"{col_escaped} >= {values.iloc[-1]}"])
counts = df[col].value_counts()
rare_values = counts[counts == 1]
if len(rare_values) > 0:
# _query_expression already handles escaping, but it's good to make sure the passed col is the original one.
queries.extend([_query_expression(col=col, val=val, dtype=df.dtypes[col]) for val in rare_values.index])
rng.shuffle(queries)
so_queries = UniqueSinglingOutQueries()
for query in queries:
so_queries.check_and_append(query, df=df)
if len(so_queries) == n_queries:
break
return so_queries.queries