diff --git a/dataframely/filter_result.py b/dataframely/filter_result.py
index be8ff1ad..f2f88b75 100644
--- a/dataframely/filter_result.py
+++ b/dataframely/filter_result.py
@@ -114,6 +114,39 @@ def invalid(self) -> pl.DataFrame:
         """The rows of the original data frame containing the invalid rows."""
         return self._df.drop(self._rule_columns)
 
+    def details(self) -> pl.DataFrame:
+        """Return the invalid rows along with the outcome of each individual rule.
+
+        Same as :meth:`invalid` but with one additional column per rule.
+
+        For each row, this includes:
+
+        1. All columns of the original data frame.
+        2. One column for each rule indicating whether the rule's outcome for the
+           row is `valid`, `invalid`, or `unknown`.
+
+        If a rule column has a value of `unknown` for a given row, that means the rule
+        could not be evaluated reliably.
+        This may happen when calling :meth:`Collection.filter` with collection-level
+        filters in addition to member-level rules, or when calling :meth:`Schema.filter`
+        with `cast=True` and dtype-casting fails for a value.
+
+        Returns:
+            The invalid rows with their original columns first, followed by the rule
+            columns as `pl.Enum` values.
+        """
+        if not self._rule_columns:
+            # `pl.col()` requires at least one name, so unpacking an empty list
+            # of rule columns below would raise; there is nothing to annotate.
+            return self.invalid()
+        return self._df.select(
+            pl.exclude(self._rule_columns),
+            pl.col(*self._rule_columns).replace_strict(
+                {True: "valid", False: "invalid", None: "unknown"},
+                return_dtype=pl.Enum(["valid", "invalid", "unknown"]),
+            ),
+        )
+
     def counts(self) -> dict[str, int]:
         """The number of validation failures for each individual rule.
 
diff --git a/docs/guides/features/serialization.md b/docs/guides/features/serialization.md index e62b71f9..7e7296ac 100644 --- a/docs/guides/features/serialization.md +++ b/docs/guides/features/serialization.md @@ -139,7 +139,7 @@ class HouseSchema(dy.Schema): price = dy.Float64(nullable=False) @dy.rule() - def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr: + def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr: ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms") return (ratio >= 1 / 3) & (ratio <= 3) @@ -190,9 +190,9 @@ json.loads(HouseSchema.serialize()) 'primary_key': False, 'regex': None}}, 'name': 'HouseSchema', - 'rules': {'reasonable_bathroom_to_bedrooom_ratio': {'expr': {'__type__': 'expression', - 'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'}, - 'rule_type': 'Rule'}}, + 'rules': {'reasonable_bathroom_to_bedroom_ratio': {'expr': {'__type__': 'expression', + 'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'}, + 'rule_type': 'Rule'}}, 'versions': {'dataframely': '2.0.0', 'format': '1', 'polars': '1.33.1'}} ``` diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index c6216b91..4eb341ae 100644 --- a/docs/guides/quickstart.md +++ 
b/docs/guides/quickstart.md @@ -54,13 +54,13 @@ class HouseSchema(dy.Schema): price = dy.Float64(nullable=False) @dy.rule() - def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr: + def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr: ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms") return (ratio >= 1 / 3) & (ratio <= 3) ``` The decorator `@dy.rule()` "registers" the function as a rule using its name (i.e. -`reasonable_bathroom_to_bedrooom_ratio`). +`reasonable_bathroom_to_bedroom_ratio`). The returned expression provides a boolean value for each row of the data which evaluates to `True` whenever the data are valid with respect to this rule. @@ -81,7 +81,7 @@ class HouseSchema(dy.Schema): price = dy.Float64(nullable=False) @dy.rule() - def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr: + def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr: ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms") return (ratio >= 1 / 3) & (ratio <= 3) @@ -189,7 +189,7 @@ Using the `counts` method on the :class:`~dataframely.FailureInfo` object will r ```python { - "reasonable_bathroom_to_bedrooom_ratio": 1, + "reasonable_bathroom_to_bedroom_ratio": 1, "minimum_zip_code_count": 2, "zip_code|min_length": 1, "num_bedrooms|nullability": 2, @@ -205,6 +205,19 @@ failed_df = failure.invalid() This information tends to be very useful in tracking down issues with the data, both in productive systems and analytics environments. +```{note} +New in `dataframely` v2.8.0: The `FailureInfo.details()` method now returns additional columns indicating which rules were violated for each row. +``` + +For the example above, `failure.details()` would look as follows (we omitted some columns for readability): + +| zip_code | num_bedrooms | num_bathrooms | price | reasonable_bathroom_to_bedroom... | minimum_zip_code_count | zip_code\|min_length | num_bedrooms\|nullability | ... 
| +| -------- | ------------ | ------------- | ------ | --------------------------------- | ---------------------- | -------------------- | ------------------------- | --- | +| 1 | 1 | 1 | 50000 | valid | invalid | invalid | valid | | +| 213 | null | 1 | 80000 | valid | valid | valid | invalid | | +| 123 | null | 0 | 60000 | valid | invalid | valid | invalid | | +| 213 | 2 | 8 | 160000 | invalid | valid | valid | valid | | + ## Type casting In rare cases, you might already be _absolutely certain_ that a data frame is valid with @@ -229,7 +242,8 @@ df_concat = HouseSchema.cast(pl.concat([df1, df2])) Lastly, `dataframely` schemas can be used to integrate with external tools: - `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing -- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be used to +- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be + used to create SQL tables using types and constraints in line with the schema - `HouseSchema.to_pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with appropriate column dtypes and nullability information diff --git a/tests/collection/test_filter_validate.py b/tests/collection/test_filter_validate.py index fe1afce0..926ea3a9 100644 --- a/tests/collection/test_filter_validate.py +++ b/tests/collection/test_filter_validate.py @@ -304,3 +304,57 @@ def test_maintain_order() -> None: out = MyShufflingCollection.validate(out.to_dict()) assert out.first.select("a").collect().to_series().is_sorted() assert out.second.select("a").collect().to_series().is_sorted() + + +def test_unknown_rule_outcomes( + data_without_filter_with_rule_violation: tuple[pl.DataFrame, pl.DataFrame], +) -> None: + _, fails = MyCollection.filter( + { + "first": data_without_filter_with_rule_violation[0], + "second": 
data_without_filter_with_rule_violation[1], + } + ) + assert fails["first"].details().to_dicts() == [ + { + "a": 1, + "b": 1, + "a|nullability": "valid", + "b|nullability": "valid", + "equal_primary_key": "unknown", + "first_b_greater_second_b": "unknown", + "primary_key": "invalid", + }, + { + "a": 1, + "b": 3, + "a|nullability": "valid", + "b|nullability": "valid", + "equal_primary_key": "unknown", + "first_b_greater_second_b": "unknown", + "primary_key": "invalid", + }, + ] + + assert fails["second"].details().to_dicts() == [ + { + "a": 1, + "b": 0, + "primary_key": "valid", + "a|nullability": "valid", + "b|nullability": "valid", + "b|min": "invalid", + "equal_primary_key": "unknown", + "first_b_greater_second_b": "unknown", + }, + { + "a": 3, + "b": 2, + "primary_key": "unknown", + "a|nullability": "unknown", + "b|nullability": "unknown", + "b|min": "unknown", + "equal_primary_key": "invalid", + "first_b_greater_second_b": "valid", + }, + ] diff --git a/tests/schema/test_filter.py b/tests/schema/test_filter.py index 6aa1e3d7..99557e79 100644 --- a/tests/schema/test_filter.py +++ b/tests/schema/test_filter.py @@ -243,3 +243,35 @@ def test_filter_maintain_order(eager: bool) -> None: ) out, _ = _filter_and_collect(schema, df, cast=True, eager=eager) assert out.get_column("a").is_sorted() + + +@pytest.mark.parametrize("eager", [True, False]) +def test_filter_details(eager: bool) -> None: + df = pl.DataFrame( + { + "a": [2, 2], + "b": ["bar", "foobar"], + } + ) + _, fails = _filter_and_collect(MySchema, df, cast=True, eager=eager) + + assert fails.details().to_dicts() == [ + { + "a": 2, + "b": "bar", + "a|dtype": "valid", + "a|nullability": "valid", + "b|dtype": "valid", + "b|max_length": "valid", + "primary_key": "invalid", + }, + { + "a": 2, + "b": "foobar", + "a|dtype": "valid", + "a|nullability": "valid", + "b|dtype": "valid", + "b|max_length": "invalid", + "primary_key": "invalid", + }, + ]