From 3a7511af573a2ddd8bf6de82fc3a145f9b49e33d Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 11:17:07 +0100 Subject: [PATCH 01/10] feat: Expose a user-friendly version of `FailureInfo._lf` --- dataframely/filter_result.py | 21 +++++++++++++++-- docs/guides/features/serialization.md | 8 +++---- docs/guides/quickstart.md | 24 +++++++++++++++---- tests/column_types/test_list.py | 4 +++- tests/column_types/test_struct.py | 6 ++--- tests/schema/test_filter.py | 34 ++++++++++++++++++++++++++- 6 files changed, 81 insertions(+), 16 deletions(-) diff --git a/dataframely/filter_result.py b/dataframely/filter_result.py index be8ff1ad..efdaa64b 100644 --- a/dataframely/filter_result.py +++ b/dataframely/filter_result.py @@ -111,8 +111,25 @@ def _df(self) -> pl.DataFrame: return self._lf.collect() def invalid(self) -> pl.DataFrame: - """The rows of the original data frame containing the invalid rows.""" - return self._df.drop(self._rule_columns) + """The rows of the original data frame containing the invalid rows. + + For each row, this includes: + 1. All columns of the original data frame. + 2. One column for each rule indicating whether the value of the column + is `valid`, `invalid`, or `unknown`. + + If a rule column has a value of `unknown` for a given row, that means the rule + could not be evaluated reliably. + This may happen when calling :meth:`Collection.filter` with collection-level + filters in addition to member-level rules, or when calling :meth:`Schema.filter` + with `cast=True` and dtype-casting fails for a value. + """ + return self._lf.select( + pl.exclude(self._rule_columns), + pl.col(*self._rule_columns) + .replace_strict({True: "valid", False: "invalid", None: "unknown"}) + .cast(pl.Enum(["valid", "invalid", "unknown"])), + ).collect() def counts(self) -> dict[str, int]: """The number of validation failures for each individual rule. diff --git a/docs/guides/features/serialization.md b/docs/guides/features/serialization.md index e62b71f9..7e7296ac 100644 --- a/docs/guides/features/serialization.md +++ b/docs/guides/features/serialization.md @@ -139,7 +139,7 @@ class HouseSchema(dy.Schema): price = dy.Float64(nullable=False) @dy.rule() - def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr: + def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr: ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms") return (ratio >= 1 / 3) & (ratio <= 3) @@ -190,9 +190,9 @@ json.loads(HouseSchema.serialize()) 'primary_key': False, 'regex': None}}, 'name': 'HouseSchema', - 'rules': {'reasonable_bathroom_to_bedrooom_ratio': {'expr': {'__type__': 'expression', - 'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'}, - 'rule_type': 'Rule'}}, + 'rules': {'reasonable_bathroom_to_bedroom_ratio': {'expr': {'__type__': 'expression', + 'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'}, + 'rule_type': 'Rule'}}, 'versions': {'dataframely': '2.0.0', 'format': '1', 'polars': '1.33.1'}} ``` diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index c6216b91..4f674fb6 100644 --- a/docs/guides/quickstart.md +++ b/docs/guides/quickstart.md @@ -54,13 +54,13 @@ class HouseSchema(dy.Schema): price = dy.Float64(nullable=False) @dy.rule() - def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr: + def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr: ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms") return (ratio >= 1 / 3) & (ratio <= 3) ``` The decorator `@dy.rule()` "registers" the function as a rule using its name (i.e. -`reasonable_bathroom_to_bedrooom_ratio`). +`reasonable_bathroom_to_bedroom_ratio`). The returned expression provides a boolean value for each row of the data which evaluates to `True` whenever the data are valid with respect to this rule. @@ -81,7 +81,7 @@ class HouseSchema(dy.Schema): price = dy.Float64(nullable=False) @dy.rule() - def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr: + def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr: ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms") return (ratio >= 1 / 3) & (ratio <= 3) @@ -189,7 +189,7 @@ Using the `counts` method on the :class:`~dataframely.FailureInfo` object will r ```python { - "reasonable_bathroom_to_bedrooom_ratio": 1, + "reasonable_bathroom_to_bedroom_ratio": 1, "minimum_zip_code_count": 2, "zip_code|min_length": 1, "num_bedrooms|nullability": 2, @@ -205,6 +205,19 @@ failed_df = failure.invalid() This information tends to be very useful in tracking down issues with the data, both in productive systems and analytics environments. +```{comment} +New in `dataframely` v2.8.0: The `FailureInfo.invalid()` method now returns additional columns indicating which rules were violated for each row. +``` + +For the example above, `failed_df` would look as follows (we omitted some columns for readability): + +| zip_code | num_bedrooms | num_bathrooms | price | reasonable_bathroom_to_bedroom... | minimum_zip_code_count | zip_code\|min_length | num_bedrooms\|nullability | ... | +| -------- | ------------ | ------------- | ------ | --------------------------------- | ---------------------- | -------------------- | ------------------------- | --- | +| 1 | 1 | 1 | 50000 | valid | invalid | invalid | valid | | +| 213 | null | 1 | 80000 | valid | valid | valid | invalid | | +| 123 | null | 0 | 60000 | valid | invalid | valid | invalid | | +| 213 | 2 | 8 | 160000 | invalid | valid | valid | valid | | + ## Type casting In rare cases, you might already be _absolutely certain_ that a data frame is valid with @@ -229,7 +242,8 @@ df_concat = HouseSchema.cast(pl.concat([df1, df2])) Lastly, `dataframely` schemas can be used to integrate with external tools: - `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing -- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be used to +- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be + used to create SQL tables using types and constraints in line with the schema - `HouseSchema.to_pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with appropriate column dtypes and nullability information diff --git a/tests/column_types/test_list.py b/tests/column_types/test_list.py index a430a6e7..c91f3c09 100644 --- a/tests/column_types/test_list.py +++ b/tests/column_types/test_list.py @@ -89,7 +89,9 @@ def test_nested_list_with_rules() -> None: df = pl.DataFrame({"a": [[["ab"]], [["a"]], [[None]]]}) _, failures = schema.filter(df) # NOTE: `validation_mask` currently fails for multiply nested lists - assert failures.invalid().to_dict(as_series=False) == {"a": [[["a"]], [[None]]]} + assert failures.invalid().select("a").to_dict(as_series=False) == { + "a": [[["a"]], [[None]]] + } assert failures.counts() == { "a|inner_inner_nullability": 1, "a|inner_inner_min_length": 1, diff --git a/tests/column_types/test_struct.py b/tests/column_types/test_struct.py index 4d2375a7..3f91382c 100644 --- a/tests/column_types/test_struct.py +++ b/tests/column_types/test_struct.py @@ -109,7 +109,7 @@ def test_struct_with_pk() -> None: {"s": [{"a": "foo", "b": 1}, {"a": "bar", "b": 1}, {"a": "bar", "b": 1}]} ) _, failures = schema.filter(df) - assert failures.invalid().to_dict(as_series=False) == { + assert failures.invalid().select("s").to_dict(as_series=False) == { "s": [{"a": "bar", "b": 1}, {"a": "bar", "b": 1}] } assert failures.counts() == {"primary_key": 2} @@ -121,7 +121,7 @@ def test_struct_with_rules() -> None: ) df = pl.DataFrame({"s": [{"a": "ab"}, {"a": "a"}, {"a": None}]}) _, failures = schema.filter(df) - assert failures.invalid().to_dict(as_series=False) == { + assert failures.invalid().select("s").to_dict(as_series=False) == { "s": [{"a": "a"}, {"a": None}] } assert failures.counts() == {"s|inner_a_nullability": 1, "s|inner_a_min_length": 1} @@ -140,7 +140,7 @@ def test_nested_struct_with_rules() -> None: {"s1": [{"s2": {"a": "ab"}}, {"s2": {"a": "a"}}, {"s2": {"a": None}}]} ) _, failures = schema.filter(df) - assert failures.invalid().to_dict(as_series=False) == { + assert failures.invalid().select("s1").to_dict(as_series=False) == { "s1": [{"s2": {"a": "a"}}, {"s2": {"a": None}}] } assert failures.counts() == { diff --git a/tests/schema/test_filter.py b/tests/schema/test_filter.py index 6aa1e3d7..2bca127e 100644 --- a/tests/schema/test_filter.py +++ b/tests/schema/test_filter.py @@ -220,7 +220,7 @@ def test_filter_failure_info_original_dtype(eager: bool) -> None: assert failures.counts() == {"a|dtype": 1} assert failures.invalid().get_column("a").to_list() == [300] - assert failures.invalid().dtypes == [pl.Int64] + assert failures.invalid().select("a").dtypes == [pl.Int64] @pytest.mark.parametrize("eager", [True, False]) @@ -243,3 +243,35 @@ def test_filter_maintain_order(eager: bool) -> None: ) out, _ = _filter_and_collect(schema, df, cast=True, eager=eager) assert out.get_column("a").is_sorted() + + +@pytest.mark.parametrize("eager", [True, False]) +def test_filter_invalid_rows(eager: bool) -> None: + df = pl.DataFrame( + { + "a": [2, 2], + "b": ["bar", "foobar"], + } + ) + _, fails = _filter_and_collect(MySchema, df, cast=True, eager=eager) + + assert fails.invalid().to_dicts() == [ + { + "a": 2, + "b": "bar", + "a|dtype": "valid", + "a|nullability": "valid", + "b|dtype": "valid", + "b|max_length": "valid", + "primary_key": "invalid", + }, + { + "a": 2, + "b": "foobar", + "a|dtype": "valid", + "a|nullability": "valid", + "b|dtype": "valid", + "b|max_length": "invalid", + "primary_key": "invalid", + }, + ] From 702834a702a8a3df43112b197f83c9e78b5af0ee Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 12:22:18 +0100 Subject: [PATCH 02/10] review --- dataframely/filter_result.py | 4 +- docs/guides/quickstart.md | 2 +- tests/collection/test_filter_validate.py | 54 ++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 3 deletions(-) diff --git a/dataframely/filter_result.py b/dataframely/filter_result.py index efdaa64b..0b673307 100644 --- a/dataframely/filter_result.py +++ b/dataframely/filter_result.py @@ -124,12 +124,12 @@ def invalid(self) -> pl.DataFrame: filters in addition to member-level rules, or when calling :meth:`Schema.filter` with `cast=True` and dtype-casting fails for a value. """ - return self._lf.select( + return self._df.select( pl.exclude(self._rule_columns), pl.col(*self._rule_columns) .replace_strict({True: "valid", False: "invalid", None: "unknown"}) .cast(pl.Enum(["valid", "invalid", "unknown"])), - ).collect() + ) def counts(self) -> dict[str, int]: """The number of validation failures for each individual rule. diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index 4f674fb6..7cc25af8 100644 --- a/docs/guides/quickstart.md +++ b/docs/guides/quickstart.md @@ -205,7 +205,7 @@ failed_df = failure.invalid() This information tends to be very useful in tracking down issues with the data, both in productive systems and analytics environments. -```{comment} +```{note} New in `dataframely` v2.8.0: The `FailureInfo.invalid()` method now returns additional columns indicating which rules were violated for each row. ``` diff --git a/tests/collection/test_filter_validate.py b/tests/collection/test_filter_validate.py index fe1afce0..a59ce59a 100644 --- a/tests/collection/test_filter_validate.py +++ b/tests/collection/test_filter_validate.py @@ -304,3 +304,57 @@ def test_maintain_order() -> None: out = MyShufflingCollection.validate(out.to_dict()) assert out.first.select("a").collect().to_series().is_sorted() assert out.second.select("a").collect().to_series().is_sorted() + + +def test_unknown_rule_outcomes( + data_without_filter_with_rule_violation: tuple[pl.DataFrame, pl.DataFrame], +) -> None: + _, fails = MyCollection.filter( + { + "first": data_without_filter_with_rule_violation[0], + "second": data_without_filter_with_rule_violation[1], + } + ) + assert fails["first"].invalid().to_dicts() == [ + { + "a": 1, + "b": 1, + "a|nullability": "valid", + "b|nullability": "valid", + "equal_primary_key": "unknown", + "first_b_greater_second_b": "unknown", + "primary_key": "invalid", + }, + { + "a": 1, + "b": 3, + "a|nullability": "valid", + "b|nullability": "valid", + "equal_primary_key": "unknown", + "first_b_greater_second_b": "unknown", + "primary_key": "invalid", + }, + ] + + assert fails["second"].invalid().to_dicts() == [ + { + "a": 1, + "b": 0, + "primary_key": "valid", + "a|nullability": "valid", + "b|nullability": "valid", + "b|min": "invalid", + "equal_primary_key": "unknown", + "first_b_greater_second_b": "unknown", + }, + { + "a": 3, + "b": 2, + "primary_key": "unknown", + "a|nullability": "unknown", + "b|nullability": "unknown", + "b|min": "unknown", + "equal_primary_key": "invalid", + "first_b_greater_second_b": "valid", + }, + ] From e082961aff405cd9f2f49f44764fc2d31311969d Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 17:17:02 +0100 Subject: [PATCH 03/10] trigger docs build From 07533f289edb6dd70e534abeb9bc3984f9d33cdf Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 18:17:35 +0100 Subject: [PATCH 04/10] rename --- dataframely/filter_result.py | 7 ++++++- tests/collection/test_filter_validate.py | 4 ++-- tests/schema/test_filter.py | 4 ++-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/dataframely/filter_result.py b/dataframely/filter_result.py index 0b673307..3227d902 100644 --- a/dataframely/filter_result.py +++ b/dataframely/filter_result.py @@ -111,7 +111,12 @@ def _df(self) -> pl.DataFrame: return self._lf.collect() def invalid(self) -> pl.DataFrame: - """The rows of the original data frame containing the invalid rows. + """The rows of the original data frame containing the invalid rows.""" + return self._df.drop(self._rule_columns) + + def violation_details(self) -> pl.DataFrame: + """Same as :meth:`invalid` but with additional columns indicating the results of + each individual rule. For each row, this includes: 1. All columns of the original data frame. diff --git a/tests/collection/test_filter_validate.py b/tests/collection/test_filter_validate.py index a59ce59a..d18e0747 100644 --- a/tests/collection/test_filter_validate.py +++ b/tests/collection/test_filter_validate.py @@ -315,7 +315,7 @@ def test_unknown_rule_outcomes( "second": data_without_filter_with_rule_violation[1], } ) - assert fails["first"].invalid().to_dicts() == [ + assert fails["first"].violation_details().to_dicts() == [ { "a": 1, "b": 1, @@ -336,7 +336,7 @@ def test_unknown_rule_outcomes( }, ] - assert fails["second"].invalid().to_dicts() == [ + assert fails["second"].violation_details().to_dicts() == [ { "a": 1, "b": 0, diff --git a/tests/schema/test_filter.py b/tests/schema/test_filter.py index 2bca127e..162df988 100644 --- a/tests/schema/test_filter.py +++ b/tests/schema/test_filter.py @@ -246,7 +246,7 @@ def test_filter_maintain_order(eager: bool) -> None: @pytest.mark.parametrize("eager", [True, False]) -def test_filter_invalid_rows(eager: bool) -> None: +def test_filter_violation_details(eager: bool) -> None: df = pl.DataFrame( { "a": [2, 2], @@ -255,7 +255,7 @@ def test_filter_invalid_rows(eager: bool) -> None: ) _, fails = _filter_and_collect(MySchema, df, cast=True, eager=eager) - assert fails.invalid().to_dicts() == [ + assert fails.violation_details().to_dicts() == [ { "a": 2, "b": "bar", From 45c92f2f37ce5707578bd37dd4ac0a6ddc75e425 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 18:20:54 +0100 Subject: [PATCH 05/10] doc --- docs/guides/quickstart.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index 7cc25af8..241e6fee 100644 --- a/docs/guides/quickstart.md +++ b/docs/guides/quickstart.md @@ -206,10 +206,10 @@ This information tends to be very useful in tracking down issues with the data, both in productive systems and analytics environments. ```{note} -New in `dataframely` v2.8.0: The `FailureInfo.invalid()` method now returns additional columns indicating which rules were violated for each row. +New in `dataframely` v2.8.0: The `FailureInfo.violation_details()` method now returns additional columns indicating which rules were violated for each row. ``` -For the example above, `failed_df` would look as follows (we omitted some columns for readability): +For the example above, `failure.violation_details()` would look as follows (we omitted some columns for readability): | zip_code | num_bedrooms | num_bathrooms | price | reasonable_bathroom_to_bedroom... | minimum_zip_code_count | zip_code\|min_length | num_bedrooms\|nullability | ... | | -------- | ------------ | ------------- | ------ | --------------------------------- | ---------------------- | -------------------- | ------------------------- | --- | From 41f66b5115c1addea5df1cdb4d696aaf847cdf8d Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 18:23:16 +0100 Subject: [PATCH 06/10] revert --- tests/column_types/test_list.py | 4 +--- tests/column_types/test_struct.py | 6 +++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/tests/column_types/test_list.py b/tests/column_types/test_list.py index c91f3c09..a430a6e7 100644 --- a/tests/column_types/test_list.py +++ b/tests/column_types/test_list.py @@ -89,9 +89,7 @@ def test_nested_list_with_rules() -> None: df = pl.DataFrame({"a": [[["ab"]], [["a"]], [[None]]]}) _, failures = schema.filter(df) # NOTE: `validation_mask` currently fails for multiply nested lists - assert failures.invalid().select("a").to_dict(as_series=False) == { - "a": [[["a"]], [[None]]] - } + assert failures.invalid().to_dict(as_series=False) == {"a": [[["a"]], [[None]]]} assert failures.counts() == { "a|inner_inner_nullability": 1, "a|inner_inner_min_length": 1, diff --git a/tests/column_types/test_struct.py b/tests/column_types/test_struct.py index 3f91382c..4d2375a7 100644 --- a/tests/column_types/test_struct.py +++ b/tests/column_types/test_struct.py @@ -109,7 +109,7 @@ def test_struct_with_pk() -> None: {"s": [{"a": "foo", "b": 1}, {"a": "bar", "b": 1}, {"a": "bar", "b": 1}]} ) _, failures = schema.filter(df) - assert failures.invalid().select("s").to_dict(as_series=False) == { + assert failures.invalid().to_dict(as_series=False) == { "s": [{"a": "bar", "b": 1}, {"a": "bar", "b": 1}] } assert failures.counts() == {"primary_key": 2} @@ -121,7 +121,7 @@ def test_struct_with_rules() -> None: ) df = pl.DataFrame({"s": [{"a": "ab"}, {"a": "a"}, {"a": None}]}) _, failures = schema.filter(df) - assert failures.invalid().select("s").to_dict(as_series=False) == { + assert failures.invalid().to_dict(as_series=False) == { "s": [{"a": "a"}, {"a": None}] } assert failures.counts() == {"s|inner_a_nullability": 1, "s|inner_a_min_length": 1} @@ -140,7 +140,7 @@ def test_nested_struct_with_rules() -> None: {"s1": [{"s2": {"a": "ab"}}, {"s2": {"a": "a"}}, {"s2": {"a": None}}]} ) _, failures = schema.filter(df) - assert failures.invalid().select("s1").to_dict(as_series=False) == { + assert failures.invalid().to_dict(as_series=False) == { "s1": [{"s2": {"a": "a"}}, {"s2": {"a": None}}] } assert failures.counts() == { From 290ba94ce385d0830ac9658ba6c5936de4a94ff0 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 18:32:57 +0100 Subject: [PATCH 07/10] doc --- docs/guides/quickstart.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index 241e6fee..0a2066d1 100644 --- a/docs/guides/quickstart.md +++ b/docs/guides/quickstart.md @@ -243,8 +243,7 @@ Lastly, `dataframely` schemas can be used to integrate with external tools: - `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing - `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be - used to - create SQL tables using types and constraints in line with the schema + used to create SQL tables using types and constraints in line with the schema - `HouseSchema.to_pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with appropriate column dtypes and nullability information - You can use `dy.DataFrame[HouseSchema]` (or the `LazyFrame` equivalent) as fields in From 7a37afcabfe822e23e349cb57e5264f72cdcb931 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 18:33:17 +0100 Subject: [PATCH 08/10] fix --- docs/guides/quickstart.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index 0a2066d1..c11e2230 100644 --- a/docs/guides/quickstart.md +++ b/docs/guides/quickstart.md @@ -242,8 +242,8 @@ df_concat = HouseSchema.cast(pl.concat([df1, df2])) Lastly, `dataframely` schemas can be used to integrate with external tools: - `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing -- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be - used to create SQL tables using types and constraints in line with the schema +- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be used to + create SQL tables using types and constraints in line with the schema - `HouseSchema.to_pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with appropriate column dtypes and nullability information - You can use `dy.DataFrame[HouseSchema]` (or the `LazyFrame` equivalent) as fields in From 3677d3ba199a4ba4f363b326515bb9af09c70623 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 18:34:24 +0100 Subject: [PATCH 09/10] revert --- tests/schema/test_filter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/schema/test_filter.py b/tests/schema/test_filter.py index 162df988..18a37e79 100644 --- a/tests/schema/test_filter.py +++ b/tests/schema/test_filter.py @@ -220,7 +220,7 @@ def test_filter_failure_info_original_dtype(eager: bool) -> None: assert failures.counts() == {"a|dtype": 1} assert failures.invalid().get_column("a").to_list() == [300] - assert failures.invalid().select("a").dtypes == [pl.Int64] + assert failures.invalid().dtypes == [pl.Int64] @pytest.mark.parametrize("eager", [True, False]) From fde12c998061b553e26782170f3a2893b84076f2 Mon Sep 17 00:00:00 2001 From: Andreas Albert Date: Mon, 16 Feb 2026 18:37:05 +0100 Subject: [PATCH 10/10] details --- dataframely/filter_result.py | 9 +++++---- docs/guides/quickstart.md | 7 ++++--- tests/collection/test_filter_validate.py | 4 ++-- tests/schema/test_filter.py | 4 ++-- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/dataframely/filter_result.py b/dataframely/filter_result.py index 3227d902..f2f88b75 100644 --- a/dataframely/filter_result.py +++ b/dataframely/filter_result.py @@ -114,7 +114,7 @@ def invalid(self) -> pl.DataFrame: """The rows of the original data frame containing the invalid rows.""" return self._df.drop(self._rule_columns) - def violation_details(self) -> pl.DataFrame: + def details(self) -> pl.DataFrame: """Same as :meth:`invalid` but with additional columns indicating the results of each individual rule. @@ -131,9 +131,10 @@ def violation_details(self) -> pl.DataFrame: """ return self._df.select( pl.exclude(self._rule_columns), - pl.col(*self._rule_columns) - .replace_strict({True: "valid", False: "invalid", None: "unknown"}) - .cast(pl.Enum(["valid", "invalid", "unknown"])), + pl.col(*self._rule_columns).replace_strict( + {True: "valid", False: "invalid", None: "unknown"}, + return_dtype=pl.Enum(["valid", "invalid", "unknown"]), + ), ) def counts(self) -> dict[str, int]: diff --git a/docs/guides/quickstart.md b/docs/guides/quickstart.md index c11e2230..4eb341ae 100644 --- a/docs/guides/quickstart.md +++ b/docs/guides/quickstart.md @@ -206,10 +206,10 @@ This information tends to be very useful in tracking down issues with the data, both in productive systems and analytics environments. ```{note} -New in `dataframely` v2.8.0: The `FailureInfo.violation_details()` method now returns additional columns indicating which rules were violated for each row. +New in `dataframely` v2.8.0: The `FailureInfo.details()` method now returns additional columns indicating which rules were violated for each row. ``` -For the example above, `failure.violation_details()` would look as follows (we omitted some columns for readability): +For the example above, `failure.details()` would look as follows (we omitted some columns for readability): | zip_code | num_bedrooms | num_bathrooms | price | reasonable_bathroom_to_bedroom... | minimum_zip_code_count | zip_code\|min_length | num_bedrooms\|nullability | ... | | -------- | ------------ | ------------- | ------ | --------------------------------- | ---------------------- | -------------------- | ------------------------- | --- | @@ -242,7 +242,8 @@ df_concat = HouseSchema.cast(pl.concat([df1, df2])) Lastly, `dataframely` schemas can be used to integrate with external tools: - `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing -- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be used to +- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be + used to create SQL tables using types and constraints in line with the schema - `HouseSchema.to_pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with appropriate column dtypes and nullability information diff --git a/tests/collection/test_filter_validate.py b/tests/collection/test_filter_validate.py index d18e0747..926ea3a9 100644 --- a/tests/collection/test_filter_validate.py +++ b/tests/collection/test_filter_validate.py @@ -315,7 +315,7 @@ def test_unknown_rule_outcomes( "second": data_without_filter_with_rule_violation[1], } ) - assert fails["first"].violation_details().to_dicts() == [ + assert fails["first"].details().to_dicts() == [ { "a": 1, "b": 1, @@ -336,7 +336,7 @@ def test_unknown_rule_outcomes( }, ] - assert fails["second"].violation_details().to_dicts() == [ + assert fails["second"].details().to_dicts() == [ { "a": 1, "b": 0, diff --git a/tests/schema/test_filter.py b/tests/schema/test_filter.py index 18a37e79..99557e79 100644 --- a/tests/schema/test_filter.py +++ b/tests/schema/test_filter.py @@ -246,7 +246,7 @@ def test_filter_maintain_order(eager: bool) -> None: @pytest.mark.parametrize("eager", [True, False]) -def test_filter_violation_details(eager: bool) -> None: +def test_filter_details(eager: bool) -> None: df = pl.DataFrame( { "a": [2, 2], @@ -255,7 +255,7 @@ def test_filter_violation_details(eager: bool) -> None: ) _, fails = _filter_and_collect(MySchema, df, cast=True, eager=eager) - assert fails.violation_details().to_dicts() == [ + assert fails.details().to_dicts() == [ { "a": 2, "b": "bar",