Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions dataframely/filter_result.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,29 @@ def invalid(self) -> pl.DataFrame:
"""The rows of the original data frame containing the invalid rows."""
return self._df.drop(self._rule_columns)

def details(self) -> pl.DataFrame:
    """Return the invalid rows together with the outcome of every individual rule.

    Same as :meth:`invalid` but with additional columns indicating the results of
    each individual rule.

    For each row, this includes:
      1. All columns of the original data frame.
      2. One column for each rule indicating whether the outcome of that rule
         for the row is `valid`, `invalid`, or `unknown`.

    If a rule column has a value of `unknown` for a given row, that means the rule
    could not be evaluated reliably.
    This may happen when calling :meth:`Collection.filter` with collection-level
    filters in addition to member-level rules, or when calling :meth:`Schema.filter`
    with `cast=True` and dtype-casting fails for a value.

    Returns:
        A data frame with the non-rule columns followed by one
        `pl.Enum(["valid", "invalid", "unknown"])` column per rule.
    """
    rule_columns = self._rule_columns
    if not rule_columns:
        # `pl.col(*names)` requires at least one name. Without any rule columns
        # there is nothing to annotate, so the result is identical to `invalid()`.
        return self.invalid()

    outcome_labels = ["valid", "invalid", "unknown"]
    return self._df.select(
        pl.exclude(rule_columns),
        # Rule columns hold booleans; a null outcome is mapped to "unknown".
        pl.col(*rule_columns).replace_strict(
            {True: "valid", False: "invalid", None: "unknown"},
            return_dtype=pl.Enum(outcome_labels),
        ),
    )

def counts(self) -> dict[str, int]:
"""The number of validation failures for each individual rule.

Expand Down
8 changes: 4 additions & 4 deletions docs/guides/features/serialization.md
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ class HouseSchema(dy.Schema):
price = dy.Float64(nullable=False)

@dy.rule()
def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
return (ratio >= 1 / 3) & (ratio <= 3)

Expand Down Expand Up @@ -190,9 +190,9 @@ json.loads(HouseSchema.serialize())
'primary_key': False,
'regex': None}},
'name': 'HouseSchema',
'rules': {'reasonable_bathroom_to_bedrooom_ratio': {'expr': {'__type__': 'expression',
'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'},
'rule_type': 'Rule'}},
'rules': {'reasonable_bathroom_to_bedroom_ratio': {'expr': {'__type__': 'expression',
'value': 'gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEd0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGlRmxvYXTLP9VVVVVVVVWib3CjQW5kpXJpZ2h0gapCaW5hcnlFeHByg6RsZWZ0gapCaW5hcnlFeHByg6RsZWZ0gaZDb2x1bW6tbnVtX2JhdGhyb29tc6JvcKpUcnVlRGl2aWRlpXJpZ2h0gaZDb2x1bW6sbnVtX2JlZHJvb21zom9wpEx0RXGlcmlnaHSBp0xpdGVyYWyBo0R5boGjSW50xBAAAAAAAAAAAAAAAAAAAAAD'},
'rule_type': 'Rule'}},
'versions': {'dataframely': '2.0.0', 'format': '1', 'polars': '1.33.1'}}
```

Expand Down
24 changes: 19 additions & 5 deletions docs/guides/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,13 +54,13 @@ class HouseSchema(dy.Schema):
price = dy.Float64(nullable=False)

@dy.rule()
def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
return (ratio >= 1 / 3) & (ratio <= 3)
```

The decorator `@dy.rule()` "registers" the function as a rule using its name (i.e.
`reasonable_bathroom_to_bedrooom_ratio`).
`reasonable_bathroom_to_bedroom_ratio`).
The returned expression provides a boolean value for each row of the data which evaluates to `True` whenever the data
are valid with respect to this rule.

Expand All @@ -81,7 +81,7 @@ class HouseSchema(dy.Schema):
price = dy.Float64(nullable=False)

@dy.rule()
def reasonable_bathroom_to_bedrooom_ratio(cls) -> pl.Expr:
def reasonable_bathroom_to_bedroom_ratio(cls) -> pl.Expr:
ratio = pl.col("num_bathrooms") / pl.col("num_bedrooms")
return (ratio >= 1 / 3) & (ratio <= 3)

Expand Down Expand Up @@ -189,7 +189,7 @@ Using the `counts` method on the :class:`~dataframely.FailureInfo` object will r

```python
{
"reasonable_bathroom_to_bedrooom_ratio": 1,
"reasonable_bathroom_to_bedroom_ratio": 1,
"minimum_zip_code_count": 2,
"zip_code|min_length": 1,
"num_bedrooms|nullability": 2,
Expand All @@ -205,6 +205,19 @@ failed_df = failure.invalid()
This information tends to be very useful in tracking down issues with the data,
both in production systems and analytics environments.

```{note}
New in `dataframely` v2.8.0: The new `FailureInfo.details()` method returns the invalid rows along with one additional column per rule, indicating which rules were violated for each row.
```

For the example above, `failure.details()` would look as follows (we omitted some columns for readability):

| zip_code | num_bedrooms | num_bathrooms | price | reasonable_bathroom_to_bedroom... | minimum_zip_code_count | zip_code\|min_length | num_bedrooms\|nullability | ... |
| -------- | ------------ | ------------- | ------ | --------------------------------- | ---------------------- | -------------------- | ------------------------- | --- |
| 1 | 1 | 1 | 50000 | valid | invalid | invalid | valid | |
| 213 | null | 1 | 80000 | valid | valid | valid | invalid | |
| 123 | null | 0 | 60000 | valid | invalid | valid | invalid | |
| 213 | 2 | 8 | 160000 | invalid | valid | valid | valid | |

## Type casting

In rare cases, you might already be _absolutely certain_ that a data frame is valid with
Expand All @@ -229,7 +242,8 @@ df_concat = HouseSchema.cast(pl.concat([df1, df2]))
Lastly, `dataframely` schemas can be used to integrate with external tools:

- `HouseSchema.create_empty()` creates an empty `dy.DataFrame[HouseSchema]` that can be used for testing
- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be used to
- `HouseSchema.to_sqlalchemy_columns()` provides a list of [sqlalchemy](https://www.sqlalchemy.org) columns that can be
used to
create SQL tables using types and constraints in line with the schema
- `HouseSchema.to_pyarrow_schema()` provides a [pyarrow](https://arrow.apache.org/docs/python/index.html) schema with
appropriate column dtypes and nullability information
Expand Down
54 changes: 54 additions & 0 deletions tests/collection/test_filter_validate.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,57 @@ def test_maintain_order() -> None:
out = MyShufflingCollection.validate(out.to_dict())
assert out.first.select("a").collect().to_series().is_sorted()
assert out.second.select("a").collect().to_series().is_sorted()


def test_unknown_rule_outcomes(
    data_without_filter_with_rule_violation: tuple[pl.DataFrame, pl.DataFrame],
) -> None:
    """Check the per-rule details reported for both members of ``MyCollection``.

    The expected values include ``"unknown"`` entries next to ``"valid"`` and
    ``"invalid"`` ones for both the member-level rules and the collection-level
    filters.
    """
    first_df, second_df = data_without_filter_with_rule_violation
    _, fails = MyCollection.filter({"first": first_df, "second": second_df})

    # Both failing rows of `first` share every outcome and differ only in `b`.
    expected_first = [
        {
            "a": 1,
            "b": b_value,
            "a|nullability": "valid",
            "b|nullability": "valid",
            "equal_primary_key": "unknown",
            "first_b_greater_second_b": "unknown",
            "primary_key": "invalid",
        }
        for b_value in (1, 3)
    ]
    actual_first = fails["first"].details().to_dicts()
    assert actual_first == expected_first

    # First row: member-level rules known, collection-level filters unknown.
    member_rule_failure = {
        "a": 1,
        "b": 0,
        "primary_key": "valid",
        "a|nullability": "valid",
        "b|nullability": "valid",
        "b|min": "invalid",
        "equal_primary_key": "unknown",
        "first_b_greater_second_b": "unknown",
    }
    # Second row: collection-level filters known, member-level rules unknown.
    collection_filter_failure = {
        "a": 3,
        "b": 2,
        "primary_key": "unknown",
        "a|nullability": "unknown",
        "b|nullability": "unknown",
        "b|min": "unknown",
        "equal_primary_key": "invalid",
        "first_b_greater_second_b": "valid",
    }
    actual_second = fails["second"].details().to_dicts()
    assert actual_second == [member_rule_failure, collection_filter_failure]
32 changes: 32 additions & 0 deletions tests/schema/test_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,3 +243,35 @@ def test_filter_maintain_order(eager: bool) -> None:
)
out, _ = _filter_and_collect(schema, df, cast=True, eager=eager)
assert out.get_column("a").is_sorted()


@pytest.mark.parametrize("eager", [True, False])
def test_filter_details(eager: bool) -> None:
    """Check the per-rule details of the failure info for two rows sharing ``a``."""
    frame = pl.DataFrame({"a": [2, 2], "b": ["bar", "foobar"]})
    _, failure_info = _filter_and_collect(MySchema, frame, cast=True, eager=eager)

    # Outcomes that are identical for both failing rows.
    shared_outcomes = {
        "a": 2,
        "a|dtype": "valid",
        "a|nullability": "valid",
        "b|dtype": "valid",
        "primary_key": "invalid",
    }
    # The rows differ only in `b` and in the outcome of the `b|max_length` rule.
    expected_rows = [
        {**shared_outcomes, "b": "bar", "b|max_length": "valid"},
        {**shared_outcomes, "b": "foobar", "b|max_length": "invalid"},
    ]

    actual_rows = failure_info.details().to_dicts()
    assert actual_rows == expected_rows
Loading