Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 40 additions & 3 deletions cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@
apply_rounding,
is_in,
)

from cdisc_rules_engine.enums.dataset_title_case import DatasetTitleCase
from cdisc_rules_engine.constants import NULL_FLAVORS
from cdisc_rules_engine.utilities.utils import dates_overlap, parse_date
import numpy as np
import dask.dataframe as dd
import pandas as pd
import re
import operator
from titlecase import titlecase
from uuid import uuid4
from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset
from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface
Expand Down Expand Up @@ -1073,7 +1074,7 @@ def not_contains_all(self, other_value: dict):
# Builds a per-row result by inverting (`~`) the output of vectorized_is_valid
# for the `target` column, then hands it to convert_to_series.
# (presumably "valid" means a parseable date — confirm against vectorized_is_valid)
@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def invalid_date(self, other_value):
# NOTE(review): `target` is assigned twice in a row, so the second (un-prefixed)
# lookup overwrites the first. This looks like the removed/added pair of a diff
# hunk — confirm which single line is meant to remain.
target = self.replace_prefix(other_value.get("target"))
target = other_value.get("target")
results = ~vectorized_is_valid(self.value[target])
return self.value.convert_to_series(results)

Expand Down Expand Up @@ -1140,7 +1141,7 @@ def is_incomplete_date(self, other_value):
# Builds a per-row result from vectorized_is_complete_date applied to the
# `target` column, then hands it to convert_to_series.
@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def is_complete_date(self, other_value):
# NOTE(review): `target` is assigned twice in a row, so the second (un-prefixed)
# lookup overwrites the first. This looks like the removed/added pair of a diff
# hunk — confirm which single line is meant to remain.
target = self.replace_prefix(other_value.get("target"))
target = other_value.get("target")
results = vectorized_is_complete_date(self.value[target])
return self.value.convert_to_series(results)

Expand Down Expand Up @@ -1944,3 +1945,39 @@ def check_order(row):
@type_operator(FIELD_DATAFRAME)
def is_not_ordered_subset_of(self, other_value: dict):
return ~self.is_ordered_subset_of(other_value)

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def is_title_case(self, other_value: dict):
    """
    Checks if target column values are in proper title case.

    Each value is converted to ``str``, stripped of surrounding whitespace and
    compared with the output of ``titlecase`` for that same string, with the
    first character of the expected text forced to upper case. Words listed in
    ``DatasetTitleCase.Lowercase_Exceptions`` are expected in lower case and
    words listed in ``DatasetTitleCase.Acronyms`` in upper case.

    Missing values, empty strings, whitespace-only strings and values found
    in ``NULL_FLAVORS`` are reported as valid (True).

    :param other_value: dict whose "target" key names the column to check.
    :return: the per-value boolean results passed through
        ``self.value.convert_to_series``.
    """
    target = other_value.get("target")
    # Normalise the configured word lists once, rather than on every
    # callback invocation.
    acronyms = {acr.upper() for acr in DatasetTitleCase.Acronyms.value}
    lowercase_exceptions = set(DatasetTitleCase.Lowercase_Exceptions.value)

    def acronym_callback(word, **kwargs):
        # Returning None tells titlecase to apply its default handling.
        lowered = word.lower()
        if lowered in lowercase_exceptions:
            return lowered
        uppered = word.upper()
        if uppered in acronyms:
            return uppered
        return None

    def check_title_case(value):
        if pd.isna(value) or value == "" or value in NULL_FLAVORS:
            return True
        str_value = str(value).strip()
        if not str_value:
            # Whitespace-only input: treat like an empty value instead of
            # failing on the first-character lookup below.
            return True
        expected = titlecase(str_value, callback=acronym_callback)
        # Slice instead of index so an empty result can never raise.
        expected = expected[:1].upper() + expected[1:]
        return str_value == expected

    results = self.value[target].apply(check_title_case)
    return self.value.convert_to_series(results)

@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def is_not_title_case(self, other_value: dict):
    """
    Complement of is_title_case: inverts its result so that values which
    are NOT in proper title case are flagged.
    """
    title_case_result = self.is_title_case(other_value)
    return ~title_case_result
6 changes: 6 additions & 0 deletions cdisc_rules_engine/enums/dataset_title_case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from cdisc_rules_engine.enums.base_enum import BaseEnum


class DatasetTitleCase(BaseEnum):
    """Word lists consulted when checking values for title case."""

    # Words expected fully upper-cased.
    Acronyms = ["ID"]
    # Words expected fully lower-cased.
    Lowercase_Exceptions = [
        "per",
        "and/or",
        "is",
        "with",
    ]
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,3 +23,4 @@ pyyaml==6.0.2
redis==4.5.0
requests~=2.32.3
setuptools~=75.6.0
titlecase==2.4.1
10 changes: 10 additions & 0 deletions resources/schema/rule/Operator.json
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,16 @@
"properties": { "operator": { "const": "is_not_ordered_subset_of" } },
"required": ["operator", "value"],
"type": "object"
},
{
"properties": { "operator": { "const": "is_title_case" } },
"required": ["operator"],
"type": "object"
},
{
"properties": { "operator": { "const": "is_not_title_case" } },
"required": ["operator"],
"type": "object"
}
],
"properties": {
Expand Down
23 changes: 23 additions & 0 deletions resources/schema/rule/Operator.md
Original file line number Diff line number Diff line change
Expand Up @@ -446,6 +446,29 @@ Complement of `split_parts_have_equal_length`. Returns True when parts have uneq
separator: "/"
```

### is_title_case

Validates that variable labels follow proper title case formatting rules using the `titlecase` PyPI library. Title case capitalizes the first word and all major words, while keeping articles (a, an, the), conjunctions (and, but, or), and prepositions (in, of, for) in lowercase unless they are the first word.

NOTE: The `titlecase` library may produce false positives or false negatives in syntactic edge cases (e.g. hyphenated words, slash-separated terms, uncommon prepositions).

> Check that AELABEL values are in proper title case

```yaml
- name: AELABEL
operator: is_title_case
```

### is_not_title_case

Complement of `is_title_case`. Returns True when values are NOT in proper title case.

> Flag AELABEL values that violate title case rules

```yaml
- name: AELABEL
operator: is_not_title_case
```

## Date

Date and time specific operations for comparing dates, validating date completeness, checking date formats, and validating ISO-8601 durations.
Expand Down
Loading
Loading