@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def is_title_case(self, other_value: dict):
    """
    Checks if target column values are in proper title case.

    A value passes when, after stripping surrounding whitespace, it is
    identical to the text produced by ``titlecase`` with two project rules
    applied on top of the library defaults:

    * words listed in ``DatasetTitleCase.Lowercase_Exceptions`` are expected
      in lowercase;
    * words listed in ``DatasetTitleCase.Acronyms`` are expected in uppercase.

    The first character of the expected text is always uppercased, so a
    lowercase exception in leading position still starts with a capital.

    Missing values, empty strings, whitespace-only strings and entries of
    ``NULL_FLAVORS`` are treated as valid because there is no text to judge.

    :param other_value: operator parameters; the ``"target"`` key names the
        column to check.
    :return: the per-row boolean results wrapped by
        ``self.value.convert_to_series`` (True where the value is title case).
    """
    target = other_value.get("target")
    # Normalise both word lists once; the callback runs for every word of
    # every row, so per-word re-normalisation is wasted work.
    lowercase_exceptions = {
        word.lower() for word in DatasetTitleCase.Lowercase_Exceptions.value
    }
    acronyms = {acronym.upper() for acronym in DatasetTitleCase.Acronyms.value}

    def acronym_callback(word, **kwargs):
        # titlecase passes extra keyword arguments (absorbed by **kwargs).
        # Returning None hands the word back to the library's default rules.
        lowered = word.lower()
        if lowered in lowercase_exceptions:
            return lowered
        uppered = word.upper()
        if uppered in acronyms:
            return uppered
        return None

    def check_title_case(value):
        if pd.isna(value) or value == "" or value in NULL_FLAVORS:
            return True
        str_value = str(value).strip()
        if not str_value:
            # Whitespace-only input collapses to "" here; treat it like an
            # empty value instead of failing on the first-character access.
            return True
        expected = titlecase(str_value, callback=acronym_callback)
        # Force a leading capital even when the first word is a lowercase
        # exception; slicing keeps this safe for any length.
        expected = expected[:1].upper() + expected[1:]
        return str_value == expected

    results = self.value[target].apply(check_title_case)
    return self.value.convert_to_series(results)


@log_operator_execution
@type_operator(FIELD_DATAFRAME)
def is_not_title_case(self, other_value: dict):
    """
    Checks if target column values are NOT in proper title case.

    Element-wise negation of ``is_title_case`` for the same parameters.
    """
    return ~self.is_title_case(other_value)
+ "properties": { "operator": { "const": "is_title_case" } }, + "required": ["operator"], + "type": "object" + }, + { + "properties": { "operator": { "const": "is_not_title_case" } }, + "required": ["operator"], + "type": "object" } ], "properties": { diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index 1c01837ee..a5ce269bf 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -446,6 +446,29 @@ Complement of `split_parts_have_equal_length`. Returns True when parts have uneq separator: "/" ``` +### is_title_case + +Validates that variable labels follow proper title case formatting rules using the `titlecase` PyPI library. Title case capitalizes the first word and all major words, while keeping articles (a, an, the), conjunctions (and, but, or), and prepositions (in, of, for) in lowercase unless they are the first word. In addition, the acronym `ID` is expected in uppercase, the words `per`, `and/or`, `is`, and `with` are expected in lowercase unless they start the value, leading and trailing whitespace is ignored, and empty or null values are treated as valid. +NOTE: The titlecase library may produce false positives or false negatives in syntactic edge cases (e.g. hyphenated words, slash-separated terms, uncommon prepositions). + +> Check that AELABEL values are in proper title case + +```yaml +- name: AELABEL + operator: is_title_case +``` + +### is_not_title_case + +Complement of `is_title_case`. Returns True when values are NOT in proper title case. + +> Flag AELABEL values that violate title case rules + +```yaml +- name: AELABEL + operator: is_not_title_case +``` + ## Date Date and time specific operations for comparing dates, validating date completeness, checking date formats, and validating ISO-8601 durations. 
"""Unit tests for the ``is_title_case`` / ``is_not_title_case`` operators."""
import pytest
from cdisc_rules_engine.check_operators.dataframe_operators import DataframeType
from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset
from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset


def _assert_operator_result(operator_name, data, dataset_type, expected_result):
    """Run ``operator_name`` on column "target" and compare with the expectation.

    Every test in this module performs the same four steps: build the dataset,
    wrap it in ``DataframeType``, call the operator, and compare the result
    with ``expected_result``. They are kept in one place here.
    """
    df = dataset_type.from_dict(data)
    dataframe_type = DataframeType({"value": df})
    result = getattr(dataframe_type, operator_name)({"target": "target"})
    assert result.equals(df.convert_to_series(expected_result))


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {"target": ["Subject ID", "Adverse Event", "Date of First Dose"]},
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": ["SUBJECT ID", "ADVERSE EVENT", "DATE OF FIRST DOSE"]},
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["subject id", "adverse event", "date of first dose"]},
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["Subject ID", "ADVERSE EVENT", "date of first dose"]},
            PandasDataset,
            [True, False, False],
        ),
        (
            {"target": ["Subject ID", "Adverse Event", "Date of First Dose"]},
            DaskDataset,
            [True, True, True],
        ),
        (
            {"target": ["SUBJECT ID", "ADVERSE EVENT"]},
            DaskDataset,
            [False, False],
        ),
    ],
)
def test_is_title_case_basic(data, dataset_type, expected_result):
    """Title-cased labels pass; all-uppercase and all-lowercase labels fail."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {"target": ["Subject ID", "Study ID", "Patient ID"]},
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": ["Subject Id", "Study id", "Patient Id"]},
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["Unique Subject ID", " Primary Study ID"]},
            PandasDataset,
            [True, True],
        ),
        (
            {"target": ["Subject ID", " Study ID"]},
            DaskDataset,
            [True, True],
        ),
    ],
)
def test_is_title_case_id_acronym(data, dataset_type, expected_result):
    """The acronym "ID" must be fully uppercase to pass."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {
                "target": [
                    "Date of First Dose",
                    "End of Study",
                    "Reason for No Treatment",
                ]
            },
            PandasDataset,
            [True, True, True],
        ),
        (
            {
                "target": [
                    "Date Of First Dose",
                    "End Of Study",
                    "Reason For No Treatment",
                ]
            },
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["Of the Study", "For No Reason"]},
            PandasDataset,
            [True, True],
        ),
        (
            {"target": ["Study Of The Year", "Reason For The Record"]},
            PandasDataset,
            [False, False],
        ),
        (
            {"target": ["Date of First Dose", "End of Study"]},
            DaskDataset,
            [True, True],
        ),
    ],
)
def test_is_title_case_small_words(data, dataset_type, expected_result):
    """Small words stay lowercase mid-label but are capitalised when first."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {"target": ["Epi/Pandemic", "Date/Time", "Start/End"]},
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": ["epi/pandemic", "date/time", "start/end"]},
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["EPI/PANDEMIC", "DATE/TIME", "START/END"]},
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["Subject ID/Number", "Study ID/Code"]},
            PandasDataset,
            [True, True],
        ),
        (
            {"target": ["Epi/Pandemic Pre-Specified"]},
            PandasDataset,
            [True],
        ),
        (
            {"target": ["epi/pandemic pre-specified"]},
            PandasDataset,
            [False],
        ),
        (
            {"target": ["EPI/PANDEMIC PRE-SPECIFIED"]},
            PandasDataset,
            [False],
        ),
        (
            {"target": ["Epi/Pandemic", "Date/Time"]},
            DaskDataset,
            [True, True],
        ),
    ],
)
def test_is_title_case_slash_separated(data, dataset_type, expected_result):
    """Each side of a slash-separated term must be capitalised."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {"target": ["Pre-Specified", "Post-Treatment", "Non-Serious"]},
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": ["pre-specified", "post-treatment", "non-serious"]},
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["PRE-SPECIFIED", "POST-TREATMENT", "NON-SERIOUS"]},
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["Pre-Specified Event", "Post-Treatment Assessment"]},
            PandasDataset,
            [True, True],
        ),
        (
            {"target": ["Pre-Specified", "Post-Treatment"]},
            DaskDataset,
            [True, True],
        ),
    ],
)
def test_is_title_case_hyphenated(data, dataset_type, expected_result):
    """Both parts of a hyphenated word must be capitalised."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {"target": ["", "", ""]},
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": [None, None, None]},
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": ["Subject ID", "", None, "ADVERSE EVENT"]},
            PandasDataset,
            [True, True, True, False],
        ),
        (
            {"target": ["", None]},
            DaskDataset,
            [True, True],
        ),
    ],
)
def test_is_title_case_null_empty(data, dataset_type, expected_result):
    """Empty strings and None are reported as valid."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {
                "target": [
                    "Subject ID",
                    "Adverse Event ID",
                    "Date of First Dose",
                    "Reason for No Treatment",
                    "Pre-Specified Event",
                    "Epi/Pandemic Study",
                ]
            },
            PandasDataset,
            [True, True, True, True, True, True],
        ),
        (
            {
                "target": [
                    "subject id",
                    "ADVERSE EVENT ID",
                    "Date Of First Dose",
                    "reason for no treatment",
                    "PRE-SPECIFIED EVENT",
                    "epi/pandemic study",
                ]
            },
            PandasDataset,
            [False, False, False, False, False, False],
        ),
        (
            {
                "target": [
                    "Subject ID",
                    "adverse event id",
                    "Date of First Dose",
                    "REASON FOR NO TREATMENT",
                ]
            },
            PandasDataset,
            [True, False, True, False],
        ),
        (
            {
                "target": [
                    "Subject ID",
                    "Date of First Dose",
                    "Pre-Specified Event",
                ]
            },
            DaskDataset,
            [True, True, True],
        ),
    ],
)
def test_is_title_case_complex(data, dataset_type, expected_result):
    """Mixed columns combining acronyms, small words, hyphens and slashes."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {"target": ["Subject ID", "Adverse Event", "Date of First Dose"]},
            PandasDataset,
            [False, False, False],
        ),
        (
            {"target": ["SUBJECT ID", "adverse event", "Date Of First Dose"]},
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": ["Subject ID", "ADVERSE EVENT", "Date of First Dose"]},
            PandasDataset,
            [False, True, False],
        ),
        (
            {"target": ["Subject ID", "ADVERSE EVENT"]},
            DaskDataset,
            [False, True],
        ),
    ],
)
def test_is_not_title_case(data, dataset_type, expected_result):
    """The complement operator flags exactly the non-title-case rows."""
    _assert_operator_result("is_not_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {"target": [123, 456, 789]},
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": ["Subject ID", 123, None, "ADVERSE EVENT"]},
            PandasDataset,
            [True, True, True, False],
        ),
    ],
)
def test_is_title_case_numeric_values(data, dataset_type, expected_result):
    """Numeric cells are reported as valid."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)


@pytest.mark.parametrize(
    "data,dataset_type,expected_result",
    [
        (
            {
                "target": [
                    " Subject ID ",
                    " Adverse Event ",
                    " Date of First Dose ",
                ]
            },
            PandasDataset,
            [True, True, True],
        ),
        (
            {
                "target": [
                    " SUBJECT ID ",
                    " adverse event ",
                    " DATE OF FIRST DOSE ",
                ]
            },
            PandasDataset,
            [False, False, False],
        ),
        (
            {
                "target": [
                    "\tSubject ID\t",
                    "\tAdverse Event\t",
                    "\tDate of First Dose\t",
                ]
            },
            PandasDataset,
            [True, True, True],
        ),
        (
            {"target": ["\tSUBJECT ID\t", "\tadverse event\t"]},
            PandasDataset,
            [False, False],
        ),
        (
            {"target": ["\t Subject ID \t", " \tAdverse Event\t "]},
            PandasDataset,
            [True, True],
        ),
        (
            {
                "target": [
                    " Subject ID ",
                    "\tADVERSE EVENT\t",
                    " Date of First Dose ",
                ]
            },
            PandasDataset,
            [True, False, True],
        ),
        (
            {"target": [" Subject ID ", "\tAdverse Event\t"]},
            DaskDataset,
            [True, True],
        ),
    ],
)
def test_is_title_case_with_whitespace(data, dataset_type, expected_result):
    """Leading/trailing spaces and tabs do not affect the verdict."""
    _assert_operator_result("is_title_case", data, dataset_type, expected_result)