diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index e7cbee0d0..c2ae1bf80 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -1152,6 +1152,9 @@ def is_complete_date(self, other_value): def is_inconsistent_across_dataset(self, other_value): target = other_value.get("target") comparator = other_value.get("comparator") + regex = other_value.get("regex") + if isinstance(regex, list) and regex: + regex = regex[0] grouping_cols = [] if isinstance(comparator, str): if comparator in self.value.columns: @@ -1162,6 +1165,19 @@ def is_inconsistent_across_dataset(self, other_value): grouping_cols.append(col) df_check = self.value[grouping_cols + [target]].copy() df_check = df_check.fillna("_NaN_") + if regex: + pattern = re.compile(regex) + if pattern.groups == 0: + regex = f"({regex})" + # expand=True always yields a DataFrame; keep only the first capture group + groups = df_check[target].astype(str).str.extract(regex, expand=True) + extracted = groups.iloc[:, 0] + df_check[target] = extracted.fillna(df_check[target]) + results = self._check_inconsistency(df_check, grouping_cols, target) + return results + + @staticmethod + def _check_inconsistency(df_check, grouping_cols: list[Any], target): results = pd.Series(False, index=df_check.index) for name, group in df_check.groupby(grouping_cols, dropna=False): if group[target].nunique() > 1: diff --git a/resources/schema/rule-merged/MetaVariables.json b/resources/schema/rule-merged/MetaVariables.json index 5af22ca7f..0180bfeb4 100644 --- a/resources/schema/rule-merged/MetaVariables.json +++ b/resources/schema/rule-merged/MetaVariables.json @@ -198,6 +198,14 @@ "const": "library_variable_core", "markdownDescription": "\ncore attribute of a variable from the CDISC Library\n" }, + { + "const": "library_variable_has_codelist", + "markdownDescription": "\nIndicates whether a variable has an associated codelist in the CDISC Library\n" + }, + { + "const": 
"library_variable_ccode", + "markdownDescription": "\nccode attribute of a variable from the CDISC Library\n" + }, { "const": "library_variable_data_type", "markdownDescription": "\nsimpleDatatype attribute of a variable from the CDISC Library\n" diff --git a/resources/schema/rule-merged/Operator.json b/resources/schema/rule-merged/Operator.json index 34dcf20a3..80c62ac56 100644 --- a/resources/schema/rule-merged/Operator.json +++ b/resources/schema/rule-merged/Operator.json @@ -462,7 +462,7 @@ "properties": { "operator": { "const": "is_inconsistent_across_dataset", - "markdownDescription": "\nChecks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists \u2014 i.e., two or more values are tied for the highest frequency \u2014 all records in that group are flagged.\n\nSingle grouping variable - true if the values of BGSTRESU differ within USUBJID:\n\n```yaml\n- name: \"BGSTRESU\"\n operator: is_inconsistent_across_dataset\n value: \"USUBJID\"\n```\n\nMultiple grouping variables - true if the values of --STRESU differ within each combination of --TESTCD, --CAT, --SCAT, --SPEC, and --METHOD:\n\n```yaml\n- name: \"--STRESU\"\n operator: is_inconsistent_across_dataset\n value:\n - \"--TESTCD\"\n - \"--CAT\"\n - \"--SCAT\"\n - \"--SPEC\"\n - \"--METHOD\"\n```\n" + "markdownDescription": "\nChecks if a variable maintains consistent values within groups defined by one or more grouping variables. 
Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists \u2014 i.e., two or more values are tied for the highest frequency \u2014 all records in that group are flagged.\n\nSingle grouping variable - true if the values of BGSTRESU differ within USUBJID:\n\nIf a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value).\n\n- regex is optional.\n- If the pattern has no capture group, the whole pattern is wrapped in one, so the entire match is used for comparison.\n- Only the first capture group is used for comparison.\n- If the pattern does not match a value, the original value is used.\n\n```yaml\n- name: \"BGSTRESU\"\n operator: is_inconsistent_across_dataset\n value: \"USUBJID\"\n```\n\nMultiple grouping variables - true if the values of --STRESU differ within each combination of --TESTCD, --CAT, --SCAT, --SPEC, and --METHOD:\n\n```yaml\n- name: \"--STRESU\"\n operator: is_inconsistent_across_dataset\n value:\n - \"--TESTCD\"\n - \"--CAT\"\n - \"--SCAT\"\n - \"--SPEC\"\n - \"--METHOD\"\n```\n" } }, "required": ["operator", "value"], diff --git a/resources/schema/rule-merged/Rule_Type.json b/resources/schema/rule-merged/Rule_Type.json index ee910a5cb..b26257061 100644 --- a/resources/schema/rule-merged/Rule_Type.json +++ b/resources/schema/rule-merged/Rule_Type.json @@ -20,7 +20,7 @@ { "const": "Define Item Metadata Check against Library 
Metadata", "title": "Define xml metadata at variable level and corresponding library variable metadata", - "markdownDescription": "\n#### Columns\n\n- `define_variable_name`\n- `define_variable_label`\n- `define_variable_data_type`\n- `define_variable_role`\n- `define_variable_size`\n- `define_variable_ccode`\n- `define_variable_format`\n- `define_variable_allowed_terms`\n- `define_variable_origin_type`\n- `define_variable_is_collected`\n- `define_variable_has_no_data`\n- `define_variable_order_number`\n- `define_variable_has_codelist`\n- `define_variable_codelist_coded_values`\n- `define_variable_codelist_coded_codes`\n- `define_variable_mandatory`\n- `define_variable_has_comment`\n- `library_variable_name`\n- `library_variable_order_number`\n- `library_variable_label`\n- `library_variable_data_type`\n- `library_variable_role`\n- `library_variable_core`\n- `library_variable_ccode`\n\n#### Rule Macro\n\nChecks variable-level metadata, codelists, and codelist terms from Define-XML against the corresponding standard variable definitions from the CDISC Library.\n" + "markdownDescription": "\n#### Columns\n\n- `define_variable_name`\n- `define_variable_label`\n- `define_variable_data_type`\n- `define_variable_role`\n- `define_variable_size`\n- `define_variable_ccode`\n- `define_variable_format`\n- `define_variable_allowed_terms`\n- `define_variable_origin_type`\n- `define_variable_is_collected`\n- `define_variable_has_no_data`\n- `define_variable_order_number`\n- `define_variable_has_codelist`\n- `define_variable_codelist_coded_values`\n- `define_variable_codelist_coded_codes`\n- `define_variable_mandatory`\n- `define_variable_has_comment`\n- `library_variable_name`\n- `library_variable_order_number`\n- `library_variable_label`\n- `library_variable_data_type`\n- `library_variable_role`\n- `library_variable_core`\n- `library_variable_has_codelist`\n- `library_variable_ccode`\n\n#### Rule Macro\n\nChecks variable-level metadata, codelists, and codelist terms from 
Define-XML against the corresponding standard variable definitions from the CDISC Library.\n" }, { "const": "Domain Presence Check", diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index 0405b25d5..2ba4b3015 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -996,6 +996,13 @@ Checks if a variable maintains consistent values within groups defined by one or Single grouping variable - true if the values of BGSTRESU differ within USUBJID: +If a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value). + +- regex is optional. +- If the pattern has no capture group, the whole pattern is wrapped in one, so the entire match is used for comparison. +- Only the first capture group is used for comparison. +- If the pattern does not match a value, the original value is used. 
+ ```yaml - name: "BGSTRESU" operator: is_inconsistent_across_dataset diff --git a/tests/unit/test_check_operators/test_value_set_checks.py b/tests/unit/test_check_operators/test_value_set_checks.py index 638605083..6f866f2ca 100644 --- a/tests/unit/test_check_operators/test_value_set_checks.py +++ b/tests/unit/test_check_operators/test_value_set_checks.py @@ -244,6 +244,63 @@ def test_is_inconsistent_across_dataset( assert result.equals(df.convert_to_series(expected_result)) +@pytest.mark.parametrize( + "values,regex,expected", + [ + # regex disabled + (["A", "B"], None, [True, True]), + (["A", "B"], "", [True, True]), + # regex collapsing values + (["TEST_v1", "TEST_v2"], r"^(TEST)", [False, False]), + (["ABC123", "XYZ123"], r"(\d+)", [False, False]), + (["HEIGHT_cm", "HEIGHT_mm"], r"^(HEIGHT)", [False, False]), + # datetime normalization + ( + ["2014-09-30T11:09", "2014-09-30T11:07"], + r"^(\d{4}-\d{2}-\d{2})", + [False, False], + ), + (["TEST_A", "TEST_B"], r"^(TEST_[A-Z])", [True, True]), + (["SUBJ-001", "SUBJ-002"], r"SUBJ-(\d+)", [True, True]), + ( + ["2014-09-30T11:09", "2014-09-29T11:07"], + r"^(\d{4}-\d{2}-\d{2})", + [True, True], + ), + # edge cases: no capture group, non-matching values, nulls, non-string values + (["ABC", "DEF"], r"^XYZ", [True, True]), + (["TEST_v1", "CONTROL"], r"^(TEST)", [True, True]), + (["A", "B"], r"(.*)", [True, True]), + (["A", None], r"(A)", [True, True]), + ([None, None], r"(.*)", [False, False]), + ([1, 1], r"(\d+)", [False, False]), + ], +) +def test_is_inconsistent_across_dataset_regex(values, regex, expected): + df = pd.DataFrame( + { + "VISIT": ["WEEK1"] * len(values), + "EPOCH": ["TREATMENT"] * len(values), + "VALUE": values, + } + ) + + other_value = { + "target": "VALUE", + "comparator": ["VISIT", "EPOCH"], + "regex": regex, + } + + obj = DataframeType( + { + "value": df, + } + ) + result = obj.is_inconsistent_across_dataset(other_value) + + assert result.tolist() == expected + + @pytest.mark.parametrize( "target, comparator, dataset_type, expected_result", [