From 189ad57e28e911af214a38b927910cce89d1a9ff Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 10 Mar 2026 15:49:51 +0100 Subject: [PATCH 1/7] #1654 added regex to is_inconsistent_across_dataset operation --- .../check_operators/dataframe_operators.py | 4 ++ .../test_value_set_checks.py | 41 +++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index f28ff69b7..01480bffd 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -1152,6 +1152,7 @@ def is_complete_date(self, other_value): def is_inconsistent_across_dataset(self, other_value): target = other_value.get("target") comparator = other_value.get("comparator") + regex = other_value.get("regex") grouping_cols = [] if isinstance(comparator, str): if comparator in self.value.columns: @@ -1162,6 +1163,9 @@ def is_inconsistent_across_dataset(self, other_value): grouping_cols.append(col) df_check = self.value[grouping_cols + [target]].copy() df_check = df_check.fillna("_NaN_") + if regex: + extracted = df_check[target].astype(str).str.extract(regex, expand=False) + df_check[target] = extracted.fillna(df_check[target]) results = pd.Series(False, index=df_check.index) for name, group in df_check.groupby(grouping_cols, dropna=False): if group[target].nunique() > 1: diff --git a/tests/unit/test_check_operators/test_value_set_checks.py b/tests/unit/test_check_operators/test_value_set_checks.py index 638605083..f7a46b99c 100644 --- a/tests/unit/test_check_operators/test_value_set_checks.py +++ b/tests/unit/test_check_operators/test_value_set_checks.py @@ -244,6 +244,47 @@ def test_is_inconsistent_across_dataset( assert result.equals(df.convert_to_series(expected_result)) +def test_is_inconsistent_across_dataset_regex(): + data = { + "VISIT": [ + "SCREENING 1", + "SCREENING 1", + "BASELINE", + "BASELINE", + "BASELINE", + "WEEK 1", + "WEEK 1", + ], + "EPOCH": [ + "SCREENING", + "SCREENING", + "SCREENING", + "SCREENING", + "SCREENING", + "TREATMENT", + "TREATMENT", + ], + "VSDTC": [ + "2012-11-23", + "2012-11-28", + "2012-11-30", + "2012-11-30", + "2012-11-30", + "2014-09-30T11:09", + "2014-09-30T11:07", + ], + } + df = PandasDataset.from_dict(data) + result = DataframeType( + {"value": df, "column_prefix_map": {"--": ""}} + ).is_inconsistent_across_dataset( + {"target": "VSDTC", "comparator": "VISIT", "regex": r"^(\d{4}-\d{2}-\d{2})"} + ) + assert result.equals( + df.convert_to_series([True, True, False, False, False, False, False]) + ) + + @pytest.mark.parametrize( "target, comparator, dataset_type, expected_result", [ From 6f963d36258d7e6fee5124d88977ce6fc9932e94 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 11 Mar 2026 15:19:44 +0100 Subject: [PATCH 2/7] #1654 additional tests for regexp in is_inconsistent_across_dataset operator --- .../check_operators/dataframe_operators.py | 3 + .../test_value_set_checks.py | 90 +++++++++++-------- 2 files changed, 56 insertions(+), 37 deletions(-) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 01480bffd..550bc6f57 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -1164,6 +1164,9 @@ def is_inconsistent_across_dataset(self, other_value): df_check = self.value[grouping_cols + [target]].copy() df_check = df_check.fillna("_NaN_") if regex: + pattern = re.compile(regex) + if pattern.groups == 0: + regex = f"({regex})" extracted = df_check[target].astype(str).str.extract(regex, expand=False) df_check[target] = extracted.fillna(df_check[target]) results = pd.Series(False, index=df_check.index) diff --git a/tests/unit/test_check_operators/test_value_set_checks.py b/tests/unit/test_check_operators/test_value_set_checks.py index f7a46b99c..6f866f2ca 100644 --- a/tests/unit/test_check_operators/test_value_set_checks.py +++ b/tests/unit/test_check_operators/test_value_set_checks.py @@ -244,45 +244,61 @@ def test_is_inconsistent_across_dataset( assert result.equals(df.convert_to_series(expected_result)) -def test_is_inconsistent_across_dataset_regex(): - data = { - "VISIT": [ - "SCREENING 1", - "SCREENING 1", - "BASELINE", - "BASELINE", - "BASELINE", - "WEEK 1", - "WEEK 1", - ], - "EPOCH": [ - "SCREENING", - "SCREENING", - "SCREENING", - "SCREENING", - "SCREENING", - "TREATMENT", - "TREATMENT", - ], - "VSDTC": [ - "2012-11-23", - "2012-11-28", - "2012-11-30", - "2012-11-30", - "2012-11-30", - "2014-09-30T11:09", - "2014-09-30T11:07", - ], - } - df = PandasDataset.from_dict(data) - result = DataframeType( - {"value": df, "column_prefix_map": {"--": ""}} - ).is_inconsistent_across_dataset( - {"target": "VSDTC", "comparator": "VISIT", "regex": r"^(\d{4}-\d{2}-\d{2})"} +@pytest.mark.parametrize( + "values,regex,expected", + [ + # regex disabled + (["A", "B"], None, [True, True]), + (["A", "B"], "", [True, True]), + # regex collapsing values + (["TEST_v1", "TEST_v2"], r"^(TEST)", [False, False]), + (["ABC123", "XYZ123"], r"(\d+)", [False, False]), + (["HEIGHT_cm", "HEIGHT_mm"], r"^(HEIGHT)", [False, False]), + # datetime normalization + ( + ["2014-09-30T11:09", "2014-09-30T11:07"], + r"^(\d{4}-\d{2}-\d{2})", + [False, False], + ), + (["TEST_A", "TEST_B"], r"^(TEST_[A-Z])", [True, True]), + (["SUBJ-001", "SUBJ-002"], r"SUBJ-(\d+)", [True, True]), + ( + ["2014-09-30T11:09", "2014-09-29T11:07"], + r"^(\d{4}-\d{2}-\d{2})", + [True, True], + ), + # regex no capture group + (["ABC", "DEF"], r"^XYZ", [True, True]), + (["TEST_v1", "CONTROL"], r"^(TEST)", [True, True]), + (["A", "B"], r"(.*)", [True, True]), + (["A", None], r"(A)", [True, True]), + ([None, None], r"(.*)", [False, False]), + ([1, 1], r"(\d+)", [False, False]), + ], +) +def test_is_inconsistent_across_dataset_regex(values, regex, expected): + df = pd.DataFrame( + { + "VISIT": ["WEEK1"] * len(values), + "EPOCH": ["TREATMENT"] * len(values), + "VALUE": values, + } ) - assert result.equals( - df.convert_to_series([True, True, False, False, False, False, False]) + + other_value = { + "target": "VALUE", + "comparator": ["VISIT", "EPOCH"], + "regex": regex, + } + + obj = DataframeType( + { + "value": df, + } ) + result = obj.is_inconsistent_across_dataset(other_value) + + assert result.tolist() == expected @pytest.mark.parametrize( From da36f8f00e6b5771614d720de3f30120afe90f7e Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 11 Mar 2026 15:27:59 +0100 Subject: [PATCH 3/7] #1654 added info about regex in is_inconsistent_across_dataset --- resources/schema/rule/Operator.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index 5233ce5cb..13763c4ab 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -996,6 +996,12 @@ Checks if a variable maintains consistent values within groups defined by one or Single grouping variable - true if the values of BGSTRESU differ within USUBJID: +If a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value). +- regex is optional. +- The pattern must include at least one capture group(or whole regex will be wrapped to capture group). +- Only the first capture group is used for comparison. +- If the pattern does not match a value, the original value is used. + ```yaml - name: "BGSTRESU" operator: is_inconsistent_across_dataset From 2345bb8ea7bc64ad39f0141fc6e45bb02f56b36e Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 11 Mar 2026 16:01:34 +0100 Subject: [PATCH 4/7] #1654 Operator.md prettier fix --- resources/schema/rule/Operator.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index 13763c4ab..d1d720a5e 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -992,7 +992,7 @@ Checking for consistent values across groups and validating that variables maint ### is_inconsistent_across_dataset -Checks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists — i.e., two or more values are tied for the highest frequency — all records in that group are flagged. +Checks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists — i.e., two or more values are tied for the highest frequency — all records in that group are flagged. Single grouping variable - true if the values of BGSTRESU differ within USUBJID: From 1c1dad9846253cdfd4f4bbca8e02f9f842c6030a Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Mon, 16 Mar 2026 12:28:24 +0100 Subject: [PATCH 5/7] #1654 Operator.md prettier --- resources/schema/rule/Operator.md | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index 2958633bd..2ba4b3015 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -997,6 +997,7 @@ Checks if a variable maintains consistent values within groups defined by one or Single grouping variable - true if the values of BGSTRESU differ within USUBJID: If a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value). + - regex is optional. - The pattern must include at least one capture group(or whole regex will be wrapped to capture group). - Only the first capture group is used for comparison. From 32bb3965916e6c52235e7da2e20e26e6344252ec Mon Sep 17 00:00:00 2001 From: github-actions Date: Mon, 16 Mar 2026 15:03:15 +0000 Subject: [PATCH 6/7] Update merged schema files with markdown descriptions --- resources/schema/rule-merged/MetaVariables.json | 8 ++++++++ resources/schema/rule-merged/Operator.json | 2 +- resources/schema/rule-merged/Rule_Type.json | 2 +- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/resources/schema/rule-merged/MetaVariables.json b/resources/schema/rule-merged/MetaVariables.json index 5af22ca7f..0180bfeb4 100644 --- a/resources/schema/rule-merged/MetaVariables.json +++ b/resources/schema/rule-merged/MetaVariables.json @@ -198,6 +198,14 @@ "const": "library_variable_core", "markdownDescription": "\ncore attribute of a variable from the CDISC Library\n" }, + { + "const": "library_variable_has_codelist", + "markdownDescription": "\nIndicates whether a variable has an associated codelist in the CDISC Library\n" + }, + { + "const": "library_variable_ccode", + "markdownDescription": "\nccode attribute of a variable from the CDISC Library\n" + }, { "const": "library_variable_data_type", "markdownDescription": "\nsimpleDatatype attribute of a variable from the CDISC Library\n" diff --git a/resources/schema/rule-merged/Operator.json b/resources/schema/rule-merged/Operator.json index 34dcf20a3..80c62ac56 100644 --- a/resources/schema/rule-merged/Operator.json +++ b/resources/schema/rule-merged/Operator.json @@ -462,7 +462,7 @@ "properties": { "operator": { "const": "is_inconsistent_across_dataset", - "markdownDescription": "\nChecks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists \u2014 i.e., two or more values are tied for the highest frequency \u2014 all records in that group are flagged.\n\nSingle grouping variable - true if the values of BGSTRESU differ within USUBJID:\n\n```yaml\n- name: \"BGSTRESU\"\n operator: is_inconsistent_across_dataset\n value: \"USUBJID\"\n```\n\nMultiple grouping variables - true if the values of --STRESU differ within each combination of --TESTCD, --CAT, --SCAT, --SPEC, and --METHOD:\n\n```yaml\n- name: \"--STRESU\"\n operator: is_inconsistent_across_dataset\n value:\n - \"--TESTCD\"\n - \"--CAT\"\n - \"--SCAT\"\n - \"--SPEC\"\n - \"--METHOD\"\n```\n" + "markdownDescription": "\nChecks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists \u2014 i.e., two or more values are tied for the highest frequency \u2014 all records in that group are flagged.\n\nSingle grouping variable - true if the values of BGSTRESU differ within USUBJID:\n\nIf a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value).\n\n- regex is optional.\n- The pattern must include at least one capture group(or whole regex will be wrapped to capture group).\n- Only the first capture group is used for comparison.\n- If the pattern does not match a value, the original value is used.\n\n```yaml\n- name: \"BGSTRESU\"\n operator: is_inconsistent_across_dataset\n value: \"USUBJID\"\n```\n\nMultiple grouping variables - true if the values of --STRESU differ within each combination of --TESTCD, --CAT, --SCAT, --SPEC, and --METHOD:\n\n```yaml\n- name: \"--STRESU\"\n operator: is_inconsistent_across_dataset\n value:\n - \"--TESTCD\"\n - \"--CAT\"\n - \"--SCAT\"\n - \"--SPEC\"\n - \"--METHOD\"\n```\n" } }, "required": ["operator", "value"], diff --git a/resources/schema/rule-merged/Rule_Type.json b/resources/schema/rule-merged/Rule_Type.json index ee910a5cb..b26257061 100644 --- a/resources/schema/rule-merged/Rule_Type.json +++ b/resources/schema/rule-merged/Rule_Type.json @@ -20,7 +20,7 @@ { "const": "Define Item Metadata Check against Library Metadata", "title": "Define xml metadata at variable level and corresponding library variable metadata", - "markdownDescription": "\n#### Columns\n\n- `define_variable_name`\n- `define_variable_label`\n- `define_variable_data_type`\n- `define_variable_role`\n- `define_variable_size`\n- `define_variable_ccode`\n- `define_variable_format`\n- `define_variable_allowed_terms`\n- `define_variable_origin_type`\n- `define_variable_is_collected`\n- `define_variable_has_no_data`\n- `define_variable_order_number`\n- `define_variable_has_codelist`\n- `define_variable_codelist_coded_values`\n- `define_variable_codelist_coded_codes`\n- `define_variable_mandatory`\n- `define_variable_has_comment`\n- `library_variable_name`\n- `library_variable_order_number`\n- `library_variable_label`\n- `library_variable_data_type`\n- `library_variable_role`\n- `library_variable_core`\n- `library_variable_ccode`\n\n#### Rule Macro\n\nChecks variable-level metadata, codelists, and codelist terms from Define-XML against the corresponding standard variable definitions from the CDISC Library.\n" + "markdownDescription": "\n#### Columns\n\n- `define_variable_name`\n- `define_variable_label`\n- `define_variable_data_type`\n- `define_variable_role`\n- `define_variable_size`\n- `define_variable_ccode`\n- `define_variable_format`\n- `define_variable_allowed_terms`\n- `define_variable_origin_type`\n- `define_variable_is_collected`\n- `define_variable_has_no_data`\n- `define_variable_order_number`\n- `define_variable_has_codelist`\n- `define_variable_codelist_coded_values`\n- `define_variable_codelist_coded_codes`\n- `define_variable_mandatory`\n- `define_variable_has_comment`\n- `library_variable_name`\n- `library_variable_order_number`\n- `library_variable_label`\n- `library_variable_data_type`\n- `library_variable_role`\n- `library_variable_core`\n- `library_variable_has_codelist`\n- `library_variable_ccode`\n\n#### Rule Macro\n\nChecks variable-level metadata, codelists, and codelist terms from Define-XML against the corresponding standard variable definitions from the CDISC Library.\n" }, { "const": "Domain Presence Check", From 0444f60e05195c7dbc182533c029c161db699160 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Tue, 17 Mar 2026 18:10:08 +0100 Subject: [PATCH 7/7] #1654 fix when regex passed as list(from the rule) --- cdisc_rules_engine/check_operators/dataframe_operators.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index ccfcaacb4..c2ae1bf80 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -1153,6 +1153,8 @@ def is_inconsistent_across_dataset(self, other_value): target = other_value.get("target") comparator = other_value.get("comparator") regex = other_value.get("regex") + if isinstance(regex, list) and regex: + regex = regex[0] grouping_cols = [] if isinstance(comparator, str): if comparator in self.value.columns: @@ -1169,6 +1171,11 @@ def is_inconsistent_across_dataset(self, other_value): regex = f"({regex})" extracted = df_check[target].astype(str).str.extract(regex, expand=False) df_check[target] = extracted.fillna(df_check[target]) + results = self._check_inconsistency(df_check, grouping_cols, target) + return results + + @staticmethod + def _check_inconsistency(df_check, grouping_cols: list[Any], target): results = pd.Series(False, index=df_check.index) for name, group in df_check.groupby(grouping_cols, dropna=False): if group[target].nunique() > 1: