From 0bd47ec62ae8db1b182aac9632e9c63c71606af8 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Mon, 27 Oct 2025 16:15:47 -0400 Subject: [PATCH 1/5] initial --- cdisc_rules_engine/models/operation_params.py | 1 + cdisc_rules_engine/operations/distinct.py | 40 ++++++- .../utilities/rule_processor.py | 9 +- resources/schema/Operations.json | 3 + tests/unit/test_operations/test_distinct.py | 105 ++++++++++++++++++ 5 files changed, 151 insertions(+), 7 deletions(-) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index c5d956261..d3c789dee 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -57,3 +57,4 @@ class OperationParams: original_target: str = None returntype: str = None target: str = None + value_is_reference: bool = False diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py index 6d9717d59..ed226a818 100644 --- a/cdisc_rules_engine/operations/distinct.py +++ b/cdisc_rules_engine/operations/distinct.py @@ -2,25 +2,53 @@ from cdisc_rules_engine.operations.base_operation import BaseOperation +def _get_value_from_reference(row, target_col_name): + ref_col_name = row[target_col_name] + if pd.notna(ref_col_name) and ref_col_name in row.index: + return row[ref_col_name] + return None + + class Distinct(BaseOperation): def _execute_operation(self): result = self.params.dataframe if self.params.filter: result = self._filter_data(result) + value_is_reference = getattr(self.params, "value_is_reference", False) if not self.params.grouping: - data = result[self.params.target].unique() - if isinstance(data[0], bytes): + if value_is_reference: + target = self.params.target + data = result.apply( + lambda row: _get_value_from_reference(row, target), axis=1 + ) + data = data.dropna().unique() + else: + data = result[self.params.target].unique() + if len(data) > 0 and isinstance(data[0], bytes): data = data.astype(str) result = set(data) else: grouped = result.groupby( self.params.grouping, as_index=False, group_keys=False - ).data - if isinstance(result.data, pd.DataFrame): - result = grouped[self.params.target].agg(self._unique_values_for_column) + ) + if value_is_reference: + target = self.params.target + operation_id = self.params.operation_id + + def get_referenced_unique_values(group): + values = group.apply( + lambda row: _get_value_from_reference(row, target), axis=1 + ) + return pd.Series({operation_id: set(values.dropna().unique())}) + + result = grouped.apply(get_referenced_unique_values).reset_index() + elif isinstance(result.data, pd.DataFrame): + result = grouped.data[self.params.target].agg( + self._unique_values_for_column + ) else: result = ( - grouped[self.params.target] + grouped.data[self.params.target] .unique() .rename({self.params.target: self.params.operation_id}) ) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index cf2db9cfe..bf0efe1b3 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -421,6 +421,7 @@ def perform_rule_operations( term_code=operation.get("term_code"), term_value=operation.get("term_value"), term_pref_term=operation.get("term_pref_term"), + value_is_reference=operation.get("value_is_reference", False), ) # execute operation @@ -469,7 +470,13 @@ def _execute_operation( # download other domain domain_details: dict = search_in_list_of_dicts( operation_params.datasets, - lambda item: item.unsplit_name == operation_params.domain, + lambda item: ( + item.unsplit_name == operation_params.domain + or ( + operation_params.domain.endswith("--") + and item.unsplit_name.startswith(operation_params.domain[:-2]) + ) + ), ) if domain_details is None: raise DomainNotFoundError( diff --git a/resources/schema/Operations.json b/resources/schema/Operations.json index 6af7c3b46..76ccac8cc 100644 --- a/resources/schema/Operations.json +++ b/resources/schema/Operations.json @@ -536,6 +536,9 @@ "term_pref_term": { "type": "string" }, + "value_is_reference": { + "type": "boolean" + }, "version": { "type": "string" } diff --git a/tests/unit/test_operations/test_distinct.py b/tests/unit/test_operations/test_distinct.py index 2dfc4ee9b..9c9f9faa8 100644 --- a/tests/unit/test_operations/test_distinct.py +++ b/tests/unit/test_operations/test_distinct.py @@ -176,3 +176,108 @@ def test_filtered_grouped_distinct( assert grouping_column in result for _, val in result.iterrows(): assert val[operation_params.operation_id] == expected.get(val[grouping_column]) + + +@pytest.mark.parametrize( + "data, expected", + [ + ( + PandasDataset.from_dict( + { + "column_ref": ["result", "baseline", "result", "baseline"], + "result": [10, 20, 30, 40], + "baseline": [5, 15, 25, 35], + } + ), + {10, 15, 30, 35}, + ), + ( + DaskDataset.from_dict( + { + "column_ref": ["result", "baseline", "result", "baseline"], + "result": [10, 20, 30, 40], + "baseline": [5, 15, 25, 35], + } + ), + {10, 15, 30, 35}, + ), + ], +) +def test_distinct_value_is_reference(data, expected, operation_params: OperationParams): + config = ConfigService() + cache = CacheServiceFactory(config).get_cache_service() + data_service = DataServiceFactory(config, cache).get_data_service() + operation_params.dataframe = data + operation_params.target = "column_ref" + operation_params.value_is_reference = True + result = Distinct(operation_params, data, cache, data_service).execute() + assert operation_params.operation_id in result + assert len(result[operation_params.operation_id]) > 0 + for val in result[operation_params.operation_id]: + assert val == expected + + +@pytest.mark.parametrize( + "data, expected, grouping_aliases", + [ + ( + PandasDataset.from_dict( + { + "column_ref": [ + "result", + "baseline", + "result", + "baseline", + "result", + "baseline", + ], + "result": [10, 20, 30, 40, 50, 60], + "baseline": [5, 15, 25, 35, 45, 55], + "patient": [1, 1, 2, 2, 1, 2], + "subject": [1, 1, 2, 2, 1, 2], + } + ), + {1: {10, 15, 50}, 2: {30, 35, 55}}, + ["subject"], + ), + ( + DaskDataset.from_dict( + { + "column_ref": [ + "result", + "baseline", + "result", + "baseline", + "result", + "baseline", + ], + "result": [10, 20, 30, 40, 50, 60], + "baseline": [5, 15, 25, 35, 45, 55], + "patient": [1, 1, 2, 2, 1, 2], + "subject": [1, 1, 2, 2, 1, 2], + } + ), + {1: {10, 15, 50}, 2: {30, 35, 55}}, + ["subject"], + ), + ], +) +def test_grouped_distinct_value_is_reference( + data, expected, grouping_aliases, operation_params: OperationParams +): + config = ConfigService() + cache = CacheServiceFactory(config).get_cache_service() + data_service = DataServiceFactory(config, cache).get_data_service() + operation_params.dataframe = data + operation_params.target = "column_ref" + operation_params.value_is_reference = True + operation_params.grouping = ["patient"] + operation_params.grouping_aliases = grouping_aliases + result = Distinct(operation_params, data, cache, data_service).execute() + grouping_column = "".join( + operation_params.grouping_aliases or operation_params.grouping + ) + assert operation_params.operation_id in result + assert grouping_column in result + for _, val in result.iterrows(): + assert val[operation_params.operation_id] == expected.get(val[grouping_column]) From 56303baa64e8304d2966ee4a4542a077432139ac Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 28 Oct 2025 10:30:27 -0400 Subject: [PATCH 2/5] schema --- resources/schema/Operator.json | 45 +++++++--------------------------- 1 file changed, 9 insertions(+), 36 deletions(-) diff --git a/resources/schema/Operator.json b/resources/schema/Operator.json index b6302cbde..88a7024eb 100644 --- a/resources/schema/Operator.json +++ b/resources/schema/Operator.json @@ -205,15 +205,6 @@ "properties": { "operator": { "const": "equal_to" - }, - "round_values": { - "type": "boolean" - }, - "value_is_reference": { - "type": "boolean" - }, - "type_insensitive": { - "type": "boolean" } }, "required": ["operator", "value"], @@ -223,15 +214,6 @@ "properties": { "operator": { "const": "equal_to_case_insensitive" - }, - "round_values": { - "type": "boolean" - }, - "value_is_reference": { - "type": "boolean" - }, - "type_insensitive": { - "type": "boolean" } }, "required": ["operator", "value"], @@ -565,15 +547,6 @@ "properties": { "operator": { "const": "not_equal_to" - }, - "round_values": { - "type": "boolean" - }, - "value_is_reference": { - "type": "boolean" - }, - "type_insensitive": { - "type": "boolean" } }, "required": ["operator", "value"], @@ -583,15 +556,6 @@ "properties": { "operator": { "const": "not_equal_to_case_insensitive" - }, - "round_values": { - "type": "boolean" - }, - "value_is_reference": { - "type": "boolean" - }, - "type_insensitive": { - "type": "boolean" } }, "required": ["operator", "value"], @@ -967,6 +931,15 @@ } ] }, + "round_values": { + "type": "boolean" + }, + "value_is_reference": { + "type": "boolean" + }, + "type_insensitive": { + "type": "boolean" + }, "value_is_literal": { "type": "boolean" }, From b3c09d5e52c34e369df11a4153e9f6ef428100e6 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 25 Nov 2025 10:40:35 -0500 Subject: [PATCH 3/5] schema --- resources/schema/CORE-base.json | 3 --- resources/schema/Operations.json | 3 +++ 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/resources/schema/CORE-base.json b/resources/schema/CORE-base.json index 9eb632fa7..1619ba6da 100644 --- a/resources/schema/CORE-base.json +++ b/resources/schema/CORE-base.json @@ -423,9 +423,6 @@ } ] }, - "Is Relationship": { - "const": true - }, "Child": { "const": true }, diff --git a/resources/schema/Operations.json b/resources/schema/Operations.json index 7e9013d75..88b35f19c 100644 --- a/resources/schema/Operations.json +++ b/resources/schema/Operations.json @@ -462,6 +462,9 @@ "ct_version": { "type": "string" }, + "delimiter": { + "type": "string" + }, "dictionary_term_type": { "enum": ["LLT", "PT", "HLT", "HLGT", "SOC"] }, From 14315405c47ba67c69e3955263b1a2f39de723cd Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 25 Nov 2025 10:48:48 -0500 Subject: [PATCH 4/5] schema --- resources/schema/Operator.json | 1 + 1 file changed, 1 insertion(+) diff --git a/resources/schema/Operator.json b/resources/schema/Operator.json index 459a19607..ac5f95b51 100644 --- a/resources/schema/Operator.json +++ b/resources/schema/Operator.json @@ -561,6 +561,7 @@ "negative": { "type": "boolean" }, "codelistcheck": { "enum": ["code", "value"], "type": "string" }, "codelistlevel": { "enum": ["term", "codelist"], "type": "string" }, + "delimiter": { "type": "string" }, "operator": { "type": "string" }, "order": { "enum": ["asc", "dsc"], "type": "string" }, "ordering": { "$ref": "CORE-base.json#/$defs/VariableName" }, From 58a13025988711d2e6c1195e8b84adc0ac7a53d5 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 25 Nov 2025 10:51:00 -0500 Subject: [PATCH 5/5] remove operator --- resources/schema/Operator.json | 1 - 1 file changed, 1 deletion(-) diff --git a/resources/schema/Operator.json b/resources/schema/Operator.json index ac5f95b51..459a19607 100644 --- a/resources/schema/Operator.json +++ b/resources/schema/Operator.json @@ -561,7 +561,6 @@ "negative": { "type": "boolean" }, "codelistcheck": { "enum": ["code", "value"], "type": "string" }, "codelistlevel": { "enum": ["term", "codelist"], "type": "string" }, - "delimiter": { "type": "string" }, "operator": { "type": "string" }, "order": { "enum": ["asc", "dsc"], "type": "string" }, "ordering": { "$ref": "CORE-base.json#/$defs/VariableName" },