diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py
index 02d16cedc..7174f09e9 100644
--- a/cdisc_rules_engine/models/operation_params.py
+++ b/cdisc_rules_engine/models/operation_params.py
@@ -58,6 +58,7 @@ class OperationParams:
     returntype: str = None
     source: str = None
     target: str = None
+    subtract: str = None
     value_is_reference: bool = False
     namespace: str = None
     delimiter: str = None
diff --git a/cdisc_rules_engine/operations/minus.py b/cdisc_rules_engine/operations/minus.py
new file mode 100644
index 000000000..71061b3f1
--- /dev/null
+++ b/cdisc_rules_engine/operations/minus.py
@@ -0,0 +1,89 @@
+"""
+Set difference operation: name minus subtract.
+Returns elements in name that are not in subtract, preserving order from name.
+"""
+
+from typing import Any
+
+from cdisc_rules_engine.operations.base_operation import BaseOperation
+
+
+def _normalize_to_list(val: Any) -> list:
+    """
+    Convert an operand into a new list.
+
+    None becomes an empty list; lists, tuples, sets and frozensets are copied;
+    array-likes whose tolist() returns a list (for example numpy arrays or
+    pandas Series) are converted through tolist(); any other value, strings
+    included, is wrapped as a single element.
+
+    A new list is always returned so the result never aliases the input.
+    """
+    if val is None:
+        return []
+    if isinstance(val, (list, tuple, set, frozenset)):
+        return list(val)
+    to_list = getattr(val, "tolist", None)
+    if callable(to_list):
+        converted = to_list()
+        # Scalars such as numpy.float64 also expose tolist() but return a plain
+        # value; those fall through and are wrapped as a single element.
+        if isinstance(converted, list):
+            return converted
+    return [val]
+
+
+def _set_difference_preserve_order(list_a: Any, list_b: Any) -> list:
+    """
+    Compute set difference A \\ B (elements in A not in B).
+
+    Preserves the order (and any duplicates) of list_a. Either operand may be
+    None, a scalar or a collection; see _normalize_to_list.
+    """
+    minuend = _normalize_to_list(list_a)
+    subtrahend = _normalize_to_list(list_b)
+    try:
+        excluded = set(subtrahend)
+        return [item for item in minuend if item not in excluded]
+    except TypeError:
+        # Unhashable elements (nested lists, dicts, ...) cannot be stored in or
+        # looked up in a set, so compare by equality against the list instead.
+        return [item for item in minuend if item not in subtrahend]
+
+
+def _first_row_value(dataset, column_ref):
+    """
+    Return the first-row value of column_ref in dataset.
+
+    Returns None when column_ref is unset, is not a column of dataset, or the
+    column has no rows.
+    """
+    if not column_ref or column_ref not in dataset.columns:
+        return None
+    try:
+        return dataset[column_ref].iloc[0]
+    except IndexError:
+        # pandas-style positional access raises IndexError on an empty column.
+        return None
+
+
+class Minus(BaseOperation):
+    """
+    Operation that computes set difference: name minus subtract.
+    name (minuend) and subtract (subtrahend) reference other operation results.
+    Returns elements in name that are not in subtract.
+    """
+
+    def _execute_operation(self):
+        """
+        Return the elements of the name operand that are absent from subtract.
+
+        The operands are read from the first row of the evaluation dataset
+        columns named by params.target and params.subtract. An operand that
+        cannot be read counts as empty: a missing name yields [] and a missing
+        subtract yields every element of name.
+        """
+        return _set_difference_preserve_order(
+            _first_row_value(self.evaluation_dataset, self.params.target),
+            _first_row_value(self.evaluation_dataset, self.params.subtract),
+        )
diff --git a/cdisc_rules_engine/operations/operations_factory.py b/cdisc_rules_engine/operations/operations_factory.py
index d943144d1..477f25f99 100644
--- a/cdisc_rules_engine/operations/operations_factory.py
+++ b/cdisc_rules_engine/operations/operations_factory.py
@@ -41,6 +41,7 @@
     MedDRATermReferencesValidator,
 )
 from cdisc_rules_engine.operations.min_date import MinDate
+from cdisc_rules_engine.operations.minus import Minus
 from cdisc_rules_engine.operations.minimum import Minimum
 from cdisc_rules_engine.operations.record_count import RecordCount
 from cdisc_rules_engine.operations.split_by import SplitBy
@@ -109,6 +110,7 @@ class OperationsFactory(FactoryInterface):
         "mean": Mean,
         "min": Minimum,
         "min_date": MinDate,
+        "minus": Minus,
         "record_count": RecordCount,
         "valid_meddra_code_references": MedDRACodeReferencesValidator,
         "valid_whodrug_references": WhodrugReferencesValidator,
diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py
index 85e128a61..061979714 100644
--- a/cdisc_rules_engine/utilities/rule_processor.py
+++ b/cdisc_rules_engine/utilities/rule_processor.py
@@ -391,6 +391,7 @@ def perform_rule_operations(
                 operation_id=operation.get("id"),
                 operation_name=operation.get("operator"),
                 original_target=original_target,
+                subtract=operation.get("subtract"),
                 regex=operation.get("regex"),
                 returntype=operation.get("returntype"),
                 source=operation.get("source"),
diff --git a/resources/schema/rule/Operations.json
b/resources/schema/rule/Operations.json index 5ec2f1f10..6e2747a57 100644 --- a/resources/schema/rule/Operations.json +++ b/resources/schema/rule/Operations.json @@ -50,42 +50,54 @@ }, { "properties": { - "operator": { "const": "domain_is_custom" } + "operator": { + "const": "domain_is_custom" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "domain_label" } + "operator": { + "const": "domain_label" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "dy" } + "operator": { + "const": "dy" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "extract_metadata" } + "operator": { + "const": "extract_metadata" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "expected_variables" } + "operator": { + "const": "expected_variables" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "get_codelist_attributes" } + "operator": { + "const": "get_codelist_attributes" + } }, "required": ["id", "operator", "name", "ct_attribute", "version"], "type": "object" @@ -162,46 +174,67 @@ }, { "properties": { - "operator": { "const": "map" } + "operator": { + "const": "map" + } }, "required": ["id", "operator", "map"], "type": "object" }, { "properties": { - "operator": { "const": "max" } + "operator": { + "const": "max" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "max_date" } + "operator": { + "const": "max_date" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "mean" } + "operator": { + "const": "mean" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "min" } + "operator": { + "const": "min" + } }, "required": ["id", "operator", "name"], 
"type": "object" }, { "properties": { - "operator": { "const": "min_date" } + "operator": { + "const": "min_date" + } }, "required": ["id", "operator", "name"], "type": "object" }, + { + "properties": { + "operator": { + "const": "minus" + } + }, + "required": ["id", "operator", "name", "subtract"], + "type": "object" + }, { "properties": { "operator": { @@ -222,42 +255,54 @@ }, { "properties": { - "operator": { "const": "record_count" } + "operator": { + "const": "record_count" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "required_variables" } + "operator": { + "const": "required_variables" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "split_by" } + "operator": { + "const": "split_by" + } }, "required": ["id", "operator", "delimiter", "name"], "type": "object" }, { "properties": { - "operator": { "const": "study_domains" } + "operator": { + "const": "study_domains" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "dataset_names" } + "operator": { + "const": "dataset_names" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "standard_domains" } + "operator": { + "const": "standard_domains" + } }, "required": ["id", "operator"], "type": "object" @@ -280,7 +325,6 @@ "required": ["id", "operator", "external_dictionary_type"], "type": "object" }, - { "properties": { "operator": { @@ -364,28 +408,36 @@ }, { "properties": { - "operator": { "const": "variable_count" } + "operator": { + "const": "variable_count" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "variable_exists" } + "operator": { + "const": "variable_exists" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "variable_is_null" } + "operator": { + "const": "variable_is_null" + } }, 
"required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "variable_names" } + "operator": { + "const": "variable_names" + } }, "required": ["id", "operator"], "type": "object" @@ -410,7 +462,9 @@ }, { "properties": { - "operator": { "const": "get_xhtml_errors" } + "operator": { + "const": "get_xhtml_errors" + } }, "required": ["id", "operator", "name", "namespace"], "type": "object" @@ -457,7 +511,9 @@ ] }, "ct_package_types": { - "items": { "$ref": "Operations.json#/properties/ct_package_type" }, + "items": { + "$ref": "Operations.json#/properties/ct_package_type" + }, "type": "array" }, "ct_packages": { @@ -488,7 +544,9 @@ "external_dictionary_type": { "enum": ["meddra"] }, - "filter": { "type": "object" }, + "filter": { + "type": "object" + }, "filter_key": { "type": "string" }, @@ -536,7 +594,9 @@ "items": { "type": "object", "properties": { - "output": { "type": "string" } + "output": { + "type": "string" + } }, "required": ["output"] } @@ -560,6 +620,9 @@ "source": { "type": "string" }, + "subtract": { + "type": "string" + }, "term_value": { "type": "string" }, diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md index 9eb3e89e2..846dc0156 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -750,6 +750,22 @@ Operations: operator: get_column_order_from_dataset ``` +### minus + +Computes set difference: elements in `name` that are not in `subtract`. Uses [set difference](https://en.wikipedia.org/wiki/Complement_(set_theory)#Relative_complement) semantics (A ∖ B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results. 
+ +```yaml +Operations: + - id: $expected_variables + operator: expected_variables + - id: $dataset_variables + operator: get_column_order_from_dataset + - id: $expected_minus_dataset + name: $expected_variables + operator: minus + subtract: $dataset_variables +``` + ### label_referenced_variable_metadata Generates a dataframe where each record in the dataframe is the library ig variable metadata corresponding with the variable label found in the column provided in name. The metadata column names are prefixed with the string provided in `id`. diff --git a/resources/schema/rule/check_parameter.md b/resources/schema/rule/check_parameter.md index 2f227d87d..8eb7c2984 100644 --- a/resources/schema/rule/check_parameter.md +++ b/resources/schema/rule/check_parameter.md @@ -1,4 +1,3 @@ - ## Overview Check parameters are configuration elements that define how validation rules are applied within the CDISC rules engine. These parameters control the behavior, scope, and criteria for data validation checks across clinical trial datasets. Each parameter serves a specific purpose in customizing rule logic to ensure data integrity and compliance with CDISC standards. @@ -267,6 +266,17 @@ Expected return type for operation results. Valid values are "code" (for NCI cod Either "submission" or "evaluation" for which dataset to check the variable_is_null from. Evaluation is the dataset constructed by the rule type while submission is the raw dataset submitted that is being evaluated. +### subtract + +Reference to another operation result, used as the second operand in operations that take two inputs. For example, in the `minus` operation, `name` references the minuend (first list) and `subtract` references the subtrahend (second list); the operation returns elements in the first list that are not in the second. 
+
+```yaml
+- id: $expected_minus_dataset
+  name: $expected_variables
+  operator: minus
+  subtract: $dataset_variables
+```
+
 ### term_code

 Terminology code value used in controlled terminology operations for code-based lookups.
diff --git a/tests/unit/test_operations/test_minus.py b/tests/unit/test_operations/test_minus.py
new file mode 100644
index 000000000..6969ea99b
--- /dev/null
+++ b/tests/unit/test_operations/test_minus.py
@@ -0,0 +1,123 @@
+"""Unit tests for the minus operation and its set-difference helper."""
+
+from unittest.mock import MagicMock
+
+import pytest
+
+from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset
+from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset
+from cdisc_rules_engine.models.operation_params import OperationParams
+from cdisc_rules_engine.operations.minus import Minus, _set_difference_preserve_order
+
+# Every dataset-backed test runs once per dataset implementation.
+DATASET_TYPES = [PandasDataset, DaskDataset]
+
+NAME_COLUMN = "$expected_variables"
+SUBTRACT_COLUMN = "$dataset_variables"
+RESULT_COLUMN = "$expected_minus_dataset"
+
+
+def _two_rows(cell):
+    """Return a two-row column whose rows each hold their own copy of cell."""
+    return [list(cell), list(cell)]
+
+
+def _first_result(params, dataset):
+    """Run Minus on dataset and return the first row of its result as a list."""
+    outcome = Minus(params, dataset, MagicMock(), MagicMock()).execute()
+    return list(outcome[params.operation_id].iloc[0])
+
+
+@pytest.fixture
+def minus_params(operation_params: OperationParams) -> OperationParams:
+    """Return the operation_params fixture configured for the minus tests."""
+    operation_params.operation_id = RESULT_COLUMN
+    operation_params.target = NAME_COLUMN
+    operation_params.subtract = SUBTRACT_COLUMN
+    return operation_params
+
+
+@pytest.mark.parametrize(
+    "list_a,list_b,expected",
+    [
+        (["a", "b", "c"], ["b"], ["a", "c"]),
+        (["a", "b", "c"], [], ["a", "b", "c"]),
+        (["a", "b", "c"], ["a", "b", "c"], []),
+        (["A", "B", "C", "D"], ["B", "D"], ["A", "C"]),
+        (["a", "b", "b", "c"], ["b"], ["a", "c"]),
+        (["a", "a", "b"], ["b"], ["a", "a"]),
+        ("x", ["a"], ["x"]),
+        (["x"], "a", ["x"]),
+        ("a", "b", ["a"]),
+        (["a", "", "b"], [""], ["a", "b"]),
+        (["a", "", "b"], ["c"], ["a", "", "b"]),
+        ([""], [""], []),
+    ],
+)
+def test_set_difference_preserve_order(list_a, list_b, expected):
+    """The helper keeps the order of list_a and accepts scalar operands."""
+    difference = _set_difference_preserve_order(list_a, list_b)
+    assert difference == expected
+
+
+@pytest.mark.parametrize("dataset_type", DATASET_TYPES)
+def test_minus_operation(minus_params: OperationParams, dataset_type):
+    """Variables listed in name but absent from subtract are returned."""
+    present = ["STUDYID", "DOMAIN", "AESEQ", "AETERM"]
+    eval_dataset = dataset_type.from_dict(
+        {
+            NAME_COLUMN: _two_rows(present + ["AEDECOD"]),
+            SUBTRACT_COLUMN: _two_rows(present),
+        }
+    )
+    assert _first_result(minus_params, eval_dataset) == ["AEDECOD"]
+
+
+@pytest.mark.parametrize("dataset_type", DATASET_TYPES)
+def test_minus_empty_subtract_returns_all_of_name(
+    minus_params: OperationParams, dataset_type
+):
+    """An empty subtract list leaves the name list untouched."""
+    eval_dataset = dataset_type.from_dict(
+        {
+            NAME_COLUMN: _two_rows(["A", "B", "C"]),
+            SUBTRACT_COLUMN: _two_rows([]),
+        }
+    )
+    assert _first_result(minus_params, eval_dataset) == ["A", "B", "C"]
+
+
+@pytest.mark.parametrize(
+    "expected_vars,dataset_vars,expected",
+    [
+        ([["a", "b", "b", "c"], ["a", "b", "b", "c"]], [["b"], ["b"]], ["a", "c"]),
+        ([["a", "a", "b"], ["a", "a", "b"]], [["b"], ["b"]], ["a", "a"]),
+        ([["a", "", "b"], ["a", "", "b"]], [[""], [""]], ["a", "b"]),
+        ([["a", "", "b"], ["a", "", "b"]], [["c"], ["c"]], ["a", "", "b"]),
+        ([[""], [""]], [[""], [""]], []),
+        (["x", "y"], [["a"], ["a"]], ["x"]),
+    ],
+)
+@pytest.mark.parametrize("dataset_type", DATASET_TYPES)
+def test_minus_operation_edge_cases(
+    minus_params: OperationParams,
+    dataset_type,
+    expected_vars,
+    dataset_vars,
+    expected,
+):
+    """Duplicates, empty strings and scalar cells follow the helper's rules."""
+    columns = {NAME_COLUMN: expected_vars, SUBTRACT_COLUMN: dataset_vars}
+    eval_dataset = dataset_type.from_dict(columns)
+    assert _first_result(minus_params, eval_dataset) == expected
+
+
+@pytest.mark.parametrize("dataset_type", DATASET_TYPES)
+def test_minus_name_ref_missing_returns_empty(
+    minus_params: OperationParams, dataset_type
+):
+    """A name reference that is not a dataset column yields an empty list."""
+    eval_dataset = dataset_type.from_dict(
+        {SUBTRACT_COLUMN: _two_rows(["STUDYID", "DOMAIN"])}
+    )
+    assert _first_result(minus_params, eval_dataset) == []