From 9fad0ae167c014764fb63425b31fbc3e76762ba9 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 11 Feb 2026 15:34:28 -0500 Subject: [PATCH 1/6] Add minus operation for set difference to support CORE-000334 expected-variable checks --- cdisc_rules_engine/models/operation_params.py | 1 + cdisc_rules_engine/operations/minus.py | 47 +++ .../operations/operations_factory.py | 368 +++++++++--------- .../utilities/rule_processor.py | 1 + resources/schema/Operations.json | 117 ++++-- resources/schema/Operations.md | 28 ++ tests/unit/test_operations/test_minus.py | 96 +++++ 7 files changed, 448 insertions(+), 210 deletions(-) create mode 100644 cdisc_rules_engine/operations/minus.py create mode 100644 tests/unit/test_operations/test_minus.py diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 72913040c..98edce90c 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -56,6 +56,7 @@ class OperationParams: regex: str = None returntype: str = None target: str = None + value: str = None value_is_reference: bool = False namespace: str = None delimiter: str = None diff --git a/cdisc_rules_engine/operations/minus.py b/cdisc_rules_engine/operations/minus.py new file mode 100644 index 000000000..76d126d0f --- /dev/null +++ b/cdisc_rules_engine/operations/minus.py @@ -0,0 +1,47 @@ +""" +Set difference operation: name minus value. +Returns elements in name that are not in value, preserving order from name. +""" + +from cdisc_rules_engine.operations.base_operation import BaseOperation + + +def _normalize_to_list(val): + """Convert value to a list for set operations.""" + if val is None: + return [] + if isinstance(val, list): + return val + if isinstance(val, (set, tuple)): + return list(val) + return [val] + + +def _set_difference_preserve_order(list_a: list, list_b: list) -> list: + """ + Compute set difference A \\ B (elements in A not in B). + Preserves order from list_a. + """ + set_b = set(_normalize_to_list(list_b)) + return [x for x in _normalize_to_list(list_a) if x not in set_b] + + +class Minus(BaseOperation): + """ + Operation that computes set difference: name minus value. + name (minuend) and value (subtrahend) reference other operation results. + Returns elements in name that are not in value. + """ + + def _execute_operation(self): + name_ref = self.params.target # minuend (list A) + value_ref = self.params.value # subtrahend (list B) + + if not name_ref or name_ref not in self.evaluation_dataset.columns: + return [] + list_a = self.evaluation_dataset[name_ref].iloc[0] + if not value_ref or value_ref not in self.evaluation_dataset.columns: + # listA minus empty = all of listA (per Sam) + return _normalize_to_list(list_a) + list_b = self.evaluation_dataset[value_ref].iloc[0] + return _set_difference_preserve_order(list_a, list_b) diff --git a/cdisc_rules_engine/operations/operations_factory.py b/cdisc_rules_engine/operations/operations_factory.py index d943144d1..9340f8b64 100644 --- a/cdisc_rules_engine/operations/operations_factory.py +++ b/cdisc_rules_engine/operations/operations_factory.py @@ -1,183 +1,185 @@ -from typing import Type - -from cdisc_rules_engine.interfaces import FactoryInterface -from cdisc_rules_engine.operations.base_operation import BaseOperation -from cdisc_rules_engine.operations.dataset_names import DatasetNames -from cdisc_rules_engine.operations.dataset_column_order import DatasetColumnOrder -from cdisc_rules_engine.operations.day_data_validator import DayDataValidator -from cdisc_rules_engine.operations.define_dictionary_version_validator import ( - DefineDictionaryVersionValidator, -) -from cdisc_rules_engine.operations.distinct import Distinct -from cdisc_rules_engine.operations.extract_metadata import ExtractMetadata -from cdisc_rules_engine.operations.get_xhtml_errors import GetXhtmlErrors -from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder -from cdisc_rules_engine.operations.library_model_column_order import ( - LibraryModelColumnOrder, -) -from cdisc_rules_engine.operations.map import Map -from cdisc_rules_engine.operations.parent_library_model_column_order import ( - ParentLibraryModelColumnOrder, -) -from cdisc_rules_engine.operations.get_codelist_attributes import ( - CodeListAttributes, -) -from cdisc_rules_engine.operations.get_model_filtered_variables import ( - LibraryModelVariablesFilter, -) -from cdisc_rules_engine.operations.max_date import MaxDate -from cdisc_rules_engine.operations.maximum import Maximum -from cdisc_rules_engine.operations.mean import Mean -from cdisc_rules_engine.operations.domain_is_custom import DomainIsCustom -from cdisc_rules_engine.operations.domain_label import DomainLabel -from cdisc_rules_engine.operations.standard_domains import StandardDomains -from cdisc_rules_engine.operations.meddra_code_references_validator import ( - MedDRACodeReferencesValidator, -) -from cdisc_rules_engine.operations.meddra_code_term_pairs_validator import ( - MedDRACodeTermPairsValidator, -) -from cdisc_rules_engine.operations.meddra_term_references_validator import ( - MedDRATermReferencesValidator, -) -from cdisc_rules_engine.operations.min_date import MinDate -from cdisc_rules_engine.operations.minimum import Minimum -from cdisc_rules_engine.operations.record_count import RecordCount -from cdisc_rules_engine.operations.split_by import SplitBy -from cdisc_rules_engine.operations.valid_external_dictionary_code import ( - ValidExternalDictionaryCode, -) -from cdisc_rules_engine.operations.valid_external_dictionary_code_term_pair import ( - ValidExternalDictionaryCodeTermPair, -) -from cdisc_rules_engine.operations.variable_exists import VariableExists -from cdisc_rules_engine.operations.variable_names import VariableNames -from cdisc_rules_engine.operations.variable_value_count import VariableValueCount -from cdisc_rules_engine.operations.whodrug_references_validator import ( - WhodrugReferencesValidator, -) -from cdisc_rules_engine.operations.whodrug_hierarchy_validator import ( - WhodrugHierarchyValidator, -) -from cdisc_rules_engine.operations.variable_count import VariableCount -from cdisc_rules_engine.operations.variable_is_null import VariableIsNull -from cdisc_rules_engine.operations.required_variables import RequiredVariables -from cdisc_rules_engine.operations.expected_variables import ExpectedVariables -from cdisc_rules_engine.operations.permissible_variables import PermissibleVariables -from cdisc_rules_engine.operations.study_domains import StudyDomains -from cdisc_rules_engine.operations.valid_codelist_dates import ValidCodelistDates -from cdisc_rules_engine.operations.label_referenced_variable_metadata import ( - LabelReferencedVariableMetadata, -) -from cdisc_rules_engine.operations.name_referenced_variable_metadata import ( - NameReferencedVariableMetadata, -) -from cdisc_rules_engine.operations.define_variable_metadata import ( - DefineVariableMetadata, -) -from cdisc_rules_engine.operations.valid_external_dictionary_value import ( - ValidExternalDictionaryValue, -) -from cdisc_rules_engine.operations.codelist_terms import CodelistTerms -from cdisc_rules_engine.operations.codelist_extensible import CodelistExtensible -from cdisc_rules_engine.operations.define_xml_extensible_codelists import ( - DefineCodelists, -) -from cdisc_rules_engine.operations.get_dataset_filtered_variables import ( - GetDatasetFilteredVariables, -) - - -class OperationsFactory(FactoryInterface): - _operations_map = { - "codelist_extensible": CodelistExtensible, - "codelist_terms": CodelistTerms, - "dataset_names": DatasetNames, - "define_extensible_codelists": DefineCodelists, - "distinct": Distinct, - "dy": DayDataValidator, - "extract_metadata": ExtractMetadata, - "get_column_order_from_dataset": DatasetColumnOrder, - "get_column_order_from_library": LibraryColumnOrder, - "get_codelist_attributes": CodeListAttributes, - "get_model_column_order": LibraryModelColumnOrder, - "get_model_filtered_variables": LibraryModelVariablesFilter, - "get_parent_model_column_order": ParentLibraryModelColumnOrder, - "map": Map, - "max": Maximum, - "max_date": MaxDate, - "mean": Mean, - "min": Minimum, - "min_date": MinDate, - "record_count": RecordCount, - "valid_meddra_code_references": MedDRACodeReferencesValidator, - "valid_whodrug_references": WhodrugReferencesValidator, - "whodrug_code_hierarchy": WhodrugHierarchyValidator, - "valid_meddra_term_references": MedDRATermReferencesValidator, - "valid_meddra_code_term_pairs": MedDRACodeTermPairsValidator, - "variable_exists": VariableExists, - "variable_names": VariableNames, - "variable_value_count": VariableValueCount, - "variable_count": VariableCount, - "variable_is_null": VariableIsNull, - "domain_is_custom": DomainIsCustom, - "domain_label": DomainLabel, - "standard_domains": StandardDomains, - "required_variables": RequiredVariables, - "split_by": SplitBy, - "expected_variables": ExpectedVariables, - "permissible_variables": PermissibleVariables, - "study_domains": StudyDomains, - "valid_codelist_dates": ValidCodelistDates, - "label_referenced_variable_metadata": LabelReferencedVariableMetadata, - "name_referenced_variable_metadata": NameReferencedVariableMetadata, - "define_variable_metadata": DefineVariableMetadata, - "valid_external_dictionary_value": ValidExternalDictionaryValue, - "valid_external_dictionary_code": ValidExternalDictionaryCode, - "valid_external_dictionary_code_term_pair": ValidExternalDictionaryCodeTermPair, - "valid_define_external_dictionary_version": DefineDictionaryVersionValidator, - "get_dataset_filtered_variables": GetDatasetFilteredVariables, - "get_xhtml_errors": GetXhtmlErrors, - } - - @classmethod - def register_service(cls, name: str, operation: Type[BaseOperation]) -> None: - """ - Save mapping of operation name and it's implementation - """ - if not name: - raise ValueError("Operation name must not be empty!") - if not issubclass(operation, BaseOperation): - raise TypeError("Implementation of BaseOperation required!") - cls._operations_map[name] = operation - - def get_service( - self, - name, - **kwargs, - ) -> BaseOperation: - """Get instance of operation that matches operation specified in params""" - required_args = { - "operation_params", - "original_dataset", - "cache", - "data_service", - "library_metadata", - } - if not required_args.issubset(kwargs.keys()): - raise ValueError( - f"One of the following required key word arguments is missing: " - f"{required_args}" - ) - if name in self._operations_map: - return self._operations_map.get(name)( - kwargs.get("operation_params"), - kwargs.get("original_dataset"), - kwargs.get("cache"), - kwargs.get("data_service"), - kwargs.get("library_metadata"), - ) - raise ValueError( - f"Operation name must be in {list(self._operations_map.keys())}, " - f"given operation name is {name}" - ) +from typing import Type + +from cdisc_rules_engine.interfaces import FactoryInterface +from cdisc_rules_engine.operations.base_operation import BaseOperation +from cdisc_rules_engine.operations.dataset_names import DatasetNames +from cdisc_rules_engine.operations.dataset_column_order import DatasetColumnOrder +from cdisc_rules_engine.operations.day_data_validator import DayDataValidator +from cdisc_rules_engine.operations.define_dictionary_version_validator import ( + DefineDictionaryVersionValidator, +) +from cdisc_rules_engine.operations.distinct import Distinct +from cdisc_rules_engine.operations.extract_metadata import ExtractMetadata +from cdisc_rules_engine.operations.get_xhtml_errors import GetXhtmlErrors +from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder +from cdisc_rules_engine.operations.library_model_column_order import ( + LibraryModelColumnOrder, +) +from cdisc_rules_engine.operations.map import Map +from cdisc_rules_engine.operations.parent_library_model_column_order import ( + ParentLibraryModelColumnOrder, +) +from cdisc_rules_engine.operations.get_codelist_attributes import ( + CodeListAttributes, +) +from cdisc_rules_engine.operations.get_model_filtered_variables import ( + LibraryModelVariablesFilter, +) +from cdisc_rules_engine.operations.max_date import MaxDate +from cdisc_rules_engine.operations.maximum import Maximum +from cdisc_rules_engine.operations.mean import Mean +from cdisc_rules_engine.operations.domain_is_custom import DomainIsCustom +from cdisc_rules_engine.operations.domain_label import DomainLabel +from cdisc_rules_engine.operations.standard_domains import StandardDomains +from cdisc_rules_engine.operations.meddra_code_references_validator import ( + MedDRACodeReferencesValidator, +) +from cdisc_rules_engine.operations.meddra_code_term_pairs_validator import ( + MedDRACodeTermPairsValidator, +) +from cdisc_rules_engine.operations.meddra_term_references_validator import ( + MedDRATermReferencesValidator, +) +from cdisc_rules_engine.operations.min_date import MinDate +from cdisc_rules_engine.operations.minus import Minus +from cdisc_rules_engine.operations.minimum import Minimum +from cdisc_rules_engine.operations.record_count import RecordCount +from cdisc_rules_engine.operations.split_by import SplitBy +from cdisc_rules_engine.operations.valid_external_dictionary_code import ( + ValidExternalDictionaryCode, +) +from cdisc_rules_engine.operations.valid_external_dictionary_code_term_pair import ( + ValidExternalDictionaryCodeTermPair, +) +from cdisc_rules_engine.operations.variable_exists import VariableExists +from cdisc_rules_engine.operations.variable_names import VariableNames +from cdisc_rules_engine.operations.variable_value_count import VariableValueCount +from cdisc_rules_engine.operations.whodrug_references_validator import ( + WhodrugReferencesValidator, +) +from cdisc_rules_engine.operations.whodrug_hierarchy_validator import ( + WhodrugHierarchyValidator, +) +from cdisc_rules_engine.operations.variable_count import VariableCount +from cdisc_rules_engine.operations.variable_is_null import VariableIsNull +from cdisc_rules_engine.operations.required_variables import RequiredVariables +from cdisc_rules_engine.operations.expected_variables import ExpectedVariables +from cdisc_rules_engine.operations.permissible_variables import PermissibleVariables +from cdisc_rules_engine.operations.study_domains import StudyDomains +from cdisc_rules_engine.operations.valid_codelist_dates import ValidCodelistDates +from cdisc_rules_engine.operations.label_referenced_variable_metadata import ( + LabelReferencedVariableMetadata, +) +from cdisc_rules_engine.operations.name_referenced_variable_metadata import ( + NameReferencedVariableMetadata, +) +from cdisc_rules_engine.operations.define_variable_metadata import ( + DefineVariableMetadata, +) +from cdisc_rules_engine.operations.valid_external_dictionary_value import ( + ValidExternalDictionaryValue, +) +from cdisc_rules_engine.operations.codelist_terms import CodelistTerms +from cdisc_rules_engine.operations.codelist_extensible import CodelistExtensible +from cdisc_rules_engine.operations.define_xml_extensible_codelists import ( + DefineCodelists, +) +from cdisc_rules_engine.operations.get_dataset_filtered_variables import ( + GetDatasetFilteredVariables, +) + + +class OperationsFactory(FactoryInterface): + _operations_map = { + "codelist_extensible": CodelistExtensible, + "codelist_terms": CodelistTerms, + "dataset_names": DatasetNames, + "define_extensible_codelists": DefineCodelists, + "distinct": Distinct, + "dy": DayDataValidator, + "extract_metadata": ExtractMetadata, + "get_column_order_from_dataset": DatasetColumnOrder, + "get_column_order_from_library": LibraryColumnOrder, + "get_codelist_attributes": CodeListAttributes, + "get_model_column_order": LibraryModelColumnOrder, + "get_model_filtered_variables": LibraryModelVariablesFilter, + "get_parent_model_column_order": ParentLibraryModelColumnOrder, + "map": Map, + "max": Maximum, + "max_date": MaxDate, + "mean": Mean, + "min": Minimum, + "min_date": MinDate, + "minus": Minus, + "record_count": RecordCount, + "valid_meddra_code_references": MedDRACodeReferencesValidator, + "valid_whodrug_references": WhodrugReferencesValidator, + "whodrug_code_hierarchy": WhodrugHierarchyValidator, + "valid_meddra_term_references": MedDRATermReferencesValidator, + "valid_meddra_code_term_pairs": MedDRACodeTermPairsValidator, + "variable_exists": VariableExists, + "variable_names": VariableNames, + "variable_value_count": VariableValueCount, + "variable_count": VariableCount, + "variable_is_null": VariableIsNull, + "domain_is_custom": DomainIsCustom, + "domain_label": DomainLabel, + "standard_domains": StandardDomains, + "required_variables": RequiredVariables, + "split_by": SplitBy, + "expected_variables": ExpectedVariables, + "permissible_variables": PermissibleVariables, + "study_domains": StudyDomains, + "valid_codelist_dates": ValidCodelistDates, + "label_referenced_variable_metadata": LabelReferencedVariableMetadata, + "name_referenced_variable_metadata": NameReferencedVariableMetadata, + "define_variable_metadata": DefineVariableMetadata, + "valid_external_dictionary_value": ValidExternalDictionaryValue, + "valid_external_dictionary_code": ValidExternalDictionaryCode, + "valid_external_dictionary_code_term_pair": ValidExternalDictionaryCodeTermPair, + "valid_define_external_dictionary_version": DefineDictionaryVersionValidator, + "get_dataset_filtered_variables": GetDatasetFilteredVariables, + "get_xhtml_errors": GetXhtmlErrors, + } + + @classmethod + def register_service(cls, name: str, operation: Type[BaseOperation]) -> None: + """ + Save mapping of operation name and it's implementation + """ + if not name: + raise ValueError("Operation name must not be empty!") + if not issubclass(operation, BaseOperation): + raise TypeError("Implementation of BaseOperation required!") + cls._operations_map[name] = operation + + def get_service( + self, + name, + **kwargs, + ) -> BaseOperation: + """Get instance of operation that matches operation specified in params""" + required_args = { + "operation_params", + "original_dataset", + "cache", + "data_service", + "library_metadata", + } + if not required_args.issubset(kwargs.keys()): + raise ValueError( + f"One of the following required key word arguments is missing: " + f"{required_args}" + ) + if name in self._operations_map: + return self._operations_map.get(name)( + kwargs.get("operation_params"), + kwargs.get("original_dataset"), + kwargs.get("cache"), + kwargs.get("data_service"), + kwargs.get("library_metadata"), + ) + raise ValueError( + f"Operation name must be in {list(self._operations_map.keys())}, " + f"given operation name is {name}" + ) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index d7c834321..6f05b8167 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -397,6 +397,7 @@ def perform_rule_operations( term_value=operation.get("term_value"), term_pref_term=operation.get("term_pref_term"), namespace=operation.get("namespace"), + value=operation.get("value"), value_is_reference=operation.get("value_is_reference", False), delimiter=operation.get("delimiter"), regex=operation.get("regex"), diff --git a/resources/schema/Operations.json b/resources/schema/Operations.json index ea439a54d..492310614 100644 --- a/resources/schema/Operations.json +++ b/resources/schema/Operations.json @@ -50,42 +50,54 @@ }, { "properties": { - "operator": { "const": "domain_is_custom" } + "operator": { + "const": "domain_is_custom" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "domain_label" } + "operator": { + "const": "domain_label" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "dy" } + "operator": { + "const": "dy" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "extract_metadata" } + "operator": { + "const": "extract_metadata" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "expected_variables" } + "operator": { + "const": "expected_variables" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "get_codelist_attributes" } + "operator": { + "const": "get_codelist_attributes" + } }, "required": ["id", "operator", "name", "ct_attribute", "version"], "type": "object" @@ -155,46 +167,67 @@ }, { "properties": { - "operator": { "const": "map" } + "operator": { + "const": "map" + } }, "required": ["id", "operator", "map"], "type": "object" }, { "properties": { - "operator": { "const": "max" } + "operator": { + "const": "max" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "max_date" } + "operator": { + "const": "max_date" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "mean" } + "operator": { + "const": "mean" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "min" } + "operator": { + "const": "min" + } }, "required": ["id", "operator", "name"], "type": "object" }, { "properties": { - "operator": { "const": "min_date" } + "operator": { + "const": "min_date" + } }, "required": ["id", "operator", "name"], "type": "object" }, + { + "properties": { + "operator": { + "const": "minus" + } + }, + "required": ["id", "operator", "name", "value"], + "type": "object" + }, { "properties": { "operator": { @@ -215,42 +248,54 @@ }, { "properties": { - "operator": { "const": "record_count" } + "operator": { + "const": "record_count" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "required_variables" } + "operator": { + "const": "required_variables" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "split_by" } + "operator": { + "const": "split_by" + } }, "required": ["id", "operator", "delimiter", "name"], "type": "object" }, { "properties": { - "operator": { "const": "study_domains" } + "operator": { + "const": "study_domains" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "dataset_names" } + "operator": { + "const": "dataset_names" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "standard_domains" } + "operator": { + "const": "standard_domains" + } }, "required": ["id", "operator"], "type": "object" @@ -273,7 +318,6 @@ "required": ["id", "operator", "external_dictionary_type"], "type": "object" }, - { "properties": { "operator": { @@ -357,28 +401,36 @@ }, { "properties": { - "operator": { "const": "variable_count" } + "operator": { + "const": "variable_count" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "variable_exists" } + "operator": { + "const": "variable_exists" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "variable_is_null" } + "operator": { + "const": "variable_is_null" + } }, "required": ["id", "operator"], "type": "object" }, { "properties": { - "operator": { "const": "variable_names" } + "operator": { + "const": "variable_names" + } }, "required": ["id", "operator"], "type": "object" @@ -403,7 +455,9 @@ }, { "properties": { - "operator": { "const": "get_xhtml_errors" } + "operator": { + "const": "get_xhtml_errors" + } }, "required": ["id", "operator", "name", "namespace"], "type": "object" @@ -450,7 +504,9 @@ ] }, "ct_package_types": { - "items": { "$ref": "Operations.json#/properties/ct_package_type" }, + "items": { + "$ref": "Operations.json#/properties/ct_package_type" + }, "type": "array" }, "ct_packages": { @@ -481,7 +537,9 @@ "external_dictionary_type": { "enum": ["meddra"] }, - "filter": { "type": "object" }, + "filter": { + "type": "object" + }, "filter_key": { "type": "string" }, @@ -529,7 +587,9 @@ "items": { "type": "object", "properties": { - "output": { "type": "string" } + "output": { + "type": "string" + } }, "required": ["output"] } @@ -559,6 +619,9 @@ "term_pref_term": { "type": "string" }, + "value": { + "type": "string" + }, "value_is_reference": { "type": "boolean" }, diff --git a/resources/schema/Operations.md b/resources/schema/Operations.md index e7b553a09..8d5cb8fe4 100644 --- a/resources/schema/Operations.md +++ b/resources/schema/Operations.md @@ -750,6 +750,34 @@ Operations: operator: get_column_order_from_dataset ``` +### minus + +Computes set difference: elements in `name` that are not in `value`. Uses [set difference]() semantics (A ∖ B). Preserves order from the first list. Both `name` and `value` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). + +When `value` is empty or missing, returns all elements from `name`. + +```yaml +Operations: + - id: $expected_variables + operator: expected_variables + - id: $dataset_variables + operator: get_column_order_from_dataset + - id: $expected_minus_dataset + name: $expected_variables + operator: minus + value: $dataset_variables +Check: + all: + - name: $expected_minus_dataset + operator: non_empty +Outcome: + Message: At least one expected variable is missing from dataset + Output Variables: + - $dataset_variables + - $expected_variables + - $expected_minus_dataset +``` + ### label_referenced_variable_metadata Generates a dataframe where each record in the dataframe is the library ig variable metadata corresponding with the variable label found in the column provided in name. The metadata column names are prefixed with the string provided in `id`. diff --git a/tests/unit/test_operations/test_minus.py b/tests/unit/test_operations/test_minus.py new file mode 100644 index 000000000..0f97745bd --- /dev/null +++ b/tests/unit/test_operations/test_minus.py @@ -0,0 +1,96 @@ +from unittest.mock import MagicMock + +from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset +from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset + +from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.operations.minus import Minus, _set_difference_preserve_order +import pytest + + +@pytest.fixture +def minus_params(operation_params: OperationParams) -> OperationParams: + """Configure operation_params for minus operation tests.""" + operation_params.operation_id = "$expected_minus_dataset" + operation_params.target = "$expected_variables" + operation_params.value = "$dataset_variables" + return operation_params + + +def test_set_difference_preserve_order(): + """Test set difference preserves order from first list.""" + assert _set_difference_preserve_order(["a", "b", "c"], ["b"]) == ["a", "c"] + assert _set_difference_preserve_order(["a", "b", "c"], []) == ["a", "b", "c"] + assert _set_difference_preserve_order(["a", "b", "c"], ["a", "b", "c"]) == [] + assert _set_difference_preserve_order(["A", "B", "C", "D"], ["B", "D"]) == [ + "A", + "C", + ] + + +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_minus_operation(minus_params: OperationParams, dataset_type): + """Test minus operation computes set difference correctly.""" + eval_dataset = dataset_type.from_dict( + { + "$expected_variables": [ + ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"], + ["STUDYID", "DOMAIN", "AESEQ", "AETERM", "AEDECOD"], + ], + "$dataset_variables": [ + ["STUDYID", "DOMAIN", "AESEQ", "AETERM"], + ["STUDYID", "DOMAIN", "AESEQ", "AETERM"], + ], + } + ) + + operation = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()) + result = operation.execute() + + expected = ["AEDECOD"] # in expected but not in dataset + assert list(result[minus_params.operation_id].iloc[0]) == expected + + +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_minus_empty_value_returns_all_of_name( + minus_params: OperationParams, dataset_type +): + """When value is empty, minus returns all of name (per Sam).""" + eval_dataset = dataset_type.from_dict( + { + "$expected_variables": [ + ["A", "B", "C"], + ["A", "B", "C"], + ], + "$dataset_variables": [ + [], + [], + ], + } + ) + + operation = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()) + result = operation.execute() + + assert list(result[minus_params.operation_id].iloc[0]) == ["A", "B", "C"] + + +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_minus_name_ref_missing_returns_empty( + minus_params: OperationParams, dataset_type +): + """When name ref is missing from dataset columns, minus returns empty list.""" + # Dataset has $dataset_variables but not $expected_variables + eval_dataset = dataset_type.from_dict( + { + "$dataset_variables": [ + ["STUDYID", "DOMAIN"], + ["STUDYID", "DOMAIN"], + ], + } + ) + + operation = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()) + result = operation.execute() + + assert list(result[minus_params.operation_id].iloc[0]) == [] From d839379d2e8c199e59c2fb81e73b08c4ab9118a8 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 17 Feb 2026 08:56:34 -0500 Subject: [PATCH 2/6] update diff issue in operations factory --- .../operations/operations_factory.py | 370 +++++++++--------- 1 file changed, 185 insertions(+), 185 deletions(-) diff --git a/cdisc_rules_engine/operations/operations_factory.py b/cdisc_rules_engine/operations/operations_factory.py index 9340f8b64..477f25f99 100644 --- a/cdisc_rules_engine/operations/operations_factory.py +++ b/cdisc_rules_engine/operations/operations_factory.py @@ -1,185 +1,185 @@ -from typing import Type - -from cdisc_rules_engine.interfaces import FactoryInterface -from cdisc_rules_engine.operations.base_operation import BaseOperation -from cdisc_rules_engine.operations.dataset_names import DatasetNames -from cdisc_rules_engine.operations.dataset_column_order import DatasetColumnOrder -from cdisc_rules_engine.operations.day_data_validator import DayDataValidator -from cdisc_rules_engine.operations.define_dictionary_version_validator import ( - DefineDictionaryVersionValidator, -) -from cdisc_rules_engine.operations.distinct import Distinct -from cdisc_rules_engine.operations.extract_metadata import ExtractMetadata -from cdisc_rules_engine.operations.get_xhtml_errors import GetXhtmlErrors -from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder -from cdisc_rules_engine.operations.library_model_column_order import ( - LibraryModelColumnOrder, -) -from cdisc_rules_engine.operations.map import Map -from cdisc_rules_engine.operations.parent_library_model_column_order import ( - ParentLibraryModelColumnOrder, -) -from cdisc_rules_engine.operations.get_codelist_attributes import ( - CodeListAttributes, -) -from cdisc_rules_engine.operations.get_model_filtered_variables import ( - LibraryModelVariablesFilter, -) -from cdisc_rules_engine.operations.max_date import MaxDate -from cdisc_rules_engine.operations.maximum import Maximum -from cdisc_rules_engine.operations.mean import Mean -from cdisc_rules_engine.operations.domain_is_custom import DomainIsCustom -from cdisc_rules_engine.operations.domain_label import DomainLabel -from cdisc_rules_engine.operations.standard_domains import StandardDomains -from cdisc_rules_engine.operations.meddra_code_references_validator import ( - MedDRACodeReferencesValidator, -) -from cdisc_rules_engine.operations.meddra_code_term_pairs_validator import ( - MedDRACodeTermPairsValidator, -) -from cdisc_rules_engine.operations.meddra_term_references_validator import ( - MedDRATermReferencesValidator, -) -from cdisc_rules_engine.operations.min_date import MinDate -from cdisc_rules_engine.operations.minus import Minus -from cdisc_rules_engine.operations.minimum import Minimum -from cdisc_rules_engine.operations.record_count import RecordCount -from cdisc_rules_engine.operations.split_by import SplitBy -from cdisc_rules_engine.operations.valid_external_dictionary_code import ( - ValidExternalDictionaryCode, -) -from cdisc_rules_engine.operations.valid_external_dictionary_code_term_pair import ( - ValidExternalDictionaryCodeTermPair, -) -from cdisc_rules_engine.operations.variable_exists import VariableExists -from cdisc_rules_engine.operations.variable_names import VariableNames -from cdisc_rules_engine.operations.variable_value_count import VariableValueCount -from cdisc_rules_engine.operations.whodrug_references_validator import ( - WhodrugReferencesValidator, -) -from cdisc_rules_engine.operations.whodrug_hierarchy_validator import ( - WhodrugHierarchyValidator, -) -from cdisc_rules_engine.operations.variable_count import VariableCount -from cdisc_rules_engine.operations.variable_is_null import VariableIsNull -from cdisc_rules_engine.operations.required_variables import RequiredVariables -from cdisc_rules_engine.operations.expected_variables import ExpectedVariables -from cdisc_rules_engine.operations.permissible_variables import PermissibleVariables -from cdisc_rules_engine.operations.study_domains import StudyDomains -from cdisc_rules_engine.operations.valid_codelist_dates import ValidCodelistDates -from cdisc_rules_engine.operations.label_referenced_variable_metadata import ( - LabelReferencedVariableMetadata, -) -from cdisc_rules_engine.operations.name_referenced_variable_metadata import ( - NameReferencedVariableMetadata, -) -from cdisc_rules_engine.operations.define_variable_metadata import ( - DefineVariableMetadata, -) -from cdisc_rules_engine.operations.valid_external_dictionary_value import ( - ValidExternalDictionaryValue, -) -from cdisc_rules_engine.operations.codelist_terms import CodelistTerms -from cdisc_rules_engine.operations.codelist_extensible import CodelistExtensible -from cdisc_rules_engine.operations.define_xml_extensible_codelists import ( - DefineCodelists, -) -from cdisc_rules_engine.operations.get_dataset_filtered_variables import ( - GetDatasetFilteredVariables, -) - - -class OperationsFactory(FactoryInterface): - _operations_map = { - "codelist_extensible": CodelistExtensible, - "codelist_terms": CodelistTerms, - "dataset_names": DatasetNames, - "define_extensible_codelists": DefineCodelists, - "distinct": Distinct, - "dy": DayDataValidator, - "extract_metadata": ExtractMetadata, - "get_column_order_from_dataset": DatasetColumnOrder, - "get_column_order_from_library": LibraryColumnOrder, - "get_codelist_attributes": CodeListAttributes, - "get_model_column_order": LibraryModelColumnOrder, - "get_model_filtered_variables": LibraryModelVariablesFilter, - "get_parent_model_column_order": ParentLibraryModelColumnOrder, - "map": Map, - "max": Maximum, - "max_date": MaxDate, - "mean": Mean, - "min": Minimum, - "min_date": MinDate, - "minus": Minus, - "record_count": RecordCount, - "valid_meddra_code_references": MedDRACodeReferencesValidator, - "valid_whodrug_references": WhodrugReferencesValidator, - "whodrug_code_hierarchy": WhodrugHierarchyValidator, - "valid_meddra_term_references": MedDRATermReferencesValidator, - "valid_meddra_code_term_pairs": MedDRACodeTermPairsValidator, - "variable_exists": VariableExists, - "variable_names": VariableNames, - "variable_value_count": VariableValueCount, - "variable_count": VariableCount, - "variable_is_null": VariableIsNull, - "domain_is_custom": DomainIsCustom, - "domain_label": DomainLabel, - "standard_domains": StandardDomains, - "required_variables": RequiredVariables, - "split_by": SplitBy, - "expected_variables": ExpectedVariables, - "permissible_variables": PermissibleVariables, - "study_domains": StudyDomains, - "valid_codelist_dates": ValidCodelistDates, - "label_referenced_variable_metadata": LabelReferencedVariableMetadata, - "name_referenced_variable_metadata": NameReferencedVariableMetadata, - "define_variable_metadata": DefineVariableMetadata, - "valid_external_dictionary_value": ValidExternalDictionaryValue, - "valid_external_dictionary_code": ValidExternalDictionaryCode, - "valid_external_dictionary_code_term_pair": ValidExternalDictionaryCodeTermPair, - "valid_define_external_dictionary_version": DefineDictionaryVersionValidator, - "get_dataset_filtered_variables": GetDatasetFilteredVariables, - "get_xhtml_errors": GetXhtmlErrors, - } - - @classmethod - def register_service(cls, name: str, operation: Type[BaseOperation]) -> None: - """ - Save mapping of operation name and it's implementation - """ - if not name: - raise ValueError("Operation name must not be empty!") - if not issubclass(operation, BaseOperation): - raise TypeError("Implementation of BaseOperation required!") - cls._operations_map[name] = operation - - def get_service( - self, - name, - **kwargs, - ) -> BaseOperation: - """Get instance of operation that matches operation specified in params""" - required_args = { - "operation_params", - "original_dataset", - "cache", - "data_service", - "library_metadata", - } - if not required_args.issubset(kwargs.keys()): - raise ValueError( - f"One of the following required key word arguments is missing: " - f"{required_args}" - ) - if name in self._operations_map: - return self._operations_map.get(name)( - kwargs.get("operation_params"), - kwargs.get("original_dataset"), - kwargs.get("cache"), - kwargs.get("data_service"), - kwargs.get("library_metadata"), - ) - raise ValueError( - f"Operation name must be in {list(self._operations_map.keys())}, " - f"given operation name is {name}" - ) +from typing import Type + +from cdisc_rules_engine.interfaces import FactoryInterface +from cdisc_rules_engine.operations.base_operation import BaseOperation +from cdisc_rules_engine.operations.dataset_names import DatasetNames +from cdisc_rules_engine.operations.dataset_column_order import DatasetColumnOrder +from cdisc_rules_engine.operations.day_data_validator import DayDataValidator +from cdisc_rules_engine.operations.define_dictionary_version_validator import ( + DefineDictionaryVersionValidator, +) +from cdisc_rules_engine.operations.distinct import Distinct +from cdisc_rules_engine.operations.extract_metadata import ExtractMetadata +from cdisc_rules_engine.operations.get_xhtml_errors import GetXhtmlErrors +from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder +from cdisc_rules_engine.operations.library_model_column_order import ( + LibraryModelColumnOrder, +) +from cdisc_rules_engine.operations.map import Map +from cdisc_rules_engine.operations.parent_library_model_column_order import ( + ParentLibraryModelColumnOrder, +) +from cdisc_rules_engine.operations.get_codelist_attributes import ( + CodeListAttributes, +) +from cdisc_rules_engine.operations.get_model_filtered_variables import ( + LibraryModelVariablesFilter, +) +from cdisc_rules_engine.operations.max_date import MaxDate +from cdisc_rules_engine.operations.maximum import Maximum +from cdisc_rules_engine.operations.mean import Mean +from cdisc_rules_engine.operations.domain_is_custom import DomainIsCustom +from cdisc_rules_engine.operations.domain_label import DomainLabel +from cdisc_rules_engine.operations.standard_domains import StandardDomains +from cdisc_rules_engine.operations.meddra_code_references_validator import ( + MedDRACodeReferencesValidator, +) +from cdisc_rules_engine.operations.meddra_code_term_pairs_validator import ( + MedDRACodeTermPairsValidator, +) +from cdisc_rules_engine.operations.meddra_term_references_validator import ( + MedDRATermReferencesValidator, +) +from cdisc_rules_engine.operations.min_date import MinDate +from cdisc_rules_engine.operations.minus import Minus +from cdisc_rules_engine.operations.minimum import Minimum +from cdisc_rules_engine.operations.record_count import RecordCount +from cdisc_rules_engine.operations.split_by import SplitBy +from cdisc_rules_engine.operations.valid_external_dictionary_code import ( + ValidExternalDictionaryCode, +) +from cdisc_rules_engine.operations.valid_external_dictionary_code_term_pair import ( + ValidExternalDictionaryCodeTermPair, +) +from cdisc_rules_engine.operations.variable_exists import VariableExists +from cdisc_rules_engine.operations.variable_names import VariableNames +from cdisc_rules_engine.operations.variable_value_count import VariableValueCount +from cdisc_rules_engine.operations.whodrug_references_validator import ( + WhodrugReferencesValidator, +) +from cdisc_rules_engine.operations.whodrug_hierarchy_validator import ( + WhodrugHierarchyValidator, +) +from cdisc_rules_engine.operations.variable_count import VariableCount +from cdisc_rules_engine.operations.variable_is_null import VariableIsNull +from cdisc_rules_engine.operations.required_variables import RequiredVariables +from cdisc_rules_engine.operations.expected_variables import ExpectedVariables +from cdisc_rules_engine.operations.permissible_variables import PermissibleVariables +from cdisc_rules_engine.operations.study_domains import StudyDomains +from cdisc_rules_engine.operations.valid_codelist_dates import ValidCodelistDates +from cdisc_rules_engine.operations.label_referenced_variable_metadata import ( + LabelReferencedVariableMetadata, +) +from cdisc_rules_engine.operations.name_referenced_variable_metadata import ( + NameReferencedVariableMetadata, +) +from cdisc_rules_engine.operations.define_variable_metadata import ( + DefineVariableMetadata, +) +from cdisc_rules_engine.operations.valid_external_dictionary_value import ( + ValidExternalDictionaryValue, +) +from cdisc_rules_engine.operations.codelist_terms import CodelistTerms +from cdisc_rules_engine.operations.codelist_extensible import CodelistExtensible +from cdisc_rules_engine.operations.define_xml_extensible_codelists import ( + DefineCodelists, +) +from cdisc_rules_engine.operations.get_dataset_filtered_variables import ( + GetDatasetFilteredVariables, +) + + +class OperationsFactory(FactoryInterface): + _operations_map = { + "codelist_extensible": CodelistExtensible, + "codelist_terms": CodelistTerms, + "dataset_names": DatasetNames, + "define_extensible_codelists": DefineCodelists, + "distinct": Distinct, + "dy": DayDataValidator, + "extract_metadata": ExtractMetadata, + "get_column_order_from_dataset": DatasetColumnOrder, + "get_column_order_from_library": LibraryColumnOrder, + "get_codelist_attributes": CodeListAttributes, + "get_model_column_order": LibraryModelColumnOrder, + "get_model_filtered_variables": LibraryModelVariablesFilter, + "get_parent_model_column_order": ParentLibraryModelColumnOrder, + "map": Map, + "max": Maximum, + "max_date": MaxDate, + "mean": Mean, + "min": Minimum, + "min_date": MinDate, + "minus": Minus, + "record_count": RecordCount, + "valid_meddra_code_references": MedDRACodeReferencesValidator, + "valid_whodrug_references": WhodrugReferencesValidator, + "whodrug_code_hierarchy": WhodrugHierarchyValidator, + "valid_meddra_term_references": MedDRATermReferencesValidator, + "valid_meddra_code_term_pairs": MedDRACodeTermPairsValidator, + "variable_exists": VariableExists, + "variable_names": VariableNames, + "variable_value_count": VariableValueCount, + "variable_count": VariableCount, + "variable_is_null": VariableIsNull, + "domain_is_custom": DomainIsCustom, + "domain_label": DomainLabel, + "standard_domains": StandardDomains, + "required_variables": RequiredVariables, + "split_by": SplitBy, + "expected_variables": ExpectedVariables, + "permissible_variables": PermissibleVariables, + "study_domains": StudyDomains, + "valid_codelist_dates": ValidCodelistDates, + "label_referenced_variable_metadata": LabelReferencedVariableMetadata, + "name_referenced_variable_metadata": NameReferencedVariableMetadata, + "define_variable_metadata": DefineVariableMetadata, + "valid_external_dictionary_value": ValidExternalDictionaryValue, + "valid_external_dictionary_code": ValidExternalDictionaryCode, + "valid_external_dictionary_code_term_pair": ValidExternalDictionaryCodeTermPair, + "valid_define_external_dictionary_version": DefineDictionaryVersionValidator, + "get_dataset_filtered_variables": GetDatasetFilteredVariables, + "get_xhtml_errors": GetXhtmlErrors, + } + + @classmethod + def register_service(cls, name: str, operation: Type[BaseOperation]) -> None: + """ + Save mapping of operation name and it's implementation + """ + if not name: + raise ValueError("Operation name must not be empty!") + if not issubclass(operation, BaseOperation): + raise TypeError("Implementation of BaseOperation required!") + cls._operations_map[name] = operation + + def get_service( + self, + name, + **kwargs, + ) -> BaseOperation: + """Get instance of operation that matches operation specified in params""" + required_args = { + "operation_params", + "original_dataset", + "cache", + "data_service", + "library_metadata", + } + if not required_args.issubset(kwargs.keys()): + raise ValueError( + f"One of the following required key word arguments is missing: " + f"{required_args}" + ) + if name in self._operations_map: + return self._operations_map.get(name)( + kwargs.get("operation_params"), + kwargs.get("original_dataset"), + kwargs.get("cache"), + kwargs.get("data_service"), + kwargs.get("library_metadata"), + ) + raise ValueError( + f"Operation name must be in {list(self._operations_map.keys())}, " + f"given operation name is {name}" + ) From 7582890de77e1ba318660bf2624af1597f1fb6a2 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 17 Feb 2026 17:13:40 -0500 Subject: [PATCH 3/6] add test cases for duplicate value and scalars and remove unnecessary comments --- cdisc_rules_engine/operations/minus.py | 5 +- tests/unit/test_operations/test_minus.py | 73 ++++++++++++++++-------- 2 files changed, 52 insertions(+), 26 deletions(-) diff --git a/cdisc_rules_engine/operations/minus.py b/cdisc_rules_engine/operations/minus.py index 76d126d0f..2412a596e 100644 --- a/cdisc_rules_engine/operations/minus.py +++ b/cdisc_rules_engine/operations/minus.py @@ -34,14 +34,13 @@ class Minus(BaseOperation): """ def _execute_operation(self): - name_ref = self.params.target # minuend (list A) - value_ref = self.params.value # subtrahend (list B) + name_ref = self.params.target + value_ref = self.params.value if not name_ref or name_ref not in self.evaluation_dataset.columns: return [] list_a = self.evaluation_dataset[name_ref].iloc[0] if not value_ref or value_ref not in self.evaluation_dataset.columns: - # listA minus empty = all of listA (per Sam) return _normalize_to_list(list_a) list_b = self.evaluation_dataset[value_ref].iloc[0] return _set_difference_preserve_order(list_a, list_b) diff --git a/tests/unit/test_operations/test_minus.py b/tests/unit/test_operations/test_minus.py index 0f97745bd..e56f6ec71 100644 --- a/tests/unit/test_operations/test_minus.py +++ b/tests/unit/test_operations/test_minus.py @@ -17,20 +17,29 @@ def minus_params(operation_params: OperationParams) -> OperationParams: return operation_params -def test_set_difference_preserve_order(): - """Test set difference preserves order from first list.""" - assert _set_difference_preserve_order(["a", "b", "c"], ["b"]) == ["a", "c"] - assert _set_difference_preserve_order(["a", "b", "c"], []) == ["a", "b", "c"] - assert _set_difference_preserve_order(["a", "b", "c"], ["a", "b", "c"]) == [] - assert _set_difference_preserve_order(["A", "B", "C", "D"], ["B", "D"]) == [ - "A", - "C", - ] +@pytest.mark.parametrize( + "list_a,list_b,expected", + [ + (["a", "b", "c"], ["b"], ["a", "c"]), + (["a", "b", "c"], [], ["a", "b", "c"]), + (["a", "b", "c"], ["a", "b", "c"], []), + (["A", "B", "C", "D"], ["B", "D"], ["A", "C"]), + (["a", "b", "b", "c"], ["b"], ["a", "c"]), + (["a", "a", "b"], ["b"], ["a", "a"]), + ("x", ["a"], ["x"]), + (["x"], "a", ["x"]), + ("a", "b", ["a"]), + (["a", "", "b"], [""], ["a", "b"]), + (["a", "", "b"], ["c"], ["a", "", "b"]), + ([""], [""], []), + ], +) +def test_set_difference_preserve_order(list_a, list_b, expected): + assert _set_difference_preserve_order(list_a, list_b) == expected @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) def test_minus_operation(minus_params: OperationParams, dataset_type): - """Test minus operation computes set difference correctly.""" eval_dataset = dataset_type.from_dict( { "$expected_variables": [ @@ -46,16 +55,13 @@ def test_minus_operation(minus_params: OperationParams, dataset_type): operation = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()) result = operation.execute() - - expected = ["AEDECOD"] # in expected but not in dataset - assert list(result[minus_params.operation_id].iloc[0]) == expected + assert list(result[minus_params.operation_id].iloc[0]) == ["AEDECOD"] @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) def test_minus_empty_value_returns_all_of_name( minus_params: OperationParams, dataset_type ): - """When value is empty, minus returns all of name (per Sam).""" eval_dataset = dataset_type.from_dict( { "$expected_variables": [ @@ -75,22 +81,43 @@ def test_minus_empty_value_returns_all_of_name( assert list(result[minus_params.operation_id].iloc[0]) == ["A", "B", "C"] +@pytest.mark.parametrize( + "expected_vars,dataset_vars,expected", + [ + ([["a", "b", "b", "c"], ["a", "b", "b", "c"]], [["b"], ["b"]], ["a", "c"]), + ([["a", "a", "b"], ["a", "a", "b"]], [["b"], ["b"]], ["a", "a"]), + ([["a", "", "b"], ["a", "", "b"]], [[""], [""]], ["a", "b"]), + ([["a", "", "b"], ["a", "", "b"]], [["c"], ["c"]], ["a", "", "b"]), + ([[""], [""]], [[""], [""]], []), + (["x", "y"], [["a"], ["a"]], ["x"]), + ], +) @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) -def test_minus_name_ref_missing_returns_empty( - minus_params: OperationParams, dataset_type +def test_minus_operation_edge_cases( + minus_params: OperationParams, + dataset_type, + expected_vars, + dataset_vars, + expected, ): - """When name ref is missing from dataset columns, minus returns empty list.""" - # Dataset has $dataset_variables but not $expected_variables eval_dataset = dataset_type.from_dict( { - "$dataset_variables": [ - ["STUDYID", "DOMAIN"], - ["STUDYID", "DOMAIN"], - ], + "$expected_variables": expected_vars, + "$dataset_variables": dataset_vars, } ) - operation = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()) result = operation.execute() + assert list(result[minus_params.operation_id].iloc[0]) == expected + +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_minus_name_ref_missing_returns_empty( + minus_params: OperationParams, dataset_type +): + eval_dataset = dataset_type.from_dict( + {"$dataset_variables": [["STUDYID", "DOMAIN"], ["STUDYID", "DOMAIN"]]} + ) + operation = Minus(minus_params, eval_dataset, MagicMock(), MagicMock()) + result = operation.execute() assert list(result[minus_params.operation_id].iloc[0]) == [] From b9d9a2c5aa63839dc154b74b6cb6da1e9757d672 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Fri, 20 Feb 2026 11:09:24 -0500 Subject: [PATCH 4/6] minus: rename second operand to subtract, tighten Operations.md per review --- cdisc_rules_engine/models/operation_params.py | 1 + cdisc_rules_engine/operations/minus.py | 16 ++++++++-------- cdisc_rules_engine/utilities/rule_processor.py | 1 + resources/schema/rule/Operations.json | 5 ++++- resources/schema/rule/Operations.md | 16 ++-------------- tests/unit/test_operations/test_minus.py | 4 ++-- 6 files changed, 18 insertions(+), 25 deletions(-) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 53ef889cc..4086ea5a1 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -57,6 +57,7 @@ class OperationParams: regex: str = None returntype: str = None target: str = None + subtract: str = None value: str = None value_is_reference: bool = False namespace: str = None diff --git a/cdisc_rules_engine/operations/minus.py b/cdisc_rules_engine/operations/minus.py index 2412a596e..71061b3f1 100644 --- a/cdisc_rules_engine/operations/minus.py +++ b/cdisc_rules_engine/operations/minus.py @@ -1,6 +1,6 @@ """ -Set difference operation: name minus value. -Returns elements in name that are not in value, preserving order from name. +Set difference operation: name minus subtract. +Returns elements in name that are not in subtract, preserving order from name. """ from cdisc_rules_engine.operations.base_operation import BaseOperation @@ -28,19 +28,19 @@ def _set_difference_preserve_order(list_a: list, list_b: list) -> list: class Minus(BaseOperation): """ - Operation that computes set difference: name minus value. - name (minuend) and value (subtrahend) reference other operation results. - Returns elements in name that are not in value. + Operation that computes set difference: name minus subtract. + name (minuend) and subtract (subtrahend) reference other operation results. + Returns elements in name that are not in subtract. """ def _execute_operation(self): name_ref = self.params.target - value_ref = self.params.value + subtract_ref = self.params.subtract if not name_ref or name_ref not in self.evaluation_dataset.columns: return [] list_a = self.evaluation_dataset[name_ref].iloc[0] - if not value_ref or value_ref not in self.evaluation_dataset.columns: + if not subtract_ref or subtract_ref not in self.evaluation_dataset.columns: return _normalize_to_list(list_a) - list_b = self.evaluation_dataset[value_ref].iloc[0] + list_b = self.evaluation_dataset[subtract_ref].iloc[0] return _set_difference_preserve_order(list_a, list_b) diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index b0203274a..570560660 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -391,6 +391,7 @@ def perform_rule_operations( operation_id=operation.get("id"), operation_name=operation.get("operator"), original_target=original_target, + subtract=operation.get("subtract"), value=operation.get("value"), regex=operation.get("regex"), returntype=operation.get("returntype"), diff --git a/resources/schema/rule/Operations.json b/resources/schema/rule/Operations.json index 934299ff7..60134f030 100644 --- a/resources/schema/rule/Operations.json +++ b/resources/schema/rule/Operations.json @@ -232,7 +232,7 @@ "const": "minus" } }, - "required": ["id", "operator", "name", "value"], + "required": ["id", "operator", "name", "subtract"], "type": "object" }, { @@ -617,6 +617,9 @@ "type": "string", "enum": ["code", "value", "pref_term"] }, + "subtract": { + "type": "string" + }, "term_value": { "type": "string" }, diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md index f6e4f9e90..14646e18c 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -752,9 +752,7 @@ Operations: ### minus -Computes set difference: elements in `name` that are not in `value`. Uses [set difference]() semantics (A ∖ B). Preserves order from the first list. Both `name` and `value` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). - -When `value` is empty or missing, returns all elements from `name`. +Computes set difference: elements in `name` that are not in `subtract`. Uses [set difference]() semantics (A ∖ B). Preserves order from the first list. Both `name` and `subtract` must reference other operation results (e.g., `$expected_variables`, `$dataset_variables`). When `subtract` is empty or missing, returns all elements from `name`. Can be computed and added to output variables to display missing elements in error results. ```yaml Operations: @@ -765,17 +763,7 @@ Operations: - id: $expected_minus_dataset name: $expected_variables operator: minus - value: $dataset_variables -Check: - all: - - name: $expected_minus_dataset - operator: non_empty -Outcome: - Message: At least one expected variable is missing from dataset - Output Variables: - - $dataset_variables - - $expected_variables - - $expected_minus_dataset + subtract: $dataset_variables ``` ### label_referenced_variable_metadata diff --git a/tests/unit/test_operations/test_minus.py b/tests/unit/test_operations/test_minus.py index e56f6ec71..6969ea99b 100644 --- a/tests/unit/test_operations/test_minus.py +++ b/tests/unit/test_operations/test_minus.py @@ -13,7 +13,7 @@ def minus_params(operation_params: OperationParams) -> OperationParams: """Configure operation_params for minus operation tests.""" operation_params.operation_id = "$expected_minus_dataset" operation_params.target = "$expected_variables" - operation_params.value = "$dataset_variables" + operation_params.subtract = "$dataset_variables" return operation_params @@ -59,7 +59,7 @@ def test_minus_operation(minus_params: OperationParams, dataset_type): @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) -def test_minus_empty_value_returns_all_of_name( +def test_minus_empty_subtract_returns_all_of_name( minus_params: OperationParams, dataset_type ): eval_dataset = dataset_type.from_dict( From 4b81bafee50b470e30ffa57b7b1c7da718f15f26 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 24 Feb 2026 09:40:04 -0500 Subject: [PATCH 5/6] Remove operation param value --- cdisc_rules_engine/models/operation_params.py | 1 - cdisc_rules_engine/utilities/rule_processor.py | 1 - resources/schema/rule/Operations.json | 3 --- 3 files changed, 5 deletions(-) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 4086ea5a1..e0d75454f 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -58,7 +58,6 @@ class OperationParams: returntype: str = None target: str = None subtract: str = None - value: str = None value_is_reference: bool = False namespace: str = None delimiter: str = None diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 570560660..0000d0651 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -392,7 +392,6 @@ def perform_rule_operations( operation_name=operation.get("operator"), original_target=original_target, subtract=operation.get("subtract"), - value=operation.get("value"), regex=operation.get("regex"), returntype=operation.get("returntype"), standard=standard, diff --git a/resources/schema/rule/Operations.json b/resources/schema/rule/Operations.json index 60134f030..7ce4494d5 100644 --- a/resources/schema/rule/Operations.json +++ b/resources/schema/rule/Operations.json @@ -629,9 +629,6 @@ "term_pref_term": { "type": "string" }, - "value": { - "type": "string" - }, "value_is_reference": { "type": "boolean" }, From c4b359c180b3f7216124d8e2c865e96fc6545aa0 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 25 Feb 2026 12:32:22 -0500 Subject: [PATCH 6/6] Adding the subtract operation parameter to the engine's check_parameter.md (from editor PR #293) --- resources/schema/rule/check_parameter.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/resources/schema/rule/check_parameter.md b/resources/schema/rule/check_parameter.md index 2f227d87d..8eb7c2984 100644 --- a/resources/schema/rule/check_parameter.md +++ b/resources/schema/rule/check_parameter.md @@ -1,4 +1,3 @@ - ## Overview Check parameters are configuration elements that define how validation rules are applied within the CDISC rules engine. These parameters control the behavior, scope, and criteria for data validation checks across clinical trial datasets. Each parameter serves a specific purpose in customizing rule logic to ensure data integrity and compliance with CDISC standards. @@ -267,6 +266,17 @@ Expected return type for operation results. Valid values are "code" (for NCI cod Either "submission" or "evaluation" for which dataset to check the variable_is_null from. Evaluation is the dataset constructed by the rule type while submission is the raw dataset submitted that is being evaluated. +### subtract + +Reference to another operation result, used as the second operand in operations that take two inputs. For example, in the `minus` operation, `name` references the minuend (first list) and `subtract` references the subtrahend (second list); the operation returns elements in the first list that are not in the second. + +```yaml +- id: $expected_minus_dataset + name: $expected_variables + operator: minus + subtract: $dataset_variables +``` + ### term_code Terminology code value used in controlled terminology operations for code-based lookups.