From cdee5ba2adf9035a69dc6aff20f752388541f583 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 17 Nov 2025 12:26:16 -0500 Subject: [PATCH 1/8] Add domain_suffix extraction and standard_domains operation to enable DOMAIN substring comparison --- .../operations/extract_metadata.py | 22 +++ .../operations/operations_factory.py | 2 + .../operations/standard_domains.py | 17 +++ resources/schema/Operations.md | 43 ++++++ .../test_operations/test_extract_metadata.py | 126 +++++++++++++++++- .../test_operations/test_standard_domains.py | 96 +++++++++++++ 6 files changed, 305 insertions(+), 1 deletion(-) create mode 100644 cdisc_rules_engine/operations/standard_domains.py create mode 100644 tests/unit/test_operations/test_standard_domains.py diff --git a/cdisc_rules_engine/operations/extract_metadata.py b/cdisc_rules_engine/operations/extract_metadata.py index b9c179a23..31096a8ac 100644 --- a/cdisc_rules_engine/operations/extract_metadata.py +++ b/cdisc_rules_engine/operations/extract_metadata.py @@ -9,6 +9,28 @@ def _execute_operation(self): dataset_name=self.params.dataset_path, datasets=self.params.datasets ) + if self.params.target == "domain_suffix": + return self._extract_domain_suffix(metadata) + # extract target value. Metadata df always has one row target_value = metadata.get(self.params.target, pd.Series())[0] return target_value + + def _extract_domain_suffix(self, metadata: pd.DataFrame) -> str: + dataset_name = metadata.get("dataset_name", pd.Series()) + if dataset_name.empty: + return "" + + name = dataset_name.iloc[0] + if isinstance(name, str) and len(name) >= 4 and name.startswith("AP"): + return name[2:4] + + raw_metadata = self.data_service.get_raw_dataset_metadata( + dataset_name=self.params.dataset_path, datasets=self.params.datasets + ) + if raw_metadata and raw_metadata.first_record: + domain = raw_metadata.first_record.get("DOMAIN", "") + if isinstance(domain, str) and len(domain) >= 4 and domain.startswith("AP"): + return domain[2:4] + + return "" diff --git a/cdisc_rules_engine/operations/operations_factory.py b/cdisc_rules_engine/operations/operations_factory.py index 801df7a08..d943144d1 100644 --- a/cdisc_rules_engine/operations/operations_factory.py +++ b/cdisc_rules_engine/operations/operations_factory.py @@ -30,6 +30,7 @@ from cdisc_rules_engine.operations.mean import Mean from cdisc_rules_engine.operations.domain_is_custom import DomainIsCustom from cdisc_rules_engine.operations.domain_label import DomainLabel +from cdisc_rules_engine.operations.standard_domains import StandardDomains from cdisc_rules_engine.operations.meddra_code_references_validator import ( MedDRACodeReferencesValidator, ) @@ -121,6 +122,7 @@ class OperationsFactory(FactoryInterface): "variable_is_null": VariableIsNull, "domain_is_custom": DomainIsCustom, "domain_label": DomainLabel, + "standard_domains": StandardDomains, "required_variables": RequiredVariables, "split_by": SplitBy, "expected_variables": ExpectedVariables, diff --git a/cdisc_rules_engine/operations/standard_domains.py b/cdisc_rules_engine/operations/standard_domains.py new file mode 100644 index 000000000..5846dfa46 --- /dev/null +++ b/cdisc_rules_engine/operations/standard_domains.py @@ -0,0 +1,17 @@ +from cdisc_rules_engine.operations.base_operation import BaseOperation + + +class StandardDomains(BaseOperation): + def _execute_operation(self): + standard_data: dict = self.library_metadata.standard_metadata + domains = standard_data.get("domains", set()) + if isinstance(domains, set): + return sorted(list(domains)) + elif isinstance(domains, (list, tuple)): + return sorted(list(domains)) + elif domains is None: + return [] + raise TypeError( + f"Invalid type for 'domains' in standard_metadata: " + f"expected set, list, or tuple, got {type(domains).__name__}" + ) diff --git a/resources/schema/Operations.md b/resources/schema/Operations.md index 6214da388..8ca0e92b9 100644 --- a/resources/schema/Operations.md +++ b/resources/schema/Operations.md @@ -485,6 +485,28 @@ Output Laboratory Test Results ``` +### standard_domains + +Returns a list of valid SDTM domain names from the standard metadata. This can be used to compare extracted suffixes from DOMAIN values or dataset names. + +Input + +Product: sdtmig + +Version: 3-4 + +```yaml +Operations: + - operator: standard_domains + id: $valid_domain_names +``` + +Output + +``` +["AE", "CM", "DM", "FA", "LB", "QS", ...] +``` + ### extract_metadata Returns the requested dataset level metadata value for the current dataset. Possible name values are: @@ -493,6 +515,7 @@ Returns the requested dataset level metadata value for the current dataset. Poss - dataset_location - dataset_name - dataset_label +- domain_suffix Example @@ -512,6 +535,26 @@ Output: Laboratory Test Results ``` +Example: domain_suffix + +Extracts the domain suffix (characters 3-4) from AP-related domains. For example, "FA" from "APFA" dataset name or DOMAIN value. + +Input: + +Target domain: APFA + +```yaml +- name: domain_suffix + operator: extract_metadata + id: $domain_suffix +``` + +Output: + +``` +FA +``` + ## IG & Model Variable Operations Operations for working with Implementation Guide and model variable metadata. diff --git a/tests/unit/test_operations/test_extract_metadata.py b/tests/unit/test_operations/test_extract_metadata.py index c5f4dbe1f..070ff90f0 100644 --- a/tests/unit/test_operations/test_extract_metadata.py +++ b/tests/unit/test_operations/test_extract_metadata.py @@ -2,10 +2,11 @@ from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.extract_metadata import ExtractMetadata from cdisc_rules_engine.services.cache import InMemoryCacheService from cdisc_rules_engine.services.data_services import LocalDataService -from unittest.mock import Mock +from unittest.mock import Mock, MagicMock import pytest @@ -41,3 +42,126 @@ def test_extract_metadata_get_dataset_name( assert operation_params.operation_id in result for item in result[operation_params.operation_id]: assert item == "AE" + + +def _create_mock_service(dataset_name, first_record=None): + mock_service = Mock(LocalDataService) + mock_service.get_dataset_metadata.return_value = pd.DataFrame.from_dict( + {"dataset_name": [dataset_name]} + ) + if first_record is not None: + raw_metadata = SDTMDatasetMetadata( + name=dataset_name, + first_record=first_record, + ) + mock_service.get_raw_dataset_metadata = MagicMock(return_value=raw_metadata) + return mock_service + + +@pytest.mark.parametrize( + "dataset_name, first_record, expected_suffix", + [ + ("APFA", None, "FA"), + ("APXX", None, "XX"), + ("APLB", None, "LB"), + ("", {"DOMAIN": "APFA"}, "FA"), + ("AE", {"DOMAIN": "APFA"}, "FA"), + ("AP", {"DOMAIN": "APFA"}, "FA"), + ("APF", {"DOMAIN": "APFA"}, "FA"), + ], +) +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_extract_metadata_domain_suffix_valid_cases( + operation_params: OperationParams, + dataset_type, + dataset_name, + first_record, + expected_suffix, +): + mock_data_service = _create_mock_service(dataset_name, first_record) + operation_params.dataframe = dataset_type.from_dict( + {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APFA"]} + ) + operation_params.target = "domain_suffix" + cache = InMemoryCacheService.get_instance() + operation = ExtractMetadata( + operation_params, operation_params.dataframe, cache, mock_data_service + ) + result = operation.execute() + assert operation_params.operation_id in result + assert all( + item == expected_suffix for item in result[operation_params.operation_id] + ) + + +@pytest.mark.parametrize( + "dataset_name, first_record", + [ + ("AE", None), + ("LB", None), + ("AP", None), + ("APF", None), + ("AE", {"DOMAIN": "AE"}), + ("AE", {"DOMAIN": "LB"}), + ("AE", {"DOMAIN": "AP"}), + ("AE", {"DOMAIN": ""}), + ("AE", {"DOMAIN": None}), + ("AE", None), + ], +) +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_extract_metadata_domain_suffix_returns_empty_for_invalid( + operation_params: OperationParams, + dataset_type, + dataset_name, + first_record, +): + mock_data_service = _create_mock_service(dataset_name, first_record) + operation_params.dataframe = dataset_type.from_dict( + {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["AE"]} + ) + operation_params.target = "domain_suffix" + cache = InMemoryCacheService.get_instance() + operation = ExtractMetadata( + operation_params, operation_params.dataframe, cache, mock_data_service + ) + result = operation.execute() + assert operation_params.operation_id in result + assert all(item == "" for item in result[operation_params.operation_id]) + + +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_extract_metadata_domain_suffix_dataset_name_priority( + operation_params: OperationParams, dataset_type +): + mock_data_service = _create_mock_service("APFA", {"DOMAIN": "APXX"}) + operation_params.dataframe = dataset_type.from_dict( + {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APXX"]} + ) + operation_params.target = "domain_suffix" + cache = InMemoryCacheService.get_instance() + operation = ExtractMetadata( + operation_params, operation_params.dataframe, cache, mock_data_service + ) + result = operation.execute() + assert operation_params.operation_id in result + assert all(item == "FA" for item in result[operation_params.operation_id]) + + +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_extract_metadata_domain_suffix_empty_metadata( + operation_params: OperationParams, dataset_type +): + mock_data_service = Mock(LocalDataService) + mock_data_service.get_dataset_metadata.return_value = pd.DataFrame() + operation_params.dataframe = dataset_type.from_dict( + {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APFA"]} + ) + operation_params.target = "domain_suffix" + cache = InMemoryCacheService.get_instance() + operation = ExtractMetadata( + operation_params, operation_params.dataframe, cache, mock_data_service + ) + result = operation.execute() + assert operation_params.operation_id in result + assert all(item == "" for item in result[operation_params.operation_id]) diff --git a/tests/unit/test_operations/test_standard_domains.py b/tests/unit/test_operations/test_standard_domains.py new file mode 100644 index 000000000..605eba7ca --- /dev/null +++ b/tests/unit/test_operations/test_standard_domains.py @@ -0,0 +1,96 @@ +from cdisc_rules_engine.config.config import ConfigService +from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset +from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset +from cdisc_rules_engine.models.library_metadata_container import ( + LibraryMetadataContainer, +) +import pytest +from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.operations.standard_domains import StandardDomains +from cdisc_rules_engine.services.cache import InMemoryCacheService +from cdisc_rules_engine.services.data_services import LocalDataService + + +def _create_operation(operation_params, standard_metadata, dataset_type): + operation_params.dataframe = dataset_type.from_dict( + {"STUDYID": ["TEST_STUDY"], "AETERM": ["test"]} + ) + operation_params.standard = "sdtmig" + operation_params.standard_version = "3-4" + cache = InMemoryCacheService.get_instance() + library_metadata = LibraryMetadataContainer(standard_metadata=standard_metadata) + data_service = LocalDataService.get_instance( + cache_service=cache, config=ConfigService() + ) + return StandardDomains( + operation_params, + operation_params.dataframe, + cache, + data_service, + library_metadata, + ) + + +@pytest.mark.parametrize( + "domains_input, expected_domains", + [ + ({"AE", "FA", "LB", "QS", "CM", "DM"}, ["AE", "CM", "DM", "FA", "LB", "QS"]), + (["AE", "FA", "LB"], ["AE", "FA", "LB"]), + (("AE", "FA", "LB"), ["AE", "FA", "LB"]), + (["QS", "AE", "FA", "LB", "CM"], ["AE", "CM", "FA", "LB", "QS"]), + (set(), []), + ([], []), + ], +) +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_standard_domains_returns_sorted_list( + operation_params: OperationParams, + dataset_type, + domains_input, + expected_domains, +): + standard_metadata = {"domains": domains_input} + operation = _create_operation(operation_params, standard_metadata, dataset_type) + result = operation.execute() + domain_list = result[operation_params.operation_id].iloc[0] + assert domain_list == expected_domains + + +@pytest.mark.parametrize( + "standard_metadata, expected_length", + [ + ({}, 0), + ({"domains": None}, 0), + ], +) +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_standard_domains_handles_missing_or_none_domains( + operation_params: OperationParams, + dataset_type, + standard_metadata, + expected_length, +): + operation = _create_operation(operation_params, standard_metadata, dataset_type) + result = operation.execute() + domain_list = result[operation_params.operation_id].iloc[0] + assert isinstance(domain_list, list) + assert len(domain_list) == expected_length + + +@pytest.mark.parametrize( + "standard_metadata", + [ + ({"domains": {}}), + ({"domains": 123}), + ({"domains": "invalid"}), + ], +) +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_standard_domains_raises_error_for_invalid_type( + operation_params: OperationParams, + dataset_type, + standard_metadata, +): + operation = _create_operation(operation_params, standard_metadata, dataset_type) + with pytest.raises(TypeError): + operation.execute() From 646813bb4fa474c35fe1f7b7b71a485aceae4813 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 17 Nov 2025 12:43:35 -0500 Subject: [PATCH 2/8] Schema Update fix --- resources/schema/Operations.json | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/resources/schema/Operations.json b/resources/schema/Operations.json index 7e9013d75..549a59905 100644 --- a/resources/schema/Operations.json +++ b/resources/schema/Operations.json @@ -255,6 +255,13 @@ "required": ["id", "operator"], "type": "object" }, + { + "properties": { + "operator": { "const": "standard_domains" } + }, + "required": ["id", "operator"], + "type": "object" + }, { "properties": { "operator": { From 103c77759f1c3bfb265fb0619a910a68fa32d054 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 24 Nov 2025 20:06:44 -0500 Subject: [PATCH 3/8] Add AP domain suffix extraction and standard_domains operation for CG0663 --- .../content_metadata_dataset_builder.py | 2 + .../contents_define_dataset_builder.py | 2 + ...dataset_metadata_define_dataset_builder.py | 4 ++ .../dataset_metadata_values_builder.py | 2 + .../models/sdtm_dataset_metadata.py | 51 +++++++++++++------ .../operations/extract_metadata.py | 26 ++-------- .../operations/standard_domains.py | 4 +- .../data_services/base_data_service.py | 3 ++ resources/schema/Operations.md | 12 +++-- .../test_operations/test_extract_metadata.py | 31 ++++++----- tests/unit/test_utilities/test_utils.py | 47 +++++++++++++++++ 11 files changed, 124 insertions(+), 60 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py index a59d544b3..f79e05f65 100644 --- a/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/content_metadata_dataset_builder.py @@ -9,6 +9,8 @@ def build(self): dataset_location - Path to file dataset_name - Name of the dataset dataset_label - Label for the dataset + is_ap - Whether the domain is an AP domain + ap_suffix - The 2-character suffix from AP domains """ size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) return self.data_service.get_dataset_metadata( diff --git a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py index 306a735e1..b048e3a0e 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py @@ -15,6 +15,8 @@ def build(self): dataset_name - Name of the dataset dataset_size - File size dataset_domain - Domain of the dataset + is_ap - Whether the domain is an AP domain + ap_suffix - The 2-character suffix from AP domains define_dataset_class - dataset class define_dataset_domain - dataset domain from define define_dataset_is_non_standard - whether a dataset is a standard diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index 177ed5c5c..30bb75382 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -16,6 +16,8 @@ def build(self): dataset_name - Name of the dataset dataset_label - Label for the dataset dataset_domain - Domain of the dataset + is_ap - Whether the domain is an AP domain + ap_suffix - The 2-character suffix from AP domains Columns from Define XML: define_dataset_name - dataset name from define_xml @@ -85,6 +87,8 @@ def _get_dataset_dataframe(self): "dataset_name", "dataset_label", "dataset_domain", + "is_ap", + "ap_suffix", ] if len(self.datasets) == 0: diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py index 202bbf31d..8fb48d9f3 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_values_builder.py @@ -16,6 +16,8 @@ def build(self): - dataset_location - Path to file - dataset_name - Name of the dataset - dataset_label - Label for the dataset + - is_ap - Whether the domain is an AP domain + - ap_suffix - The 2-character suffix from AP domains """ size_unit: str = self.rule_processor.get_size_unit_from_rule(self.rule) dataset_metadata = self.data_service.get_dataset_metadata( diff --git a/cdisc_rules_engine/models/sdtm_dataset_metadata.py b/cdisc_rules_engine/models/sdtm_dataset_metadata.py index bcec8c8d3..5df57d732 100644 --- a/cdisc_rules_engine/models/sdtm_dataset_metadata.py +++ b/cdisc_rules_engine/models/sdtm_dataset_metadata.py @@ -12,21 +12,21 @@ class SDTMDatasetMetadata(DatasetMetadata): """ Examples - | name | unsplit_name | is_supp | domain | rdomain | - | -------- | ------------ | ------- | ------ | ------- | - | QS | QS | False | QS | None | - | QSX | QS | False | QS | None | - | QSXX | QS | False | QS | None | - | SUPPQS | SUPPQS | True | None | QS | - | SUPPQSX | SUPPQS | True | None | QS | - | SUPPQSXX | SUPPQS | True | None | QS | - | APQS | APQS | False | APQS | None | - | APQSX | APQS | False | APQS | None | - | APQSXX | APQS | False | APQS | None | - | SQAPQS | SQAPQS | True | None | APQS | - | SQAPQSX | SQAPQS | True | None | APQS | - | SQAPQSXX | SQAPQS | True | None | APQS | - | RELREC | RELREC | False | None | None | + | name | unsplit_name | is_supp | domain | rdomain | is_ap | ap_suffix | + | -------- | ------------ | ------- | ------ | ------- | ----- | --------- | + | QS | QS | False | QS | None | False | | + | QSX | QS | False | QS | None | False | | + | QSXX | QS | False | QS | None | False | | + | SUPPQS | SUPPQS | True | None | QS | False | | + | SUPPQSX | SUPPQS | True | None | QS | False | | + | SUPPQSXX | SUPPQS | True | None | QS | False | | + | APQS | APQS | False | APQS | None | True | QS | + | APQSX | APQS | False | APQS | None | True | QS | + | APQSXX | APQS | False | APQS | None | True | QS | + | SQAPQS | SQAPQS | True | None | APQS | False | | + | SQAPQSX | SQAPQS | True | None | APQS | False | | + | SQAPQSXX | SQAPQS | True | None | APQS | False | | + | RELREC | RELREC | False | None | None | False | | """ @property @@ -57,3 +57,24 @@ def unsplit_name(self) -> str: @property def is_split(self) -> bool: return self.name != self.unsplit_name + + @property + def is_ap(self) -> bool: + """ + Returns true if domain starts with AP and is at least 4 characters long + """ + return ( + isinstance(self.domain, str) + and len(self.domain) >= 4 + and self.domain.startswith("AP") + ) + + @property + def ap_suffix(self) -> str: + """ + Returns the 2-character suffix (characters 3-4) from AP domains. + Returns empty string if not an AP domain. + """ + if self.is_ap: + return self.domain[2:4] + return "" diff --git a/cdisc_rules_engine/operations/extract_metadata.py b/cdisc_rules_engine/operations/extract_metadata.py index 31096a8ac..1108cbc4a 100644 --- a/cdisc_rules_engine/operations/extract_metadata.py +++ b/cdisc_rules_engine/operations/extract_metadata.py @@ -9,28 +9,12 @@ def _execute_operation(self): dataset_name=self.params.dataset_path, datasets=self.params.datasets ) - if self.params.target == "domain_suffix": - return self._extract_domain_suffix(metadata) + if self.params.target in ("ap_suffix", "ap_domain_suffix", "domain_suffix"): + raw_metadata = self.data_service.get_raw_dataset_metadata( + dataset_name=self.params.dataset_path, datasets=self.params.datasets + ) + return raw_metadata.ap_suffix # extract target value. Metadata df always has one row target_value = metadata.get(self.params.target, pd.Series())[0] return target_value - - def _extract_domain_suffix(self, metadata: pd.DataFrame) -> str: - dataset_name = metadata.get("dataset_name", pd.Series()) - if dataset_name.empty: - return "" - - name = dataset_name.iloc[0] - if isinstance(name, str) and len(name) >= 4 and name.startswith("AP"): - return name[2:4] - - raw_metadata = self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets - ) - if raw_metadata and raw_metadata.first_record: - domain = raw_metadata.first_record.get("DOMAIN", "") - if isinstance(domain, str) and len(domain) >= 4 and domain.startswith("AP"): - return domain[2:4] - - return "" diff --git a/cdisc_rules_engine/operations/standard_domains.py b/cdisc_rules_engine/operations/standard_domains.py index 5846dfa46..43b6d453b 100644 --- a/cdisc_rules_engine/operations/standard_domains.py +++ b/cdisc_rules_engine/operations/standard_domains.py @@ -5,9 +5,7 @@ class StandardDomains(BaseOperation): def _execute_operation(self): standard_data: dict = self.library_metadata.standard_metadata domains = standard_data.get("domains", set()) - if isinstance(domains, set): - return sorted(list(domains)) - elif isinstance(domains, (list, tuple)): + if isinstance(domains, (set, list, tuple)): return sorted(list(domains)) elif domains is None: return [] diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index c88109ffc..ef2ceeee3 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -223,6 +223,9 @@ def get_dataset_metadata( "dataset_name": [dataset_metadata.name], "dataset_label": [dataset_metadata.label], "record_count": [dataset_metadata.record_count], + "is_ap": [dataset_metadata.is_ap], + "ap_suffix": [dataset_metadata.ap_suffix], + "domain": [dataset_metadata.domain], } return self.dataset_implementation.from_dict(metadata_to_return) diff --git a/resources/schema/Operations.md b/resources/schema/Operations.md index 8ca0e92b9..290635f45 100644 --- a/resources/schema/Operations.md +++ b/resources/schema/Operations.md @@ -515,7 +515,9 @@ Returns the requested dataset level metadata value for the current dataset. Poss - dataset_location - dataset_name - dataset_label -- domain_suffix +- domain +- is_ap +- ap_suffix Example @@ -535,18 +537,18 @@ Output: Laboratory Test Results ``` -Example: domain_suffix +Example: ap_suffix -Extracts the domain suffix (characters 3-4) from AP-related domains. For example, "FA" from "APFA" dataset name or DOMAIN value. +Extracts the domain suffix (characters 3-4) from AP-related domains. For example, "FA" from "APFA" DOMAIN value. Input: Target domain: APFA ```yaml -- name: domain_suffix +- name: ap_suffix operator: extract_metadata - id: $domain_suffix + id: $ap_suffix ``` Output: diff --git a/tests/unit/test_operations/test_extract_metadata.py b/tests/unit/test_operations/test_extract_metadata.py index 070ff90f0..6dcc3c93d 100644 --- a/tests/unit/test_operations/test_extract_metadata.py +++ b/tests/unit/test_operations/test_extract_metadata.py @@ -49,21 +49,20 @@ def _create_mock_service(dataset_name, first_record=None): mock_service.get_dataset_metadata.return_value = pd.DataFrame.from_dict( {"dataset_name": [dataset_name]} ) - if first_record is not None: - raw_metadata = SDTMDatasetMetadata( - name=dataset_name, - first_record=first_record, - ) - mock_service.get_raw_dataset_metadata = MagicMock(return_value=raw_metadata) + raw_metadata = SDTMDatasetMetadata( + name=dataset_name, + first_record=first_record, + ) + mock_service.get_raw_dataset_metadata = MagicMock(return_value=raw_metadata) return mock_service @pytest.mark.parametrize( "dataset_name, first_record, expected_suffix", [ - ("APFA", None, "FA"), - ("APXX", None, "XX"), - ("APLB", None, "LB"), + ("APFA", None, ""), + ("APXX", None, ""), + ("APLB", None, ""), ("", {"DOMAIN": "APFA"}, "FA"), ("AE", {"DOMAIN": "APFA"}, "FA"), ("AP", {"DOMAIN": "APFA"}, "FA"), @@ -82,7 +81,7 @@ def test_extract_metadata_domain_suffix_valid_cases( operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APFA"]} ) - operation_params.target = "domain_suffix" + operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( operation_params, operation_params.dataframe, cache, mock_data_service @@ -120,7 +119,7 @@ def test_extract_metadata_domain_suffix_returns_empty_for_invalid( operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["AE"]} ) - operation_params.target = "domain_suffix" + operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( operation_params, operation_params.dataframe, cache, mock_data_service @@ -131,33 +130,33 @@ def test_extract_metadata_domain_suffix_returns_empty_for_invalid( @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) -def test_extract_metadata_domain_suffix_dataset_name_priority( +def test_extract_metadata_domain_suffix_uses_domain( operation_params: OperationParams, dataset_type ): mock_data_service = _create_mock_service("APFA", {"DOMAIN": "APXX"}) operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APXX"]} ) - operation_params.target = "domain_suffix" + operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( operation_params, operation_params.dataframe, cache, mock_data_service ) result = operation.execute() assert operation_params.operation_id in result - assert all(item == "FA" for item in result[operation_params.operation_id]) + assert all(item == "XX" for item in result[operation_params.operation_id]) @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) def test_extract_metadata_domain_suffix_empty_metadata( operation_params: OperationParams, dataset_type ): - mock_data_service = Mock(LocalDataService) + mock_data_service = _create_mock_service("APFA", None) mock_data_service.get_dataset_metadata.return_value = pd.DataFrame() operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APFA"]} ) - operation_params.target = "domain_suffix" + operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() operation = ExtractMetadata( operation_params, operation_params.dataframe, cache, mock_data_service diff --git a/tests/unit/test_utilities/test_utils.py b/tests/unit/test_utilities/test_utils.py index fd32b9ac7..50b3a8572 100644 --- a/tests/unit/test_utilities/test_utils.py +++ b/tests/unit/test_utilities/test_utils.py @@ -45,6 +45,53 @@ def test_is_supp_dataset(mock_dataset, expected): ), f"Expected {expected} but got {result} for datasets {mock_datasets}" +is_ap_tests = [ + ({"first_record": {"DOMAIN": "APFA"}}, True), + ({"first_record": {"DOMAIN": "APXX"}}, True), + ({"first_record": {"DOMAIN": "APQS"}}, True), + ({"first_record": {"DOMAIN": "APFAMH"}}, True), + ({"first_record": {"DOMAIN": "AE"}}, False), + ({"first_record": {"DOMAIN": "LB"}}, False), + ({"first_record": {"DOMAIN": "AP"}}, False), + ({"first_record": {"DOMAIN": "APF"}}, False), + ({"first_record": None}, False), + ({"first_record": {}}, False), + ({}, False), +] + + +@pytest.mark.parametrize("mock_dataset, expected", is_ap_tests) +def test_is_ap_dataset(mock_dataset, expected): + result = SDTMDatasetMetadata(**mock_dataset).is_ap + assert ( + result == expected + ), f"Expected {expected} but got {result} for dataset {mock_dataset}" + + +ap_suffix_tests = [ + ({"first_record": {"DOMAIN": "APFA"}}, "FA"), + ({"first_record": {"DOMAIN": "APXX"}}, "XX"), + ({"first_record": {"DOMAIN": "APQS"}}, "QS"), + ({"first_record": {"DOMAIN": "APLB"}}, "LB"), + ({"first_record": {"DOMAIN": "APFAMH"}}, "FA"), + ({"first_record": {"DOMAIN": "AE"}}, ""), + ({"first_record": {"DOMAIN": "LB"}}, ""), + ({"first_record": {"DOMAIN": "AP"}}, ""), + ({"first_record": {"DOMAIN": "APF"}}, ""), + ({"first_record": None}, ""), + ({"first_record": {}}, ""), + ({}, ""), +] + + +@pytest.mark.parametrize("mock_dataset, expected", ap_suffix_tests) +def test_ap_suffix_property(mock_dataset, expected): + result = SDTMDatasetMetadata(**mock_dataset).ap_suffix + assert ( + result == expected + ), f"Expected {expected} but got {result} for dataset {mock_dataset}" + + datasets = [ SDTMDatasetMetadata(**dataset) for dataset in [ From a0b81f099f522ebb8f557172094bf85001b30f99 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 24 Nov 2025 20:18:01 -0500 Subject: [PATCH 4/8] Added default value handling for missing is_ap and ap_suffix columns in DatasetMetadataDefineDatasetBuilder._get_dataset_dataframe() --- .../dataset_metadata_define_dataset_builder.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index 30bb75382..c81175594 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -123,5 +123,9 @@ def _get_dataset_dataframe(self): dataset_df = datasets.rename(columns=data_col_mapping) if "dataset_size" not in dataset_df.columns: dataset_df["dataset_size"] = None + if "is_ap" not in dataset_df.columns: + dataset_df["is_ap"] = False + if "ap_suffix" not in dataset_df.columns: + dataset_df["ap_suffix"] = "" dataset_df = self.dataset_implementation(dataset_df[dataset_col_order]) return dataset_df From b882157919919dfb28a39bedc3518dc0b33e4fc5 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 26 Nov 2025 08:00:14 -0500 Subject: [PATCH 5/8] Refactor is_ap to check APID in first_record, fix AP supp detection, and remove redundant extract_metadata code --- .../models/sdtm_dataset_metadata.py | 30 ++++++++++------- .../operations/extract_metadata.py | 12 +++---- .../data_services/base_data_service.py | 23 +++++++------ .../utilities/rule_processor.py | 8 +---- .../test_operations/test_extract_metadata.py | 23 ++++++++----- .../test_data_service/test_data_service.py | 5 ++- tests/unit/test_utilities/test_utils.py | 33 ++++++++++++++----- 7 files changed, 78 insertions(+), 56 deletions(-) diff --git a/cdisc_rules_engine/models/sdtm_dataset_metadata.py b/cdisc_rules_engine/models/sdtm_dataset_metadata.py index 5df57d732..704a72cc9 100644 --- a/cdisc_rules_engine/models/sdtm_dataset_metadata.py +++ b/cdisc_rules_engine/models/sdtm_dataset_metadata.py @@ -23,9 +23,9 @@ class SDTMDatasetMetadata(DatasetMetadata): | APQS | APQS | False | APQS | None | True | QS | | APQSX | APQS | False | APQS | None | True | QS | | APQSXX | APQS | False | APQS | None | True | QS | - | SQAPQS | SQAPQS | True | None | APQS | False | | - | SQAPQSX | SQAPQS | True | None | APQS | False | | - | SQAPQSXX | SQAPQS | True | None | APQS | False | | + | SQAPQS | SQAPQS | True | None | APQS | True | | + | SQAPQSX | SQAPQS | True | None | APQS | True | | + | SQAPQSXX | SQAPQS | True | None | APQS | True | | | RELREC | RELREC | False | None | None | False | | """ @@ -61,20 +61,28 @@ def is_split(self) -> bool: @property def is_ap(self) -> bool: """ - Returns true if domain starts with AP and is at least 4 characters long + Returns true if APID variable exists in first_record for non-supp datasets, + or if rdomain is exactly 4 characters and starts with AP for supp datasets. """ - return ( - isinstance(self.domain, str) - and len(self.domain) >= 4 - and self.domain.startswith("AP") - ) + if self.is_supp: + return ( + isinstance(self.rdomain, str) + and len(self.rdomain) == 4 + and self.rdomain.startswith("AP") + ) + first_record = self.first_record or {} + return "APID" in first_record @property def ap_suffix(self) -> str: """ Returns the 2-character suffix (characters 3-4) from AP domains. - Returns empty string if not an AP domain. + Returns empty string if not an AP domain or for supp datasets. """ - if self.is_ap: + if not self.is_ap: + return "" + if self.is_supp: + return "" + if isinstance(self.domain, str) and len(self.domain) >= 4: return self.domain[2:4] return "" diff --git a/cdisc_rules_engine/operations/extract_metadata.py b/cdisc_rules_engine/operations/extract_metadata.py index 1108cbc4a..413d23556 100644 --- a/cdisc_rules_engine/operations/extract_metadata.py +++ b/cdisc_rules_engine/operations/extract_metadata.py @@ -4,17 +4,13 @@ class ExtractMetadata(BaseOperation): def _execute_operation(self): - # get metadata metadata: pd.DataFrame = self.data_service.get_dataset_metadata( dataset_name=self.params.dataset_path, datasets=self.params.datasets ) if self.params.target in ("ap_suffix", "ap_domain_suffix", "domain_suffix"): - raw_metadata = self.data_service.get_raw_dataset_metadata( - dataset_name=self.params.dataset_path, datasets=self.params.datasets - ) - return raw_metadata.ap_suffix + series = metadata.get("ap_suffix", pd.Series()) + return series[0] if len(series) > 0 else "" - # extract target value. Metadata df always has one row - target_value = metadata.get(self.params.target, pd.Series())[0] - return target_value + target_value = metadata.get(self.params.target, pd.Series()) + return target_value[0] if len(target_value) > 0 else None diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index ef2ceeee3..53667f715 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -246,38 +246,37 @@ def _handle_special_cases( if self._contains_topic_variable(dataset, dataset_metadata.domain, "OBJ"): return FINDINGS_ABOUT return FINDINGS - if self._is_associated_persons(dataset): + if dataset_metadata.is_ap and not dataset_metadata.is_supp: return self._get_associated_persons_inherit_class( - file_path, datasets, dataset_metadata.domain + file_path, datasets, dataset_metadata ) return None - def _is_associated_persons(self, dataset) -> bool: - """ - Check if AP-- domain. - """ - return "APID" in dataset - def _get_associated_persons_inherit_class( - self, file_path, datasets: Iterable[SDTMDatasetMetadata], domain: str + self, + file_path, + datasets: Iterable[SDTMDatasetMetadata], + dataset_metadata: SDTMDatasetMetadata, ): """ Check with inherit class AP-- belongs to. """ - ap_suffix = domain[2:] + ap_suffix = dataset_metadata.ap_suffix + if not ap_suffix: + return None directory_path = get_directory_path(file_path) if len(datasets) > 1: domain_details: SDTMDatasetMetadata = search_in_list_of_dicts( datasets, lambda item: item.domain == ap_suffix ) if domain_details: + if domain_details.is_ap: + raise ValueError("Nested Associated Persons domain reference") file_name = domain_details.filename new_file_path = os.path.join(directory_path, file_name) new_domain_dataset = self.get_dataset(dataset_name=new_file_path) else: raise ValueError("Filename for domain doesn't exist") - if self._is_associated_persons(new_domain_dataset): - raise ValueError("Nested Associated Persons domain reference") return self.get_dataset_class( new_domain_dataset, new_file_path, diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index c78236484..221b04c7f 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -35,7 +35,6 @@ from cdisc_rules_engine.utilities.utils import ( get_directory_path, get_operations_cache_key, - is_ap_domain, search_in_list_of_dicts, get_dataset_name_from_details, ) @@ -182,12 +181,7 @@ def _domain_matched_ap_or_supp( supp_ap_domains.update({f"{AP_DOMAIN}--", f"{APFA_DOMAIN}--"}) return any(set(domains_to_check).intersection(supp_ap_domains)) and ( - dataset_metadata.is_supp - or is_ap_domain( - dataset_metadata.domain - or dataset_metadata.rdomain - or dataset_metadata.name - ) + dataset_metadata.is_supp or dataset_metadata.is_ap ) def rule_applies_to_data_structure( diff --git a/tests/unit/test_operations/test_extract_metadata.py b/tests/unit/test_operations/test_extract_metadata.py index 6dcc3c93d..9703e0c5f 100644 --- a/tests/unit/test_operations/test_extract_metadata.py +++ b/tests/unit/test_operations/test_extract_metadata.py @@ -46,13 +46,18 @@ def test_extract_metadata_get_dataset_name( def _create_mock_service(dataset_name, first_record=None): mock_service = Mock(LocalDataService) - mock_service.get_dataset_metadata.return_value = pd.DataFrame.from_dict( - {"dataset_name": [dataset_name]} - ) raw_metadata = SDTMDatasetMetadata( name=dataset_name, first_record=first_record, ) + mock_service.get_dataset_metadata.return_value = pd.DataFrame.from_dict( + { + "dataset_name": [dataset_name], + "ap_suffix": [raw_metadata.ap_suffix], + "is_ap": [raw_metadata.is_ap], + "domain": [raw_metadata.domain], + } + ) mock_service.get_raw_dataset_metadata = MagicMock(return_value=raw_metadata) return mock_service @@ -63,10 +68,10 @@ def _create_mock_service(dataset_name, first_record=None): ("APFA", None, ""), ("APXX", None, ""), ("APLB", None, ""), - ("", {"DOMAIN": "APFA"}, "FA"), - ("AE", {"DOMAIN": "APFA"}, "FA"), - ("AP", {"DOMAIN": "APFA"}, "FA"), - ("APF", {"DOMAIN": "APFA"}, "FA"), + ("", {"DOMAIN": "APFA", "APID": "AP001"}, "FA"), + ("AE", {"DOMAIN": "APFA", "APID": "AP001"}, "FA"), + ("AP", {"DOMAIN": "APFA", "APID": "AP001"}, "FA"), + ("APF", {"DOMAIN": "APFA", "APID": "AP001"}, "FA"), ], ) @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) @@ -133,7 +138,9 @@ def test_extract_metadata_domain_suffix_returns_empty_for_invalid( def test_extract_metadata_domain_suffix_uses_domain( operation_params: OperationParams, dataset_type ): - mock_data_service = _create_mock_service("APFA", {"DOMAIN": "APXX"}) + mock_data_service = _create_mock_service( + "APFA", {"DOMAIN": "APXX", "APID": "AP001"} + ) operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APXX"]} ) diff --git a/tests/unit/test_services/test_data_service/test_data_service.py b/tests/unit/test_services/test_data_service/test_data_service.py index 00b4cb208..4d7ded960 100644 --- a/tests/unit/test_services/test_data_service/test_data_service.py +++ b/tests/unit/test_services/test_data_service/test_data_service.py @@ -245,7 +245,10 @@ def test_get_dataset_class_associated_domains(): datasets = [ SDTMDatasetMetadata(**dataset) for dataset in [ - {"first_record": {"DOMAIN": "APDM"}, "filename": "apdm.xpt"}, + { + "first_record": {"DOMAIN": "APDM", "APID": "AP001"}, + "filename": "apdm.xpt", + }, {"first_record": {"DOMAIN": "DM"}, "filename": "dm.xpt"}, ] ] diff --git a/tests/unit/test_utilities/test_utils.py b/tests/unit/test_utilities/test_utils.py index 50b3a8572..6917022c0 100644 --- a/tests/unit/test_utilities/test_utils.py +++ b/tests/unit/test_utilities/test_utils.py @@ -46,10 +46,10 @@ def test_is_supp_dataset(mock_dataset, expected): is_ap_tests = [ - ({"first_record": {"DOMAIN": "APFA"}}, True), - ({"first_record": {"DOMAIN": "APXX"}}, True), - ({"first_record": {"DOMAIN": "APQS"}}, True), - ({"first_record": {"DOMAIN": "APFAMH"}}, True), + ({"first_record": {"DOMAIN": "APFA", "APID": "AP001"}}, True), + ({"first_record": {"DOMAIN": "APXX", "APID": "AP002"}}, True), + ({"first_record": {"DOMAIN": "APQS", "APID": "AP003"}}, True), + ({"first_record": {"DOMAIN": "APFAMH", "APID": "AP004"}}, True), ({"first_record": {"DOMAIN": "AE"}}, False), ({"first_record": {"DOMAIN": "LB"}}, False), ({"first_record": {"DOMAIN": "AP"}}, False), @@ -57,6 +57,15 @@ def test_is_supp_dataset(mock_dataset, expected): ({"first_record": None}, False), ({"first_record": {}}, False), ({}, False), + ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APQS"}}, True), + ({"name": "SQAPQSX", "first_record": {"RDOMAIN": "APQS"}}, True), + ({"name": "SQAPQSXX", "first_record": {"RDOMAIN": "APQS"}}, True), + ({"name": "SUPPQS", "first_record": {"RDOMAIN": "QS"}}, False), + ({"name": "SQAPQS", "first_record": {"RDOMAIN": "AP"}}, False), + ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APF"}}, False), + ({"first_record": {"APID": "AP001"}}, True), + ({"first_record": {"DOMAIN": "AP", "APID": "AP001"}}, True), + ({"first_record": {"DOMAIN": "APF", "APID": "AP001"}}, True), ] @@ -69,11 +78,11 @@ def test_is_ap_dataset(mock_dataset, expected): ap_suffix_tests = [ - ({"first_record": {"DOMAIN": "APFA"}}, "FA"), - ({"first_record": {"DOMAIN": "APXX"}}, "XX"), - ({"first_record": {"DOMAIN": "APQS"}}, "QS"), - ({"first_record": {"DOMAIN": "APLB"}}, "LB"), - ({"first_record": {"DOMAIN": "APFAMH"}}, "FA"), + ({"first_record": {"DOMAIN": "APFA", "APID": "AP001"}}, "FA"), + ({"first_record": {"DOMAIN": "APXX", "APID": "AP002"}}, "XX"), + ({"first_record": {"DOMAIN": "APQS", "APID": "AP003"}}, "QS"), + ({"first_record": {"DOMAIN": "APLB", "APID": "AP004"}}, "LB"), + ({"first_record": {"DOMAIN": "APFAMH", "APID": "AP005"}}, "FA"), ({"first_record": {"DOMAIN": "AE"}}, ""), ({"first_record": {"DOMAIN": "LB"}}, ""), ({"first_record": {"DOMAIN": "AP"}}, ""), @@ -81,6 +90,12 @@ def test_is_ap_dataset(mock_dataset, expected): ({"first_record": None}, ""), ({"first_record": {}}, ""), ({}, ""), + ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APQS"}}, ""), + ({"name": "SQAPQSX", "first_record": {"RDOMAIN": "APQS"}}, ""), + ({"name": "SQAPQSXX", "first_record": {"RDOMAIN": "APQS"}}, ""), + ({"first_record": {"APID": "AP001"}}, ""), + ({"first_record": {"DOMAIN": "AP", "APID": "AP001"}}, ""), + ({"first_record": {"DOMAIN": "APF", "APID": "AP001"}}, ""), ] From cda1b1dafdab2a8ef16c4c8e7e5ce4a4410d3bb1 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 26 Nov 2025 08:22:21 -0500 Subject: [PATCH 6/8] Fix test_rule_applies_to_domain by adding first_record with APID for AP domain test cases --- tests/unit/test_utilities/test_rule_processor.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py index 3b8cc3466..e4184557a 100644 --- a/tests/unit/test_utilities/test_rule_processor.py +++ b/tests/unit/test_utilities/test_rule_processor.py @@ -60,8 +60,13 @@ ) def test_rule_applies_to_domain(mock_data_service, name, rule_metadata, outcome): processor = RuleProcessor(mock_data_service, InMemoryCacheService()) + first_record = None + if name in ("APTE", "APFASU", "APRELSUB"): + first_record = {"DOMAIN": name, "APID": "AP001"} assert ( - processor.rule_applies_to_domain(SDTMDatasetMetadata(name=name), rule_metadata) + processor.rule_applies_to_domain( + SDTMDatasetMetadata(name=name, first_record=first_record), rule_metadata + ) == outcome ) From 8aa328d14b6a6c59f80ffcff8efcc82d4699780a Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 3 Dec 2025 12:28:32 -0500 Subject: [PATCH 7/8] Remove redundant is_supp check and combine extract_metadata tests --- .../operations/extract_metadata.py | 11 ++-- .../data_services/base_data_service.py | 2 +- .../test_operations/test_extract_metadata.py | 55 ++++++------------- 3 files changed, 22 insertions(+), 46 deletions(-) diff --git a/cdisc_rules_engine/operations/extract_metadata.py b/cdisc_rules_engine/operations/extract_metadata.py index 413d23556..5709f22d3 100644 --- a/cdisc_rules_engine/operations/extract_metadata.py +++ b/cdisc_rules_engine/operations/extract_metadata.py @@ -1,16 +1,15 @@ import pandas as pd + from cdisc_rules_engine.operations.base_operation import BaseOperation class ExtractMetadata(BaseOperation): def _execute_operation(self): + # get metadata metadata: pd.DataFrame = self.data_service.get_dataset_metadata( dataset_name=self.params.dataset_path, datasets=self.params.datasets ) - if self.params.target in ("ap_suffix", "ap_domain_suffix", "domain_suffix"): - series = metadata.get("ap_suffix", pd.Series()) - return series[0] if len(series) > 0 else "" - - target_value = metadata.get(self.params.target, pd.Series()) - return target_value[0] if len(target_value) > 0 else None + # extract target value. Metadata df always has one row + target_value = metadata.get(self.params.target, pd.Series())[0] + return target_value diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index 53667f715..6090e2e80 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -246,7 +246,7 @@ def _handle_special_cases( if self._contains_topic_variable(dataset, dataset_metadata.domain, "OBJ"): return FINDINGS_ABOUT return FINDINGS - if dataset_metadata.is_ap and not dataset_metadata.is_supp: + if dataset_metadata.is_ap: return self._get_associated_persons_inherit_class( file_path, datasets, dataset_metadata ) diff --git a/tests/unit/test_operations/test_extract_metadata.py b/tests/unit/test_operations/test_extract_metadata.py index 9703e0c5f..c8efd7d52 100644 --- a/tests/unit/test_operations/test_extract_metadata.py +++ b/tests/unit/test_operations/test_extract_metadata.py @@ -72,10 +72,19 @@ def _create_mock_service(dataset_name, first_record=None): ("AE", {"DOMAIN": "APFA", "APID": "AP001"}, "FA"), ("AP", {"DOMAIN": "APFA", "APID": "AP001"}, "FA"), ("APF", {"DOMAIN": "APFA", "APID": "AP001"}, "FA"), + ("AE", None, ""), + ("LB", None, ""), + ("AP", None, ""), + ("APF", None, ""), + ("AE", {"DOMAIN": "AE"}, ""), + ("AE", {"DOMAIN": "LB"}, ""), + ("AE", {"DOMAIN": "AP"}, ""), + ("AE", {"DOMAIN": ""}, ""), + ("AE", {"DOMAIN": None}, ""), ], ) @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) -def test_extract_metadata_domain_suffix_valid_cases( +def test_extract_metadata_domain_suffix( operation_params: OperationParams, dataset_type, dataset_name, @@ -83,8 +92,13 @@ def test_extract_metadata_domain_suffix_valid_cases( expected_suffix, ): mock_data_service = _create_mock_service(dataset_name, first_record) + domain_value = ( + first_record.get("DOMAIN") + if first_record and "DOMAIN" in first_record + else "AE" + ) operation_params.dataframe = dataset_type.from_dict( - {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APFA"]} + {"STUDYID": ["TEST_STUDY"], "DOMAIN": [domain_value]} ) operation_params.target = "ap_suffix" cache = InMemoryCacheService.get_instance() @@ -98,42 +112,6 @@ def test_extract_metadata_domain_suffix_valid_cases( ) -@pytest.mark.parametrize( - "dataset_name, first_record", - [ - ("AE", None), - ("LB", None), - ("AP", None), - ("APF", None), - ("AE", {"DOMAIN": "AE"}), - ("AE", {"DOMAIN": "LB"}), - ("AE", {"DOMAIN": "AP"}), - ("AE", {"DOMAIN": ""}), - ("AE", {"DOMAIN": None}), - ("AE", None), - ], -) -@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) -def test_extract_metadata_domain_suffix_returns_empty_for_invalid( - operation_params: OperationParams, - dataset_type, - dataset_name, - first_record, -): - mock_data_service = _create_mock_service(dataset_name, first_record) - operation_params.dataframe = dataset_type.from_dict( - {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["AE"]} - ) - operation_params.target = "ap_suffix" - cache = InMemoryCacheService.get_instance() - operation = ExtractMetadata( - operation_params, operation_params.dataframe, cache, mock_data_service - ) - result = operation.execute() - assert operation_params.operation_id in result - assert all(item == "" for item in result[operation_params.operation_id]) - - @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) def test_extract_metadata_domain_suffix_uses_domain( operation_params: OperationParams, dataset_type @@ -159,7 +137,6 @@ def test_extract_metadata_domain_suffix_empty_metadata( operation_params: OperationParams, dataset_type ): mock_data_service = _create_mock_service("APFA", None) - mock_data_service.get_dataset_metadata.return_value = pd.DataFrame() operation_params.dataframe = dataset_type.from_dict( {"STUDYID": ["TEST_STUDY"], "DOMAIN": ["APFA"]} ) From 820272492557d3174eddcc1abb2dbe49591ab22b Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 3 Dec 2025 12:37:15 -0500 Subject: [PATCH 8/8] fixed complexity issue --- ...dataset_metadata_define_dataset_builder.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index deef5f83a..65b5eefb4 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -87,6 +87,15 @@ def _get_define_xml_dataframe(self): return self.dataset_implementation(columns=define_col_order) return self.dataset_implementation.from_records(define_metadata) + def _ensure_required_columns(self, dataset_df, dataset_col_order): + if "dataset_size" not in dataset_df.columns: + dataset_df["dataset_size"] = None + if "is_ap" not in dataset_df.columns: + dataset_df["is_ap"] = False + if "ap_suffix" not in dataset_df.columns: + dataset_df["ap_suffix"] = "" + return self.dataset_implementation(dataset_df[dataset_col_order]) + def _get_dataset_dataframe(self): dataset_col_order = [ "dataset_size", @@ -130,11 +139,7 @@ def _get_dataset_dataframe(self): "domain": "dataset_name", } dataset_df = datasets.rename(columns=data_col_mapping) - if "dataset_size" not in dataset_df.columns: - dataset_df["dataset_size"] = None - if "is_ap" not in dataset_df.columns: - dataset_df["is_ap"] = False - if "ap_suffix" not in dataset_df.columns: - dataset_df["ap_suffix"] = "" - dataset_df = self.dataset_implementation(dataset_df[dataset_col_order]) + dataset_df = self._ensure_required_columns( + dataset_df, dataset_col_order + ) return dataset_df