From 5fcd3de28e7f318fffb31b9e8eabe91dde45a1ef Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Mon, 16 Mar 2026 19:31:30 -0400 Subject: [PATCH 1/2] error handling and removed if conditional --- .../utilities/data_processor.py | 23 +++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/cdisc_rules_engine/utilities/data_processor.py b/cdisc_rules_engine/utilities/data_processor.py index 04d6dbe46..9e4f7c3ee 100644 --- a/cdisc_rules_engine/utilities/data_processor.py +++ b/cdisc_rules_engine/utilities/data_processor.py @@ -17,6 +17,7 @@ DataServiceFactory, DummyDataService, ) +from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError from cdisc_rules_engine.utilities.utils import ( search_in_list_of_dicts, ) @@ -211,6 +212,9 @@ def merge_pivot_supp_dataset( for key in static_keys if key in left_dataset.columns and key in right_dataset.columns ] + DataProcessor._validate_merge_key_overlap( + left_dataset, right_dataset, common_keys + ) if not is_blank: common_keys.append(dynamic_key) current_supp = right_dataset.rename(columns={"IDVARVAL": dynamic_key}) @@ -287,6 +291,9 @@ def _merge_supp_with_multiple_idvars( for key in static_keys if key in result_dataset.columns and key in group_data.columns ] + DataProcessor._validate_merge_key_overlap( + result_dataset, group_data, common_keys + ) common_keys.append(idvar_value) agg_dict = { @@ -480,3 +487,19 @@ def column_metadata_equal_to_define_and_library( @staticmethod def is_dummy_data(data_service: DataServiceInterface) -> bool: return isinstance(data_service, DummyDataService) + + @staticmethod + def _validate_merge_key_overlap( + left_dataset: DatasetInterface, + right_dataset: DatasetInterface, + common_keys: List[str], + ): + for key in common_keys: + left_values = set(left_dataset[key].dropna().unique()) + right_values = set(right_dataset[key].dropna().unique()) + if left_values and right_values and left_values.isdisjoint(right_values): + raise PreprocessingError( + f"SUPP merge key '{key}' has no overlapping values between " + f"parent dataset and SUPP dataset. " + f"Parent values: {left_values}, SUPP values: {right_values}." + ) From 7772a526427f80a746cfa6a49512512ba7fa763c Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Mon, 16 Mar 2026 19:45:29 -0400 Subject: [PATCH 2/2] added test --- tests/unit/test_dataset_preprocessor.py | 65 ++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_dataset_preprocessor.py b/tests/unit/test_dataset_preprocessor.py index 2b238ce9d..65fe8b519 100644 --- a/tests/unit/test_dataset_preprocessor.py +++ b/tests/unit/test_dataset_preprocessor.py @@ -17,7 +17,7 @@ from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) - +from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError from cdisc_rules_engine.models.dataset import PandasDataset @@ -1291,6 +1291,69 @@ def test_dm_merged_with_suppdm_without_dupes( assert result.data.loc[0, ["RACE1", "RACE2", "RACE3"]].notna().all() +@patch("cdisc_rules_engine.services.data_services.LocalDataService.get_dataset") +def test_preprocess_suppae_mismatched_studyid_raises_key_error(mock_get_dataset): + ae_dataset = PandasDataset( + pd.DataFrame( + { + "STUDYID": ["CDISC-PILOT-01"], + "DOMAIN": ["AE"], + "USUBJID": ["S001"], + "AESEQ": [1], + "AETERM": ["Headache"], + } + ) + ) + suppae_dataset = PandasDataset( + pd.DataFrame( + { + "STUDYID": ["COMPLETELY-DIFFERENT-STUDY"], + "RDOMAIN": ["AE"], + "USUBJID": ["S001"], + "IDVAR": ["AESEQ"], + "IDVARVAL": ["1"], + "QNAM": ["TEST"], + "QLABEL": ["Test"], + "QVAL": ["A"], + } + ) + ) + + mock_get_dataset.return_value = suppae_dataset + rule = { + "core_id": "TestRule", + "datasets": [{"domain_name": "SUPPAE", "match_key": ["USUBJID"]}], + "conditions": ConditionCompositeFactory.get_condition_composite( + { + "all": [ + { + "name": "get_dataset", + "operator": "equal_to", + "value": {"target": "TEST", "comparator": "A"}, + } + ] + } + ), + } + datasets = [ + SDTMDatasetMetadata( + name="SUPPAE", first_record={"RDOMAIN": "AE"}, filename="suppae.xpt" + ) + ] + data_service = LocalDataService(MagicMock(), MagicMock(), MagicMock()) + preprocessor = DatasetPreprocessor( + ae_dataset, + SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}, full_path="path"), + data_service, + InMemoryCacheService(), + ) + + with pytest.raises( + PreprocessingError, match="SUPP merge key 'STUDYID' has no overlapping values" + ): + preprocessor.preprocess(rule, datasets) + + def test_relrec_processed_correctly_with_others(rule_with_specific_supp): ec_meta = SDTMDatasetMetadata( name="EC",