diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 1e1fcd483..3b5b02524 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -121,6 +121,7 @@ def get_define_xml_item_group_metadata_for_dataset( "define_dataset_is_non_standard" "define_dataset_variables" "define_dataset_key_sequence" + "define_dataset_has_no_data" """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( @@ -142,6 +143,7 @@ def get_define_xml_item_group_metadata_for_domain(self, domain: str) -> List[dic "define_dataset_is_non_standard" "define_dataset_variables" "define_dataset_key_sequence" + "define_dataset_has_no_data" """ define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( diff --git a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py index 7bcdfd3eb..69fb46f8e 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py +++ b/cdisc_rules_engine/dataset_builders/dataset_builder_factory.py @@ -56,6 +56,9 @@ from cdisc_rules_engine.dataset_builders.variables_metadata_values_dataset_builder import ( ValueCheckVariableMetadataDatasetBuilder, ) +from cdisc_rules_engine.dataset_builders.domain_list_with_define_builder import ( + DomainListWithDefineDatasetBuilder, +) from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder from cdisc_rules_engine.enums.rule_types import RuleTypes @@ -67,6 +70,7 @@ class DatasetBuilderFactory(FactoryInterface): RuleTypes.DATASET_METADATA_CHECK_AGAINST_DEFINE.value: DatasetMetadataDefineDatasetBuilder, RuleTypes.VARIABLE_METADATA_CHECK.value: VariablesMetadataDatasetBuilder, RuleTypes.DOMAIN_PRESENCE_CHECK.value: DomainListDatasetBuilder, + RuleTypes.DOMAIN_PRESENCE_CHECK_AGAINST_DEFINE.value: DomainListWithDefineDatasetBuilder, 
RuleTypes.DEFINE_ITEM_METADATA_CHECK.value: DefineVariablesDatasetBuilder, RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE.value: VariablesMetadataWithDefineDatasetBuilder, RuleTypes.DATASET_CONTENTS_CHECK_AGAINST_DEFINE_AND_LIBRARY.value: ContentsDatasetBuilder, diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index 65b5eefb4..78a25f61e 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -1,7 +1,5 @@ -from cdisc_rules_engine.models.dataset import DatasetInterface from cdisc_rules_engine.services import logger from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder -import os import numpy as np @@ -17,6 +15,7 @@ def build(self): dataset_name - Name of the dataset dataset_label - Label for the dataset dataset_domain - Domain of the dataset + dataset_columns - List of columns in the dataset is_ap - Whether the domain is an AP domain ap_suffix - The 2-character suffix from AP domains @@ -30,12 +29,11 @@ def build(self): define_dataset_is_non_standard - whether a dataset is a standard define_dataset_variables - dataset variables define_dataset_key_sequence - dataset key sequence - - ..., + define_dataset_has_no_data """ # 1. Build define xml dataframe define_df = self._get_define_xml_dataframe() - # ) + # 2. Build dataset dataframe dataset_df = self._get_dataset_dataframe() if define_df.empty or dataset_df.empty: @@ -49,26 +47,11 @@ def build(self): right_on=["define_dataset_name", "define_dataset_location"], how="outer", ) + # 4. Remove NaN merged._data = merged._data.astype(object).replace({np.nan: None}) - # 5. 
remove unused rows, replace rows with target row - merged_cleaned = merged.dropna(subset=["dataset_name"]) - dataset_filename = ( - os.path.basename(self.dataset_metadata.full_path).lower() - if self.dataset_metadata.full_path - else None - ) - matching_row: DatasetInterface = merged_cleaned[ - merged_cleaned["dataset_location"].str.lower() == dataset_filename - ] - if matching_row.empty: - # when using DASK dataset_filename refers to temp parquet filename - matching_row: DatasetInterface = merged_cleaned[ - merged_cleaned["dataset_location"].str.lower() - == self.dataset_metadata.original_path.lower() - ] - for column in merged.columns: - merged[column] = matching_row[column].iloc[0] + + # 5. Return all rows (one per dataset) return merged def _get_define_xml_dataframe(self): @@ -80,6 +63,7 @@ def _get_define_xml_dataframe(self): "define_dataset_class", "define_dataset_structure", "define_dataset_is_non_standard", + "define_dataset_has_no_data", ] define_metadata = self.get_define_metadata() if not define_metadata: @@ -92,6 +76,8 @@ def _ensure_required_columns(self, dataset_df, dataset_col_order): dataset_df["dataset_size"] = None if "is_ap" not in dataset_df.columns: dataset_df["is_ap"] = False + if "dataset_columns" not in dataset_df.columns: + dataset_df["dataset_columns"] = None if "ap_suffix" not in dataset_df.columns: dataset_df["ap_suffix"] = "" return self.dataset_implementation(dataset_df[dataset_col_order]) @@ -103,6 +89,7 @@ def _get_dataset_dataframe(self): "dataset_name", "dataset_label", "dataset_domain", + "dataset_columns", "is_ap", "ap_suffix", ] @@ -121,6 +108,12 @@ def _get_dataset_dataframe(self): ds_metadata.data["dataset_domain"] = getattr( dataset, "domain", None ) + if dataset.first_record: + ds_metadata.data["dataset_columns"] = [ + list(dataset.first_record.keys()) + ] + else: + ds_metadata.data["dataset_columns"] = [[]] except Exception as e: logger.trace(e) logger.error(f"Error: {e}. 
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder


class DomainListWithDefineDatasetBuilder(BaseDatasetBuilder):
    """Dataset builder that lists every Define-XML dataset with its file, if any."""

    def build(self):
        """
        Returns a dataframe with one row per dataset in Define-XML.

        Each row is the Define-XML metadata dict for one dataset, extended
        with two extra columns. When no Define-XML metadata is available
        the dataframe is built from an empty record list.

        Columns:
            - domain: Value of define_dataset_name ("" when the key is absent)
            - filename: The file name if the dataset exists, None otherwise
            - define_dataset_name
            - define_dataset_label
            - define_dataset_location
            - define_dataset_domain
            - define_dataset_class
            - define_dataset_structure
            - define_dataset_is_non_standard
            - define_dataset_has_no_data
            - define_dataset_key_sequence
            - define_dataset_variables

        Dataset example:
               domain  filename  define_dataset_name  define_dataset_has_no_data
            0  AE      ae.xpt    AE                   False
            1  EC      ec.xpt    EC                   False
            2  SE      None      SE                   True
        """
        # Lookup of submitted datasets: unsplit dataset name -> file name.
        domain_files = {ds.unsplit_name: ds.filename for ds in self.datasets}

        # Other builders treat the define metadata as possibly falsy
        # (`if not define_metadata`), so normalise it before iterating.
        all_define_metadata = self.get_define_metadata() or []

        records = []
        for define_item in all_define_metadata:
            domain_name = define_item.get("define_dataset_name", "")
            records.append(
                {
                    "domain": domain_name,
                    # None marks a dataset declared in Define-XML but not found
                    # among the submitted datasets.
                    "filename": domain_files.get(domain_name),
                    **define_item,
                }
            )

        return self.dataset_implementation.from_records(records)
diff --git a/cdisc_rules_engine/models/actions.py b/cdisc_rules_engine/models/actions.py index 8eccf107f..331a62ed7 100644 --- a/cdisc_rules_engine/models/actions.py +++ b/cdisc_rules_engine/models/actions.py @@ -10,6 +10,7 @@ SOURCE_ROW_NUMBER, ) from cdisc_rules_engine.enums.sensitivity import Sensitivity +from cdisc_rules_engine.enums.domain_presence_values import DomainPresenceValues from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.dataset_variable import DatasetVariable from cdisc_rules_engine.models.validation_error_container import ( @@ -62,6 +63,11 @@ def generate_dataset_error_objects(self, message: str, results: pd.Series): error_object = self.generate_targeted_error_object( target_names, rows_with_error, message ) + if "domain presence" in self.rule.get("rule_type", "").lower(): + error_object.dataset = DomainPresenceValues.DATASET.value + for error in error_object.errors: + error.dataset = DomainPresenceValues.DATASET.value + error.row = DomainPresenceValues.RECORD.value self.output_container.append(error_object.to_representation()) @rule_action(params={"message": FIELD_TEXT}) diff --git a/resources/schema/MetaVariables.md b/resources/schema/MetaVariables.md index b75c9c98b..91945918d 100644 --- a/resources/schema/MetaVariables.md +++ b/resources/schema/MetaVariables.md @@ -36,8 +36,16 @@ ItemGroupDef.leaf.href ## define_dataset_name +ItemGroupDef.Name + +## define_dataset_domain + ItemGroupDef.Domain +## define_dataset_has_no_data + +ItemGroupDef.HasNoData + ## define_dataset_structure ItemGroupDef.Structure diff --git a/resources/schema/Rule_Type.json b/resources/schema/Rule_Type.json index 332345be2..722efd25b 100644 --- a/resources/schema/Rule_Type.json +++ b/resources/schema/Rule_Type.json @@ -34,6 +34,10 @@ "const": "Domain Presence Check", "title": "Content domain presence at study level" }, + { + "const": "Domain Presence Check against Define XML", + "title": "Content domain presence 
at study level with define xml metadata at dataset level" + }, { "const": "JSON Schema Check", "title": "Apply JSON schema validation to a JSON file" diff --git a/resources/schema/Rule_Type.md b/resources/schema/Rule_Type.md index a3ec83245..9556730f6 100644 --- a/resources/schema/Rule_Type.md +++ b/resources/schema/Rule_Type.md @@ -38,6 +38,7 @@ Columns are the columns within the original dataset along with the following col - `dataset_domain` - `define_dataset_class` - `define_dataset_domain` +- `define_dataset_has_no_data` - `define_dataset_is_non_standard` - `define_dataset_key_sequence` - `define_dataset_label` @@ -48,6 +49,8 @@ Columns are the columns within the original dataset along with the following col ## Dataset Metadata Check against Define XML +Returns a dataset with one row per study dataset. The corresponding Define-XML dataset metadata is attached to each row. + #### Columns - `dataset_size` @@ -55,15 +58,17 @@ Columns are the columns within the original dataset along with the following col - `dataset_name` - `dataset_label` - `dataset_domain` -- `define_dataset_name` +- `dataset_columns` +- `define_dataset_class` +- `define_dataset_domain` +- `define_dataset_has_no_data` +- `define_dataset_is_non_standard` +- `define_dataset_key_sequence` - `define_dataset_label` - `define_dataset_location` -- `define_dataset_domain` -- `define_dataset_class` +- `define_dataset_name` - `define_dataset_structure` -- `define_dataset_is_non_standard` - `define_dataset_variables` -- `define_dataset_key_sequence` #### Rule Macro @@ -92,14 +97,16 @@ any: #### Columns -- `define_dataset_name` +- `define_dataset_class` +- `define_dataset_domain` +- `define_dataset_has_no_data` +- `define_dataset_is_non_standard` +- `define_dataset_key_sequence` - `define_dataset_label` - `define_dataset_location` -- `define_dataset_class` +- `define_dataset_name` - `define_dataset_structure` -- `define_dataset_is_non_standard` - `define_dataset_variables` -- 
`define_dataset_key_sequence` ## Define Item Metadata Check @@ -179,6 +186,41 @@ all: operator: not_exists ``` +## Domain Presence Check against Define XML + +#### Columns + +One row per dataset defined in Define-XML: + +- `domain` +- `filename` - The file name if dataset exists, null otherwise +- `define_dataset_name` +- `define_dataset_label` +- `define_dataset_location` +- `define_dataset_domain` +- `define_dataset_class` +- `define_dataset_structure` +- `define_dataset_is_non_standard` +- `define_dataset_has_no_data` +- `define_dataset_key_sequence` +- `define_dataset_variables` + +#### Example + +Check if SE domain is defined in Define-XML without HasNoData="Yes" but the dataset file doesn't exist: + +```yaml +all: + - name: define_dataset_name + operator: equal_to + value: "SE" + - name: define_dataset_has_no_data + operator: equal_to + value: False + - name: filename + operator: not_exists +``` + ## JSONata Apply a JSONata query to a JSON file. [JSONata documentation](https://docs.jsonata.org) diff --git a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py index da2a67867..8ea9110d0 100644 --- a/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_dataset_metadata_define_dataset_builder.py @@ -44,60 +44,64 @@ "dataset_location": ["ts.xpt", "dm.xpt", "ae.xpt"], "dataset_name": ["TS", "DM", "AE"], "dataset_label": ["Trial Summary", "Demographics", "Adverse Events"], + "dataset_domain": ["TS", "DM", "AE"], + "dataset_columns": [ + ["STUDYID", "DOMAIN"], + ["STUDYID", "USUBJID"], + ["STUDYID", "AETERM"], + ], + "is_ap": [False, False, False], + "ap_suffix": ["", "", ""], } expected_results = { "ts.xpt": { - "dataset_size": [10, 10, 10], - "dataset_location": ["ts.xpt", "ts.xpt", "ts.xpt"], - "dataset_name": ["TS", "TS", "TS"], - "dataset_label": ["Trial Summary", "Trial Summary", 
"Trial Summary"], - "define_dataset_name": ["TS", "TS", "TS"], - "define_dataset_label": ["Trial Summary", "Trial Summary", "Trial Summary"], - "define_dataset_location": ["ts.xpt", "ts.xpt", "ts.xpt"], - "define_dataset_class": ["TRIAL DESIGN", "TRIAL DESIGN", "TRIAL DESIGN"], - "define_dataset_structure": [ - "One record per trial summary parameter value", - "One record per trial summary parameter value", - "One record per trial summary parameter value", - ], - "define_dataset_is_non_standard": ["", "", ""], + "dataset_size": 10, + "dataset_location": "ts.xpt", + "dataset_name": "TS", + "dataset_label": "Trial Summary", + "dataset_domain": "TS", + "dataset_columns": ["STUDYID", "DOMAIN"], + "is_ap": False, + "ap_suffix": "", + "define_dataset_name": "TS", + "define_dataset_label": "Trial Summary", + "define_dataset_location": "ts.xpt", + "define_dataset_class": "TRIAL DESIGN", + "define_dataset_structure": "One record per trial summary parameter value", + "define_dataset_is_non_standard": "", }, "dm.xpt": { - "dataset_size": [0, 0, 0], - "dataset_location": ["dm.xpt", "dm.xpt", "dm.xpt"], - "dataset_name": ["DM", "DM", "DM"], - "dataset_label": ["Demographics", "Demographics", "Demographics"], - "define_dataset_name": ["DM", "DM", "DM"], - "define_dataset_label": ["Demographics", "Demographics", "Demographics"], - "define_dataset_location": ["dm.xpt", "dm.xpt", "dm.xpt"], - "define_dataset_class": [ - "SPECIAL PURPOSE", - "SPECIAL PURPOSE", - "SPECIAL PURPOSE", - ], - "define_dataset_structure": [ - "One record per subject", - "One record per subject", - "One record per subject", - ], - "define_dataset_is_non_standard": ["", "", ""], + "dataset_size": 0, + "dataset_location": "dm.xpt", + "dataset_name": "DM", + "dataset_label": "Demographics", + "dataset_domain": "DM", + "dataset_columns": ["STUDYID", "USUBJID"], + "is_ap": False, + "ap_suffix": "", + "define_dataset_name": "DM", + "define_dataset_label": "Demographics", + "define_dataset_location": "dm.xpt", + 
"define_dataset_class": "SPECIAL PURPOSE", + "define_dataset_structure": "One record per subject", + "define_dataset_is_non_standard": "", }, "ae.xpt": { - "dataset_size": [1000, 1000, 1000], - "dataset_location": ["ae.xpt", "ae.xpt", "ae.xpt"], - "dataset_name": ["AE", "AE", "AE"], - "dataset_label": ["Adverse Events", "Adverse Events", "Adverse Events"], - "define_dataset_name": ["AE", "AE", "AE"], - "define_dataset_label": ["Adverse Events", "Adverse Events", "Adverse Events"], - "define_dataset_location": ["ae.xpt", "ae.xpt", "ae.xpt"], - "define_dataset_class": ["EVENTS", "EVENTS", "EVENTS"], - "define_dataset_structure": [ - "One record per adverse event", - "One record per adverse event", - "One record per adverse event", - ], - "define_dataset_is_non_standard": ["", "", ""], + "dataset_size": 1000, + "dataset_location": "ae.xpt", + "dataset_name": "AE", + "dataset_label": "Adverse Events", + "dataset_domain": "AE", + "dataset_columns": ["STUDYID", "AETERM"], + "is_ap": False, + "ap_suffix": "", + "define_dataset_name": "AE", + "define_dataset_label": "Adverse Events", + "define_dataset_location": "ae.xpt", + "define_dataset_class": "EVENTS", + "define_dataset_structure": "One record per adverse event", + "define_dataset_is_non_standard": "", }, } @@ -141,9 +145,15 @@ def test_dataset_metadata_define_dataset_builder(dataset_path): result = builder.build() - # Ensure columns are in the expected order - expected_df = pd.DataFrame(expected_results[dataset_path]) - result_df = result.data[expected_df.columns] + expected_df = pd.DataFrame( + [ + expected_results["ts.xpt"], + expected_results["dm.xpt"], + expected_results["ae.xpt"], + ] + ).astype(object) + + result_df = result.data[expected_df.columns].reset_index(drop=True) # Check that columns are the same assert list(result_df.columns) == list(expected_df.columns), "Columns do not match" diff --git a/tests/unit/test_dataset_builders/test_domain_presence_define_builder.py 
import pytest
import pandas as pd
from unittest.mock import MagicMock, patch
from cdisc_rules_engine.dataset_builders.domain_list_with_define_builder import (
    DomainListWithDefineDatasetBuilder,
)
from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata
from cdisc_rules_engine.services.data_services import DummyDataService
from cdisc_rules_engine.models.library_metadata_container import (
    LibraryMetadataContainer,
)


# Define-XML item group metadata shared by every test case: four datasets,
# of which only SE is flagged as having no data.
define_metadata = [
    {
        "define_dataset_name": "AE",
        "define_dataset_label": "Adverse Events",
        "define_dataset_location": "ae.xpt",
        "define_dataset_domain": "AE",
        "define_dataset_class": "EVENTS",
        "define_dataset_structure": "One record per adverse event",
        "define_dataset_is_non_standard": "",
        "define_dataset_has_no_data": False,
        "define_dataset_key_sequence": ["STUDYID", "USUBJID", "AESEQ"],
        "define_dataset_variables": ["STUDYID", "USUBJID", "AETERM", "AESEQ"],
    },
    {
        "define_dataset_name": "DM",
        "define_dataset_label": "Demographics",
        "define_dataset_location": "dm.xpt",
        "define_dataset_domain": "DM",
        "define_dataset_class": "SPECIAL PURPOSE",
        "define_dataset_structure": "One record per subject",
        "define_dataset_is_non_standard": "",
        "define_dataset_has_no_data": False,
        "define_dataset_key_sequence": ["STUDYID", "USUBJID"],
        "define_dataset_variables": ["STUDYID", "USUBJID", "AGE", "SEX"],
    },
    {
        "define_dataset_name": "SE",
        "define_dataset_label": "Subject Elements",
        "define_dataset_location": "se.xpt",
        "define_dataset_domain": "SE",
        "define_dataset_class": "EVENTS",
        "define_dataset_structure": "One record per subject element",
        "define_dataset_is_non_standard": "",
        "define_dataset_has_no_data": True,
        "define_dataset_key_sequence": ["STUDYID", "USUBJID", "SESEQ"],
        "define_dataset_variables": ["STUDYID", "USUBJID", "SESEQ"],
    },
    {
        "define_dataset_name": "EC",
        "define_dataset_label": "Exposure as Collected",
        "define_dataset_location": "ec.xpt",
        "define_dataset_domain": "EC",
        "define_dataset_class": "INTERVENTIONS",
        "define_dataset_structure": "One record per exposure",
        "define_dataset_is_non_standard": "",
        "define_dataset_has_no_data": False,
        "define_dataset_key_sequence": ["STUDYID", "USUBJID", "ECSEQ"],
        "define_dataset_variables": ["STUDYID", "USUBJID", "ECTRT", "ECSEQ"],
    },
]


def _mock_datasets(*domains):
    """Return one mock dataset per domain, named ``<domain>.xpt`` in lower case."""
    return [
        MagicMock(unsplit_name=domain, filename=f"{domain.lower()}.xpt")
        for domain in domains
    ]


def _expected_frame(*present_domains):
    """Return the expected builder output for the given submitted domains.

    Every entry of ``define_metadata`` yields one row. ``filename`` is
    ``<domain>.xpt`` for domains listed in ``present_domains`` and ``None``
    for the rest. The file name is spelled out here independently of the
    builder's own lookup, so the expectation does not mirror the code under
    test.
    """
    rows = []
    for item in define_metadata:
        name = item["define_dataset_name"]
        rows.append(
            {
                "domain": name,
                "filename": f"{name.lower()}.xpt" if name in present_domains else None,
                **item,
            }
        )
    return pd.DataFrame(rows).astype(object)


@pytest.mark.parametrize(
    "mock_datasets,define_meta,expected_results,test_description",
    [
        (
            _mock_datasets("AE", "DM", "SE", "EC"),
            define_metadata,
            _expected_frame("AE", "DM", "SE", "EC"),
            "all_datasets_exist",
        ),
        (
            _mock_datasets("AE", "DM", "EC"),
            define_metadata,
            _expected_frame("AE", "DM", "EC"),
            "some_datasets_missing",
        ),
        (
            [],
            define_metadata,
            _expected_frame(),
            "no_datasets_exist",
        ),
    ],
)
def test_domain_list_with_define_dataset_builder(
    mock_datasets, define_meta, expected_results, test_description
):
    """Builder returns one row per Define-XML dataset with the right filename."""
    builder = DomainListWithDefineDatasetBuilder(
        rule=None,
        data_service=DummyDataService(MagicMock(), MagicMock(), MagicMock(), data=[]),
        cache_service=None,
        rule_processor=None,
        data_processor=None,
        dataset_path="ae.xpt",
        datasets=mock_datasets,
        dataset_metadata=SDTMDatasetMetadata(full_path="ae.xpt"),
        define_xml_path=None,
        standard="sdtmig",
        standard_version="3-4",
        standard_substandard=None,
        library_metadata=LibraryMetadataContainer(),
    )

    # The Define-XML reader is bypassed: the builder is fed the fixture directly.
    with patch.object(builder, "get_define_metadata", return_value=define_meta):
        result = builder.build()

    result_df = result.data.reset_index(drop=True)

    # None of the current cases expects an empty frame; the branch is kept so
    # an empty-expectation case can be added without touching the assertions.
    if expected_results.empty:
        assert result_df.empty, f"Expected empty DataFrame for {test_description}"
    else:
        assert list(result_df.columns) == list(
            expected_results.columns
        ), f"Columns do not match for {test_description}"
        pd.testing.assert_frame_equal(result_df, expected_results, check_dtype=False)