diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index ae67c499e..ad1bc81c4 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -109,10 +109,10 @@ def get_dataset_contents(self, **kwargs): def get_define_xml_item_group_metadata_for_dataset( self, dataset_metadata: SDTMDatasetMetadata - ) -> List[dict]: + ) -> dict: """ Gets Define XML item group metadata - returns a list of dictionaries containing the following keys: + returns a dictionary containing the following keys: "define_dataset_name" "define_dataset_label" "define_dataset_location" @@ -120,6 +120,7 @@ def get_define_xml_item_group_metadata_for_dataset( "define_dataset_structure" "define_dataset_is_non_standard" "define_dataset_variables" + "define_dataset_variable_order" "define_dataset_key_sequence" "define_dataset_has_no_data" """ @@ -142,6 +143,7 @@ def get_define_xml_item_group_metadata_for_domain(self, domain: str) -> List[dic "define_dataset_structure" "define_dataset_is_non_standard" "define_dataset_variables" + "define_dataset_variable_order" "define_dataset_key_sequence" "define_dataset_has_no_data" """ diff --git a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py index b048e3a0e..a64fa8d9a 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py @@ -26,6 +26,7 @@ def build(self): define_dataset_name - dataset name from define_xml define_dataset_structure - dataset structure define_dataset_variables - list of variables in the dataset + define_dataset_variable_order - ordered list of variables in the dataset ..., """ diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index 78a25f61e..fd42ed2d9 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -1,5 +1,8 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder +from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( + DefineXMLReaderFactory, +) import numpy as np @@ -27,9 +30,13 @@ def build(self): define_dataset_class - dataset class define_dataset_structure - dataset structure define_dataset_is_non_standard - whether a dataset is a standard - define_dataset_variables - dataset variables - define_dataset_key_sequence - dataset key sequence - define_dataset_has_no_data + define_dataset_variables - list of variables in the dataset + define_dataset_variable_order - ordered list of variables in the dataset + (ordered by OrderNumber if present, otherwise by XML document order) + define_dataset_key_sequence - ordered list of key sequence variables in the dataset + define_dataset_has_no_data - whether a dataset has no data + + ..., """ # 1. Build define xml dataframe define_df = self._get_define_xml_dataframe() @@ -63,13 +70,43 @@ def _get_define_xml_dataframe(self): "define_dataset_class", "define_dataset_structure", "define_dataset_is_non_standard", + "define_dataset_variables", + "define_dataset_variable_order", + "define_dataset_key_sequence", "define_dataset_has_no_data", ] define_metadata = self.get_define_metadata() if not define_metadata: logger.info(f"No define_metadata is provided for {__name__}.") return self.dataset_implementation(columns=define_col_order) - return self.dataset_implementation.from_records(define_metadata) + define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( + self.dataset_path, self.define_xml_path, self.data_service, self.cache + ) + enriched_metadata = [] + for basic_metadata in define_metadata: + dataset_name = basic_metadata.get("define_dataset_name") + if dataset_name: + try: + full_metadata = define_xml_reader.extract_dataset_metadata( + dataset_name + ) + enriched_metadata.append(full_metadata) + except Exception as e: + logger.trace(e) + logger.error( + f"Error extracting metadata for {dataset_name}: {str(e)}" + ) + self._ensure_define_metadata_keys(basic_metadata, define_col_order) + enriched_metadata.append(basic_metadata) + else: + self._ensure_define_metadata_keys(basic_metadata, define_col_order) + enriched_metadata.append(basic_metadata) + return self.dataset_implementation.from_records(enriched_metadata) + + def _ensure_define_metadata_keys(self, metadata: dict, col_order: list) -> None: + for key in col_order: + if key not in metadata: + metadata[key] = None def _ensure_required_columns(self, dataset_df, dataset_col_order): if "dataset_size" not in dataset_df.columns: diff --git a/cdisc_rules_engine/dataset_builders/define_item_group_dataset_builder.py b/cdisc_rules_engine/dataset_builders/define_item_group_dataset_builder.py index 8c926b3bb..26f93e884 100644 --- a/cdisc_rules_engine/dataset_builders/define_item_group_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/define_item_group_dataset_builder.py @@ -15,6 +15,7 @@ def build(self): "define_dataset_structure" "define_dataset_is_non_standard" "define_dataset_variables" + "define_dataset_variable_order" "define_dataset_key_sequence" "define_dataset_has_no_data" """ diff --git a/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py b/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py index fb57c1ea5..5d08f235c 100644 --- a/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py +++ b/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py @@ -120,6 +120,11 @@ def extract_dataset_metadata(self, dataset_name: Union[str, None] = None) -> dic for item in dataset_metadata.ItemRef if item.ItemOID in item_mapping ] + dataset_metadata_dict["define_dataset_variable_order"] = ( + self._get_ordered_dataset_variables( + dataset_metadata.ItemRef, item_mapping + ) + ) dataset_metadata_dict["define_dataset_key_sequence"] = ( self.get_dataset_key_sequence(dataset_name) ) @@ -148,6 +153,11 @@ def extract_domain_metadata(self, domain_name: str = None) -> dict: for item in domain_metadata.ItemRef if item.ItemOID in item_mapping ] + domain_metadata_dict["define_dataset_variable_order"] = ( + self._get_ordered_dataset_variables( + domain_metadata.ItemRef, item_mapping + ) + ) domain_metadata_dict["define_dataset_key_sequence"] = ( self.get_domain_key_sequence(domain_name) ) @@ -338,6 +348,17 @@ def _get_order_number(self, itemref, index): else: return index + 1 + def _get_ordered_dataset_variables(self, item_refs, item_mapping): + item_refs_with_order = [] + for index, item_ref in enumerate(item_refs): + if item_ref.ItemOID in item_mapping: + order_number = self._get_order_number(item_ref, index) + item_def = item_mapping.get(item_ref.ItemOID) + if item_def: + item_refs_with_order.append((order_number, item_def.Name)) + item_refs_with_order.sort(key=lambda x: x[0]) + return [var_name for _, var_name in item_refs_with_order] + def _get_item_def_representation(self, itemdef, itemref, codelists, index) -> dict: """ Returns item def as a dictionary diff --git a/resources/schema/rule/MetaVariables.json b/resources/schema/rule/MetaVariables.json index 3ac38aee2..3b3f8d753 100644 --- a/resources/schema/rule/MetaVariables.json +++ b/resources/schema/rule/MetaVariables.json @@ -29,6 +29,12 @@ { "const": "define_dataset_structure" }, + { + "const": "define_dataset_variables" + }, + { + "const": "define_dataset_variable_order" + }, { "const": "define_variable_allowed_terms" }, diff --git a/resources/schema/rule/Rule_Type.md b/resources/schema/rule/Rule_Type.md index 4dd4d5034..18cc012c0 100644 --- a/resources/schema/rule/Rule_Type.md +++ b/resources/schema/rule/Rule_Type.md @@ -46,6 +46,7 @@ Columns are the columns within the original dataset along with the following col - `define_dataset_name` - `define_dataset_structure` - `define_dataset_variables` +- `define_dataset_variable_order` ## Dataset Metadata Check against Define XML @@ -69,6 +70,7 @@ Returns a dataset where each dataset is a row in the new dataset. The define xml - `define_dataset_name` - `define_dataset_structure` - `define_dataset_variables` +- `define_dataset_variable_order` #### Rule Macro @@ -107,6 +109,7 @@ any: - `define_dataset_name` - `define_dataset_structure` - `define_dataset_variables` +- `define_dataset_variable_order` ## Define Item Metadata Check diff --git a/tests/unit/test_define_xml_reader.py b/tests/unit/test_define_xml_reader.py index 5d933fa1c..b31c52a87 100644 --- a/tests/unit/test_define_xml_reader.py +++ b/tests/unit/test_define_xml_reader.py @@ -111,6 +111,19 @@ def test_extract_domain_metadata(filename): "TSVCDREF", "TSVCDVER", ], + "define_dataset_variable_order": [ + "STUDYID", + "DOMAIN", + "TSSEQ", + "TSGRPID", + "TSPARMCD", + "TSPARM", + "TSVAL", + "TSVALNF", + "TSVALCD", + "TSVCDREF", + "TSVCDVER", + ], "define_dataset_key_sequence": ["STUDYID", "TSPARMCD", "TSVAL", "TSSEQ"], "define_dataset_has_no_data": False, } @@ -148,6 +161,19 @@ def test_extract_dataset_metadata(filename): "TSVCDREF", "TSVCDVER", ], + "define_dataset_variable_order": [ + "STUDYID", + "DOMAIN", + "TSSEQ", + "TSGRPID", + "TSPARMCD", + "TSPARM", + "TSVAL", + "TSVALNF", + "TSVALCD", + "TSVCDREF", + "TSVCDVER", + ], "define_dataset_key_sequence": ["STUDYID", "TSPARMCD", "TSVAL", "TSSEQ"], "define_dataset_has_no_data": False, } @@ -210,7 +236,7 @@ def test_extract_variable_metadata(filename): assert variable["define_variable_order_number"] == index + 1 -@pytest.mark.parametrize("filename", [(test_define_file_path)]) +@pytest.mark.parametrize("filename", [test_define_file_path]) def test_extract_variable_metadata_when_one_ordernumber_non_1_based(filename): """ Unit test for DefineXMLReader.extract_domain_metadata function. @@ -437,6 +463,52 @@ def test_extract_domain_metadata_nv_has_no_data(filename, has_no_data): assert v in domain_metadata["define_dataset_variables"] +@pytest.mark.parametrize( + "filename", [(test_define_file_path), (test_define_2_0_file_path)] +) +def test_extract_dataset_metadata_with_ordernumber(filename): + """ + Unit test for DefineXMLReader.extract_dataset_metadata function. + Tests that define_dataset_variable_order respects OrderNumber attributes. + """ + with open(filename, "rb") as file: + contents: bytes = file.read() + reader = DefineXMLReaderFactory.from_file_contents(contents) + dataset_metadata: dict = reader.extract_dataset_metadata(dataset_name="SE") + assert "define_dataset_variables" in dataset_metadata + assert "define_dataset_variable_order" in dataset_metadata + assert isinstance(dataset_metadata["define_dataset_variables"], list) + assert isinstance(dataset_metadata["define_dataset_variable_order"], list) + assert len(dataset_metadata["define_dataset_variables"]) == len( + dataset_metadata["define_dataset_variable_order"] + ) + assert dataset_metadata["define_dataset_variable_order"][0] == "STUDYID" + assert dataset_metadata["define_dataset_variable_order"][1] == "DOMAIN" + assert dataset_metadata["define_dataset_variable_order"][2] == "USUBJID" + assert dataset_metadata["define_dataset_variable_order"][3] == "SESEQ" + assert dataset_metadata["define_dataset_variable_order"][4] == "ETCD" + + +@pytest.mark.parametrize( + "filename", [(test_define_file_path), (test_define_2_0_file_path)] +) +def test_extract_dataset_metadata_without_ordernumber(filename): + """ + Unit test for DefineXMLReader.extract_dataset_metadata function. + Tests that define_dataset_variable_order uses XML document order when OrderNumber is not present. + """ + with open(filename, "rb") as file: + contents: bytes = file.read() + reader = DefineXMLReaderFactory.from_file_contents(contents) + dataset_metadata: dict = reader.extract_dataset_metadata(dataset_name="TS") + assert "define_dataset_variables" in dataset_metadata + assert "define_dataset_variable_order" in dataset_metadata + assert ( + dataset_metadata["define_dataset_variable_order"] + == dataset_metadata["define_dataset_variables"] + ) + + class TestGetExtensibleCodelistMappings: @staticmethod def _make_codelist(name, coded_values_extended, alias_list): diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index 62a4bd040..12f78466b 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -1335,6 +1335,9 @@ def test_validate_single_dataset_not_equal_to( ), ], ) +@patch( + "cdisc_rules_engine.services.define_xml.define_xml_reader_factory.DefineXMLReaderFactory.get_define_xml_reader" +) @patch( "cdisc_rules_engine.dataset_builders.base_dataset_builder." + "BaseDatasetBuilder.get_define_metadata" @@ -1345,6 +1348,7 @@ def test_validate_single_dataset_not_equal_to( def test_validate_dataset_metadata_against_define_xml( mock_get_dataset_metadata: MagicMock, mock_get_define_xml_metadata_for_domain: MagicMock, + mock_get_define_xml_reader: MagicMock, define_xml_validation_rule: dict, define_xml_metadata: dict, dataset_mock: PandasDataset, @@ -1354,6 +1358,9 @@ def test_validate_dataset_metadata_against_define_xml( Unit test for Define XML validation. Creates an invalid dataset and validates it against Define XML. """ + mock_reader = MagicMock() + mock_reader.extract_dataset_metadata.side_effect = Exception("Mock exception") + mock_get_define_xml_reader.return_value = mock_reader mock_get_define_xml_metadata_for_domain.return_value = define_xml_metadata mock_get_dataset_metadata.return_value = dataset_mock