From 8807f61746f988569556dc3a9f254182291fed02 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Fri, 2 Jan 2026 12:02:15 -0500 Subject: [PATCH 1/5] Add define_dataset_variable_order with OrderNumber sorting support (#791) --- .../dataset_builders/base_dataset_builder.py | 6 +- .../contents_define_dataset_builder.py | 1 + ...dataset_metadata_define_dataset_builder.py | 40 ++++++++++++- .../define_item_group_dataset_builder.py | 1 + .../define_xml/base_define_xml_reader.py | 21 +++++++ resources/schema/MetaVariables.json | 6 ++ tests/unit/test_define_xml_reader.py | 59 +++++++++++++++++++ 7 files changed, 129 insertions(+), 5 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index e1e07b1b7..64d25fd32 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -109,10 +109,10 @@ def get_dataset_contents(self, **kwargs): def get_define_xml_item_group_metadata_for_dataset( self, dataset_metadata: SDTMDatasetMetadata - ) -> List[dict]: + ) -> dict: """ Gets Define XML item group metadata - returns a list of dictionaries containing the following keys: + returns a dictionary containing the following keys: "define_dataset_name" "define_dataset_label" "define_dataset_location" @@ -120,6 +120,7 @@ def get_define_xml_item_group_metadata_for_dataset( "define_dataset_structure" "define_dataset_is_non_standard" "define_dataset_variables" + "define_dataset_variable_order" "define_dataset_key_sequence" """ @@ -141,6 +142,7 @@ def get_define_xml_item_group_metadata_for_domain(self, domain: str) -> List[dic "define_dataset_structure" "define_dataset_is_non_standard" "define_dataset_variables" + "define_dataset_variable_order" "define_dataset_key_sequence" """ diff --git a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py index b048e3a0e..a64fa8d9a 100644 --- a/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_define_dataset_builder.py @@ -26,6 +26,7 @@ def build(self): define_dataset_name - dataset name from define_xml define_dataset_structure - dataset structure define_dataset_variables - list of variables in the dataset + define_dataset_variable_order - ordered list of variables in the dataset ..., """ diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index 65b5eefb4..b52d3b395 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -1,6 +1,9 @@ from cdisc_rules_engine.models.dataset import DatasetInterface from cdisc_rules_engine.services import logger from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder +from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( + DefineXMLReaderFactory, +) import os import numpy as np @@ -28,8 +31,10 @@ def build(self): define_dataset_class - dataset class define_dataset_structure - dataset structure define_dataset_is_non_standard - whether a dataset is a standard - define_dataset_variables - dataset variables - define_dataset_key_sequence - dataset key sequence + define_dataset_variables - list of variables in the dataset + define_dataset_variable_order - ordered list of variables in the dataset + (ordered by OrderNumber if present, otherwise by XML document order) + define_dataset_key_sequence - ordered list of key sequence variables in the dataset ..., """ @@ -80,12 +85,41 @@ def _get_define_xml_dataframe(self): "define_dataset_class", "define_dataset_structure", "define_dataset_is_non_standard", + "define_dataset_variables", + "define_dataset_variable_order", + "define_dataset_key_sequence", ] define_metadata = self.get_define_metadata() if not define_metadata: logger.info(f"No define_metadata is provided for {__name__}.") return self.dataset_implementation(columns=define_col_order) - return self.dataset_implementation.from_records(define_metadata) + define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( + self.dataset_path, self.define_xml_path, self.data_service, self.cache + ) + enriched_metadata = [] + for basic_metadata in define_metadata: + dataset_name = basic_metadata.get("define_dataset_name") + if dataset_name: + try: + full_metadata = define_xml_reader.extract_dataset_metadata( + dataset_name + ) + enriched_metadata.append(full_metadata) + except Exception as e: + logger.trace(e) + logger.error( + f"Error extracting metadata for {dataset_name}: {str(e)}" + ) + basic_metadata["define_dataset_variables"] = None + basic_metadata["define_dataset_variable_order"] = None + basic_metadata["define_dataset_key_sequence"] = None + enriched_metadata.append(basic_metadata) + else: + basic_metadata["define_dataset_variables"] = None + basic_metadata["define_dataset_variable_order"] = None + basic_metadata["define_dataset_key_sequence"] = None + enriched_metadata.append(basic_metadata) + return self.dataset_implementation.from_records(enriched_metadata) def _ensure_required_columns(self, dataset_df, dataset_col_order): if "dataset_size" not in dataset_df.columns: diff --git a/cdisc_rules_engine/dataset_builders/define_item_group_dataset_builder.py b/cdisc_rules_engine/dataset_builders/define_item_group_dataset_builder.py index 471b1259d..6838b2245 100644 --- a/cdisc_rules_engine/dataset_builders/define_item_group_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/define_item_group_dataset_builder.py @@ -15,6 +15,7 @@ def build(self): "define_dataset_structure" "define_dataset_is_non_standard" "define_dataset_variables" + "define_dataset_variable_order" "define_dataset_key_sequence" """ item_group_metadata: List[dict] = ( diff --git a/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py b/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py index fb57c1ea5..5d08f235c 100644 --- a/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py +++ b/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py @@ -120,6 +120,11 @@ def extract_dataset_metadata(self, dataset_name: Union[str, None] = None) -> dic for item in dataset_metadata.ItemRef if item.ItemOID in item_mapping ] + dataset_metadata_dict["define_dataset_variable_order"] = ( + self._get_ordered_dataset_variables( + dataset_metadata.ItemRef, item_mapping + ) + ) dataset_metadata_dict["define_dataset_key_sequence"] = ( self.get_dataset_key_sequence(dataset_name) ) @@ -148,6 +153,11 @@ def extract_domain_metadata(self, domain_name: str = None) -> dict: for item in domain_metadata.ItemRef if item.ItemOID in item_mapping ] + domain_metadata_dict["define_dataset_variable_order"] = ( + self._get_ordered_dataset_variables( + domain_metadata.ItemRef, item_mapping + ) + ) domain_metadata_dict["define_dataset_key_sequence"] = ( self.get_domain_key_sequence(domain_name) ) @@ -338,6 +348,17 @@ def _get_order_number(self, itemref, index): else: return index + 1 + def _get_ordered_dataset_variables(self, item_refs, item_mapping): + item_refs_with_order = [] + for index, item_ref in enumerate(item_refs): + if item_ref.ItemOID in item_mapping: + order_number = self._get_order_number(item_ref, index) + item_def = item_mapping.get(item_ref.ItemOID) + if item_def: + item_refs_with_order.append((order_number, item_def.Name)) + item_refs_with_order.sort(key=lambda x: x[0]) + return [var_name for _, var_name in item_refs_with_order] + def _get_item_def_representation(self, itemdef, itemref, codelists, index) -> dict: """ Returns item def as a dictionary diff --git a/resources/schema/MetaVariables.json b/resources/schema/MetaVariables.json index 2e3a690a7..2bc76e315 100644 --- a/resources/schema/MetaVariables.json +++ b/resources/schema/MetaVariables.json @@ -29,6 +29,12 @@ { "const": "define_dataset_structure" }, + { + "const": "define_dataset_variables" + }, + { + "const": "define_dataset_variable_order" + }, { "const": "define_variable_allowed_terms" }, diff --git a/tests/unit/test_define_xml_reader.py b/tests/unit/test_define_xml_reader.py index 7a9fbbee7..6fa11d071 100644 --- a/tests/unit/test_define_xml_reader.py +++ b/tests/unit/test_define_xml_reader.py @@ -109,6 +109,19 @@ def test_extract_domain_metadata(filename): "TSVCDREF", "TSVCDVER", ], + "define_dataset_variable_order": [ + "STUDYID", + "DOMAIN", + "TSSEQ", + "TSGRPID", + "TSPARMCD", + "TSPARM", + "TSVAL", + "TSVALNF", + "TSVALCD", + "TSVCDREF", + "TSVCDVER", + ], "define_dataset_key_sequence": ["STUDYID", "TSPARMCD", "TSVAL", "TSSEQ"], } @@ -370,3 +383,49 @@ def test_read_dictionary_version(dictionary_type, expected_version): reader = DefineXMLReaderFactory.from_file_contents(contents) version = reader.get_external_dictionary_version(dictionary_type) assert version == expected_version + + +@pytest.mark.parametrize( + "filename", [(test_define_file_path), (test_define_2_0_file_path)] +) +def test_extract_dataset_metadata_with_ordernumber(filename): + """ + Unit test for DefineXMLReader.extract_dataset_metadata function. + Tests that define_dataset_variable_order respects OrderNumber attributes. + """ + with open(filename, "rb") as file: + contents: bytes = file.read() + reader = DefineXMLReaderFactory.from_file_contents(contents) + dataset_metadata: dict = reader.extract_dataset_metadata(dataset_name="SE") + assert "define_dataset_variables" in dataset_metadata + assert "define_dataset_variable_order" in dataset_metadata + assert isinstance(dataset_metadata["define_dataset_variables"], list) + assert isinstance(dataset_metadata["define_dataset_variable_order"], list) + assert len(dataset_metadata["define_dataset_variables"]) == len( + dataset_metadata["define_dataset_variable_order"] + ) + assert dataset_metadata["define_dataset_variable_order"][0] == "STUDYID" + assert dataset_metadata["define_dataset_variable_order"][1] == "DOMAIN" + assert dataset_metadata["define_dataset_variable_order"][2] == "USUBJID" + assert dataset_metadata["define_dataset_variable_order"][3] == "SESEQ" + assert dataset_metadata["define_dataset_variable_order"][4] == "ETCD" + + +@pytest.mark.parametrize( + "filename", [(test_define_file_path), (test_define_2_0_file_path)] +) +def test_extract_dataset_metadata_without_ordernumber(filename): + """ + Unit test for DefineXMLReader.extract_dataset_metadata function. + Tests that define_dataset_variable_order uses XML document order when OrderNumber is not present. + """ + with open(filename, "rb") as file: + contents: bytes = file.read() + reader = DefineXMLReaderFactory.from_file_contents(contents) + dataset_metadata: dict = reader.extract_dataset_metadata(dataset_name="TS") + assert "define_dataset_variables" in dataset_metadata + assert "define_dataset_variable_order" in dataset_metadata + assert ( + dataset_metadata["define_dataset_variable_order"] + == dataset_metadata["define_dataset_variables"] + ) From 628757d524e1b9af946d005e72afdb51b01bc685 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Fri, 2 Jan 2026 13:40:09 -0500 Subject: [PATCH 2/5] Fix test_validate_dataset_metadata_against_define_xml by mocking DefineXMLReaderFactory --- tests/unit/test_rules_engine.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index c27468bfb..158f23289 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -1335,6 +1335,9 @@ def test_validate_single_dataset_not_equal_to( ), ], ) +@patch( + "cdisc_rules_engine.services.define_xml.define_xml_reader_factory.DefineXMLReaderFactory.get_define_xml_reader" +) @patch( "cdisc_rules_engine.dataset_builders.base_dataset_builder." + "BaseDatasetBuilder.get_define_metadata" @@ -1345,6 +1348,7 @@ def test_validate_single_dataset_not_equal_to( def test_validate_dataset_metadata_against_define_xml( mock_get_dataset_metadata: MagicMock, mock_get_define_xml_metadata_for_domain: MagicMock, + mock_get_define_xml_reader: MagicMock, define_xml_validation_rule: dict, define_xml_metadata: dict, dataset_mock: PandasDataset, @@ -1354,6 +1358,9 @@ def test_validate_dataset_metadata_against_define_xml( Unit test for Define XML validation. Creates an invalid dataset and validates it against Define XML. """ + mock_reader = MagicMock() + mock_reader.extract_dataset_metadata.side_effect = Exception("Mock exception") + mock_get_define_xml_reader.return_value = mock_reader mock_get_define_xml_metadata_for_domain.return_value = define_xml_metadata mock_get_dataset_metadata.return_value = dataset_mock From a45e1a3b6f6bb3a45940dc6017df19fd66f82186 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 6 Jan 2026 10:06:44 -0500 Subject: [PATCH 3/5] Update test_extract_dataset_metadata to include define_dataset_variable_order field --- tests/unit/test_define_xml_reader.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/unit/test_define_xml_reader.py b/tests/unit/test_define_xml_reader.py index 0a6acd58c..d8a763808 100644 --- a/tests/unit/test_define_xml_reader.py +++ b/tests/unit/test_define_xml_reader.py @@ -160,6 +160,19 @@ def test_extract_dataset_metadata(filename): "TSVCDREF", "TSVCDVER", ], + "define_dataset_variable_order": [ + "STUDYID", + "DOMAIN", + "TSSEQ", + "TSGRPID", + "TSPARMCD", + "TSPARM", + "TSVAL", + "TSVALNF", + "TSVALCD", + "TSVCDREF", + "TSVCDVER", + ], "define_dataset_key_sequence": ["STUDYID", "TSPARMCD", "TSVAL", "TSSEQ"], "define_dataset_has_no_data": False, } From 95fcb6f0bc80986c4b025bfe0da8fa82d2f2621a Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 25 Feb 2026 13:28:42 -0500 Subject: [PATCH 4/5] Refactor define metadata fallback to use _ensure_define_metadata_keys and document define_dataset_variable_order in Rule_Type.md --- .../dataset_metadata_define_dataset_builder.py | 15 +++++++-------- resources/schema/rule/Rule_Type.md | 3 +++ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py index 09cfb6a14..fd42ed2d9 100644 --- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py @@ -96,19 +96,18 @@ def _get_define_xml_dataframe(self): logger.error( f"Error extracting metadata for {dataset_name}: {str(e)}" ) - basic_metadata["define_dataset_variables"] = None - basic_metadata["define_dataset_variable_order"] = None - basic_metadata["define_dataset_key_sequence"] = None - basic_metadata["define_dataset_has_no_data"] = None + self._ensure_define_metadata_keys(basic_metadata, define_col_order) enriched_metadata.append(basic_metadata) else: - basic_metadata["define_dataset_variables"] = None - basic_metadata["define_dataset_variable_order"] = None - basic_metadata["define_dataset_key_sequence"] = None - basic_metadata["define_dataset_has_no_data"] = None + self._ensure_define_metadata_keys(basic_metadata, define_col_order) enriched_metadata.append(basic_metadata) return self.dataset_implementation.from_records(enriched_metadata) + def _ensure_define_metadata_keys(self, metadata: dict, col_order: list) -> None: + for key in col_order: + if key not in metadata: + metadata[key] = None + def _ensure_required_columns(self, dataset_df, dataset_col_order): if "dataset_size" not in dataset_df.columns: dataset_df["dataset_size"] = None diff --git a/resources/schema/rule/Rule_Type.md b/resources/schema/rule/Rule_Type.md index 4dd4d5034..18cc012c0 100644 --- a/resources/schema/rule/Rule_Type.md +++ b/resources/schema/rule/Rule_Type.md @@ -46,6 +46,7 @@ Columns are the columns within the original dataset along with the following col - `define_dataset_name` - `define_dataset_structure` - `define_dataset_variables` +- `define_dataset_variable_order` ## Dataset Metadata Check against Define XML @@ -69,6 +70,7 @@ Returns a dataset where each dataset is a row in the new dataset. The define xml - `define_dataset_name` - `define_dataset_structure` - `define_dataset_variables` +- `define_dataset_variable_order` #### Rule Macro @@ -107,6 +109,7 @@ any: - `define_dataset_name` - `define_dataset_structure` - `define_dataset_variables` +- `define_dataset_variable_order` ## Define Item Metadata Check From 032e7056851c1bdb291d06d4f8e4a07f5cb10e76 Mon Sep 17 00:00:00 2001 From: gerrycampion <85252124+gerrycampion@users.noreply.github.com> Date: Wed, 4 Mar 2026 23:41:29 -0500 Subject: [PATCH 5/5] Update test_define_xml_reader.py format --- tests/unit/test_define_xml_reader.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_define_xml_reader.py b/tests/unit/test_define_xml_reader.py index a35a262d7..b31c52a87 100644 --- a/tests/unit/test_define_xml_reader.py +++ b/tests/unit/test_define_xml_reader.py @@ -507,6 +507,8 @@ def test_extract_dataset_metadata_without_ordernumber(filename): dataset_metadata["define_dataset_variable_order"] == dataset_metadata["define_dataset_variables"] ) + + class TestGetExtensibleCodelistMappings: @staticmethod def _make_codelist(name, coded_values_extended, alias_list):