Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
8807f61
Add define_dataset_variable_order with OrderNumber sorting support (#…
RakeshBobba03 Jan 2, 2026
628757d
Fix test_validate_dataset_metadata_against_define_xml by mocking Defi…
RakeshBobba03 Jan 2, 2026
a1d9a3e
Merge branch 'main' into 791-define-variable-order
RakeshBobba03 Jan 6, 2026
a45e1a3
Update test_extract_dataset_metadata to include define_dataset_variab…
RakeshBobba03 Jan 6, 2026
ada1287
Merge branch 'main' into 791-define-variable-order
RamilCDISC Jan 6, 2026
93b3e90
Merge branch 'main' into 791-define-variable-order
RakeshBobba03 Jan 9, 2026
8a6a6d9
Merge branch '791-define-variable-order' of https://github.com/cdisc-…
RakeshBobba03 Jan 9, 2026
f26d614
Auto-updated branch with latest changes from main
SFJohnson24 Jan 12, 2026
c6da795
Auto-updated branch with latest changes from main
SFJohnson24 Jan 12, 2026
859d610
Auto-updated branch with latest changes from main
SFJohnson24 Jan 12, 2026
4ea71f0
Merge branch 'main' into 791-define-variable-order
RakeshBobba03 Jan 12, 2026
13c43f7
Merge branch '791-define-variable-order' of https://github.com/cdisc-…
RakeshBobba03 Jan 12, 2026
be11222
Auto-updated branch with latest changes from main
SFJohnson24 Jan 12, 2026
987d9be
Auto-updated branch with latest changes from main
SFJohnson24 Jan 12, 2026
b945085
Auto-updated branch with latest changes from main
SFJohnson24 Jan 12, 2026
f62e4db
Auto-updated branch with latest changes from main
SFJohnson24 Jan 12, 2026
900b0f1
Auto-updated branch with latest changes from main
SFJohnson24 Jan 13, 2026
154b636
Auto-updated branch with latest changes from main
SFJohnson24 Jan 13, 2026
2ddb716
Auto-updated branch with latest changes from main
SFJohnson24 Jan 13, 2026
269b13c
Auto-updated branch with latest changes from main
SFJohnson24 Jan 14, 2026
c508435
Auto-updated branch with latest changes from main
SFJohnson24 Jan 16, 2026
45e5eef
Auto-updated branch with latest changes from main
SFJohnson24 Jan 19, 2026
48d5da7
Auto-updated branch with latest changes from main
SFJohnson24 Jan 20, 2026
eeb6ff1
Auto-updated branch with latest changes from main
SFJohnson24 Jan 22, 2026
6485272
Auto-updated branch with latest changes from main
SFJohnson24 Jan 26, 2026
0d95692
Auto-updated branch with latest changes from main
SFJohnson24 Jan 26, 2026
17eb51b
Auto-updated branch with latest changes from main
SFJohnson24 Jan 27, 2026
db88366
Auto-updated branch with latest changes from main
SFJohnson24 Jan 27, 2026
94a0241
Auto-updated branch with latest changes from main
SFJohnson24 Jan 28, 2026
e8b5790
Auto-updated branch with latest changes from main
SFJohnson24 Jan 28, 2026
16470e3
Auto-updated branch with latest changes from main
SFJohnson24 Jan 30, 2026
6e748f1
Auto-updated branch with latest changes from main
SFJohnson24 Jan 30, 2026
f477744
Auto-updated branch with latest changes from main
SFJohnson24 Jan 30, 2026
dd3a48c
Auto-updated branch with latest changes from main
SFJohnson24 Jan 30, 2026
722f0b8
Auto-updated branch with latest changes from main
SFJohnson24 Feb 1, 2026
f87b2f3
Auto-updated branch with latest changes from main
SFJohnson24 Feb 2, 2026
6626ca4
Merge branch 'main' into 791-define-variable-order
RakeshBobba03 Feb 11, 2026
bdd7466
Merge branch '791-define-variable-order' of https://github.com/cdisc-…
RakeshBobba03 Feb 11, 2026
57d4241
Merge branch 'main' into 791-define-variable-order
RakeshBobba03 Feb 16, 2026
ad980de
Merge branch 'main' into 791-define-variable-order
RakeshBobba03 Feb 17, 2026
406ed17
Merge branch 'main' into 791-define-variable-order
RakeshBobba03 Feb 20, 2026
52990d5
Merge branch 'main' into 791-define-variable-order
RakeshBobba03 Feb 25, 2026
95fcb6f
Refactor define metadata fallback to use _ensure_define_metadata_keys…
RakeshBobba03 Feb 25, 2026
51570ab
Merge branch 'main' into 791-define-variable-order
gerrycampion Mar 5, 2026
032e705
Update test_define_xml_reader.py format
gerrycampion Mar 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions cdisc_rules_engine/dataset_builders/base_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,17 +109,18 @@ def get_dataset_contents(self, **kwargs):

def get_define_xml_item_group_metadata_for_dataset(
self, dataset_metadata: SDTMDatasetMetadata
) -> List[dict]:
) -> dict:
"""
Gets Define XML item group metadata
returns a list of dictionaries containing the following keys:
returns a dictionary containing the following keys:
"define_dataset_name"
"define_dataset_label"
"define_dataset_location"
"define_dataset_class"
"define_dataset_structure"
"define_dataset_is_non_standard"
"define_dataset_variables"
"define_dataset_variable_order"
"define_dataset_key_sequence"
"define_dataset_has_no_data"
"""
Expand All @@ -142,6 +143,7 @@ def get_define_xml_item_group_metadata_for_domain(self, domain: str) -> List[dic
"define_dataset_structure"
"define_dataset_is_non_standard"
"define_dataset_variables"
"define_dataset_variable_order"
"define_dataset_key_sequence"
"define_dataset_has_no_data"
"""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def build(self):
define_dataset_name - dataset name from define_xml
define_dataset_structure - dataset structure
define_dataset_variables - list of variables in the dataset
define_dataset_variable_order - ordered list of variables in the dataset

...,
"""
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from cdisc_rules_engine.services import logger
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import (
DefineXMLReaderFactory,
)
import numpy as np


Expand Down Expand Up @@ -27,9 +30,13 @@ def build(self):
define_dataset_class - dataset class
define_dataset_structure - dataset structure
define_dataset_is_non_standard - whether a dataset is a standard
define_dataset_variables - dataset variables
define_dataset_key_sequence - dataset key sequence
define_dataset_has_no_data
define_dataset_variables - list of variables in the dataset
define_dataset_variable_order - ordered list of variables in the dataset
(ordered by OrderNumber if present, otherwise by XML document order)
define_dataset_key_sequence - ordered list of key sequence variables in the dataset
define_dataset_has_no_data - whether a dataset has no data

...,
"""
# 1. Build define xml dataframe
define_df = self._get_define_xml_dataframe()
Expand Down Expand Up @@ -63,13 +70,43 @@ def _get_define_xml_dataframe(self):
"define_dataset_class",
"define_dataset_structure",
"define_dataset_is_non_standard",
"define_dataset_variables",
"define_dataset_variable_order",
"define_dataset_key_sequence",
"define_dataset_has_no_data",
]
define_metadata = self.get_define_metadata()
if not define_metadata:
logger.info(f"No define_metadata is provided for {__name__}.")
return self.dataset_implementation(columns=define_col_order)
return self.dataset_implementation.from_records(define_metadata)
define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader(
self.dataset_path, self.define_xml_path, self.data_service, self.cache
)
enriched_metadata = []
for basic_metadata in define_metadata:
dataset_name = basic_metadata.get("define_dataset_name")
if dataset_name:
try:
full_metadata = define_xml_reader.extract_dataset_metadata(
dataset_name
)
enriched_metadata.append(full_metadata)
except Exception as e:
logger.trace(e)
logger.error(
f"Error extracting metadata for {dataset_name}: {str(e)}"
)
self._ensure_define_metadata_keys(basic_metadata, define_col_order)
enriched_metadata.append(basic_metadata)
else:
self._ensure_define_metadata_keys(basic_metadata, define_col_order)
enriched_metadata.append(basic_metadata)
return self.dataset_implementation.from_records(enriched_metadata)

def _ensure_define_metadata_keys(self, metadata: dict, col_order: list) -> None:
for key in col_order:
if key not in metadata:
metadata[key] = None

def _ensure_required_columns(self, dataset_df, dataset_col_order):
if "dataset_size" not in dataset_df.columns:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ def build(self):
"define_dataset_structure"
"define_dataset_is_non_standard"
"define_dataset_variables"
"define_dataset_variable_order"
"define_dataset_key_sequence"
"define_dataset_has_no_data"
"""
Expand Down
21 changes: 21 additions & 0 deletions cdisc_rules_engine/services/define_xml/base_define_xml_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,11 @@ def extract_dataset_metadata(self, dataset_name: Union[str, None] = None) -> dic
for item in dataset_metadata.ItemRef
if item.ItemOID in item_mapping
]
dataset_metadata_dict["define_dataset_variable_order"] = (
self._get_ordered_dataset_variables(
dataset_metadata.ItemRef, item_mapping
)
)
dataset_metadata_dict["define_dataset_key_sequence"] = (
self.get_dataset_key_sequence(dataset_name)
)
Expand Down Expand Up @@ -148,6 +153,11 @@ def extract_domain_metadata(self, domain_name: str = None) -> dict:
for item in domain_metadata.ItemRef
if item.ItemOID in item_mapping
]
domain_metadata_dict["define_dataset_variable_order"] = (
self._get_ordered_dataset_variables(
domain_metadata.ItemRef, item_mapping
)
)
domain_metadata_dict["define_dataset_key_sequence"] = (
self.get_domain_key_sequence(domain_name)
)
Expand Down Expand Up @@ -338,6 +348,17 @@ def _get_order_number(self, itemref, index):
else:
return index + 1

def _get_ordered_dataset_variables(self, item_refs, item_mapping):
item_refs_with_order = []
for index, item_ref in enumerate(item_refs):
if item_ref.ItemOID in item_mapping:
order_number = self._get_order_number(item_ref, index)
item_def = item_mapping.get(item_ref.ItemOID)
if item_def:
item_refs_with_order.append((order_number, item_def.Name))
item_refs_with_order.sort(key=lambda x: x[0])
return [var_name for _, var_name in item_refs_with_order]

def _get_item_def_representation(self, itemdef, itemref, codelists, index) -> dict:
"""
Returns item def as a dictionary
Expand Down
6 changes: 6 additions & 0 deletions resources/schema/rule/MetaVariables.json
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@
{
"const": "define_dataset_structure"
},
{
"const": "define_dataset_variables"
},
{
"const": "define_dataset_variable_order"
},
{
"const": "define_variable_allowed_terms"
},
Expand Down
3 changes: 3 additions & 0 deletions resources/schema/rule/Rule_Type.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ Columns are the columns within the original dataset along with the following col
- `define_dataset_name`
- `define_dataset_structure`
- `define_dataset_variables`
- `define_dataset_variable_order`

## Dataset Metadata Check against Define XML

Expand All @@ -69,6 +70,7 @@ Returns a dataset where each dataset is a row in the new dataset. The define xml
- `define_dataset_name`
- `define_dataset_structure`
- `define_dataset_variables`
- `define_dataset_variable_order`

#### Rule Macro

Expand Down Expand Up @@ -107,6 +109,7 @@ any:
- `define_dataset_name`
- `define_dataset_structure`
- `define_dataset_variables`
- `define_dataset_variable_order`

## Define Item Metadata Check

Expand Down
74 changes: 73 additions & 1 deletion tests/unit/test_define_xml_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,19 @@ def test_extract_domain_metadata(filename):
"TSVCDREF",
"TSVCDVER",
],
"define_dataset_variable_order": [
"STUDYID",
"DOMAIN",
"TSSEQ",
"TSGRPID",
"TSPARMCD",
"TSPARM",
"TSVAL",
"TSVALNF",
"TSVALCD",
"TSVCDREF",
"TSVCDVER",
],
"define_dataset_key_sequence": ["STUDYID", "TSPARMCD", "TSVAL", "TSSEQ"],
"define_dataset_has_no_data": False,
}
Expand Down Expand Up @@ -148,6 +161,19 @@ def test_extract_dataset_metadata(filename):
"TSVCDREF",
"TSVCDVER",
],
"define_dataset_variable_order": [
"STUDYID",
"DOMAIN",
"TSSEQ",
"TSGRPID",
"TSPARMCD",
"TSPARM",
"TSVAL",
"TSVALNF",
"TSVALCD",
"TSVCDREF",
"TSVCDVER",
],
"define_dataset_key_sequence": ["STUDYID", "TSPARMCD", "TSVAL", "TSSEQ"],
"define_dataset_has_no_data": False,
}
Expand Down Expand Up @@ -210,7 +236,7 @@ def test_extract_variable_metadata(filename):
assert variable["define_variable_order_number"] == index + 1


@pytest.mark.parametrize("filename", [(test_define_file_path)])
@pytest.mark.parametrize("filename", [test_define_file_path])
def test_extract_variable_metadata_when_one_ordernumber_non_1_based(filename):
"""
Unit test for DefineXMLReader.extract_domain_metadata function.
Expand Down Expand Up @@ -437,6 +463,52 @@ def test_extract_domain_metadata_nv_has_no_data(filename, has_no_data):
assert v in domain_metadata["define_dataset_variables"]


@pytest.mark.parametrize(
"filename", [(test_define_file_path), (test_define_2_0_file_path)]
)
def test_extract_dataset_metadata_with_ordernumber(filename):
"""
Unit test for DefineXMLReader.extract_dataset_metadata function.
Tests that define_dataset_variable_order respects OrderNumber attributes.
"""
with open(filename, "rb") as file:
contents: bytes = file.read()
reader = DefineXMLReaderFactory.from_file_contents(contents)
dataset_metadata: dict = reader.extract_dataset_metadata(dataset_name="SE")
assert "define_dataset_variables" in dataset_metadata
assert "define_dataset_variable_order" in dataset_metadata
assert isinstance(dataset_metadata["define_dataset_variables"], list)
assert isinstance(dataset_metadata["define_dataset_variable_order"], list)
assert len(dataset_metadata["define_dataset_variables"]) == len(
dataset_metadata["define_dataset_variable_order"]
)
assert dataset_metadata["define_dataset_variable_order"][0] == "STUDYID"
assert dataset_metadata["define_dataset_variable_order"][1] == "DOMAIN"
assert dataset_metadata["define_dataset_variable_order"][2] == "USUBJID"
assert dataset_metadata["define_dataset_variable_order"][3] == "SESEQ"
assert dataset_metadata["define_dataset_variable_order"][4] == "ETCD"


@pytest.mark.parametrize(
"filename", [(test_define_file_path), (test_define_2_0_file_path)]
)
def test_extract_dataset_metadata_without_ordernumber(filename):
"""
Unit test for DefineXMLReader.extract_dataset_metadata function.
Tests that define_dataset_variable_order uses XML document order when OrderNumber is not present.
"""
with open(filename, "rb") as file:
contents: bytes = file.read()
reader = DefineXMLReaderFactory.from_file_contents(contents)
dataset_metadata: dict = reader.extract_dataset_metadata(dataset_name="TS")
assert "define_dataset_variables" in dataset_metadata
assert "define_dataset_variable_order" in dataset_metadata
assert (
dataset_metadata["define_dataset_variable_order"]
== dataset_metadata["define_dataset_variables"]
)


class TestGetExtensibleCodelistMappings:
@staticmethod
def _make_codelist(name, coded_values_extended, alias_list):
Expand Down
7 changes: 7 additions & 0 deletions tests/unit/test_rules_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -1335,6 +1335,9 @@ def test_validate_single_dataset_not_equal_to(
),
],
)
@patch(
"cdisc_rules_engine.services.define_xml.define_xml_reader_factory.DefineXMLReaderFactory.get_define_xml_reader"
)
@patch(
"cdisc_rules_engine.dataset_builders.base_dataset_builder."
+ "BaseDatasetBuilder.get_define_metadata"
Expand All @@ -1345,6 +1348,7 @@ def test_validate_single_dataset_not_equal_to(
def test_validate_dataset_metadata_against_define_xml(
mock_get_dataset_metadata: MagicMock,
mock_get_define_xml_metadata_for_domain: MagicMock,
mock_get_define_xml_reader: MagicMock,
define_xml_validation_rule: dict,
define_xml_metadata: dict,
dataset_mock: PandasDataset,
Expand All @@ -1354,6 +1358,9 @@ def test_validate_dataset_metadata_against_define_xml(
Unit test for Define XML validation.
Creates an invalid dataset and validates it against Define XML.
"""
mock_reader = MagicMock()
mock_reader.extract_dataset_metadata.side_effect = Exception("Mock exception")
mock_get_define_xml_reader.return_value = mock_reader
mock_get_define_xml_metadata_for_domain.return_value = define_xml_metadata
mock_get_dataset_metadata.return_value = dataset_mock

Expand Down
Loading