From 0388ab2720c9ed29452207307d601fb58dad2edf Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Mon, 17 Nov 2025 18:37:51 +0100
Subject: [PATCH 01/10] fix dataset metadata handling in DatasetMetadataDefineDatasetBuilder

---
 ...dataset_metadata_define_dataset_builder.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
index 177ed5c5c..d7b45ca3e 100644
--- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
+++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
@@ -2,6 +2,7 @@
 from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
 import os
 import numpy as np
+import pandas as pd


 class DatasetMetadataDefineDatasetBuilder(BaseDatasetBuilder):
@@ -93,19 +94,24 @@ def _get_dataset_dataframe(self):
         else:
             datasets = self.dataset_implementation()
             for dataset in self.datasets:
+                ds_metadata = None
                 try:
                     ds_metadata = self.data_service.get_dataset_metadata(
-                        dataset.filename
+                        dataset_name=dataset.filename
+                    )
+                    ds_metadata.data["dataset_domain"] = getattr(
+                        dataset, "domain", None
                     )
-                    ds_metadata.data["dataset_domain"] = dataset.domain
                 except Exception as e:
                     logger.trace(e)
                     logger.error(f"Error: {e}. Error message: {str(e)}")
-                datasets.data = (
-                    ds_metadata.data
-                    if datasets.data.empty
-                    else datasets.data.append(ds_metadata.data)
-                )
+                if ds_metadata:
+                    if datasets.data.empty:
+                        datasets.data = ds_metadata.data.copy()
+                    else:
+                        datasets.data = pd.concat(
+                            [datasets.data, ds_metadata.data], ignore_index=True
+                        )

         if datasets.data.empty or len(datasets.data) == 0:
             dataset_df = self.dataset_implementation(columns=dataset_col_order)

From 64897232a7593f21a846b388705111c5bd3f295e Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Tue, 18 Nov 2025 23:03:59 +0100
Subject: [PATCH 02/10] refactor dataset handling in DatasetMetadataDefineDatasetBuilder and enhance to_parquet method in DummyDataService

---
 ...dataset_metadata_define_dataset_builder.py |  6 +-----
 .../data_services/dummy_data_service.py       | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
index d7b45ca3e..f80337306 100644
--- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
+++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
@@ -2,7 +2,6 @@
 from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
 import os
 import numpy as np
-import pandas as pd


 class DatasetMetadataDefineDatasetBuilder(BaseDatasetBuilder):
@@ -109,10 +108,7 @@ def _get_dataset_dataframe(self):
                     if datasets.data.empty:
                         datasets.data = ds_metadata.data.copy()
                     else:
-                        datasets.data = pd.concat(
-                            [datasets.data, ds_metadata.data], ignore_index=True
-                        )
-
+                        datasets.data = datasets.concat(ds_metadata).data
         if datasets.data.empty or len(datasets.data) == 0:
             dataset_df = self.dataset_implementation(columns=dataset_col_order)
             logger.info(f"No datasets metadata is provided for {__name__}.")
diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py
index f22247f97..f163b0a14 100644
--- a/cdisc_rules_engine/services/data_services/dummy_data_service.py
+++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py
@@ -4,7 +4,7 @@
 import os

 import pandas as pd
-
+import tempfile
 from cdisc_rules_engine.dummy_models.dummy_dataset import DummyDataset
 from cdisc_rules_engine.exceptions.custom_exceptions import (
     DatasetNotFoundError,
@@ -156,7 +156,22 @@ def __get_dataset_metadata(self, dataset_name: str, **kwargs) -> dict:
         return metadata_to_return

     def to_parquet(self, file_path: str) -> str:
-        return ""
+        """
+        Save the dataset with full_path == file_path to a parquet file.
+        Returns the number of rows and the path to the saved parquet file, or (0, "") if not found.
+        """
+        for dataset in self.data:
+            if hasattr(dataset, "full_path") and dataset.full_path == file_path:
+                # Convert the DummyDataset's data (assumed to be a DataFrame or dict-like) to a pandas DataFrame
+                if hasattr(dataset, "data"):
+                    df = pd.DataFrame(dataset.data)
+                else:
+                    # fallback: try to convert the whole object to dict
+                    df = pd.DataFrame([dataset.__dict__])
+                temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")
+                df.to_parquet(temp_file.name)
+                return len(df.index), temp_file.name
+        return 0, ""

     def get_datasets(self) -> Iterable[SDTMDatasetMetadata]:
         return self.data

From 6627c7ff57b2fed16f772aaabc1dffcde94918d4 Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Wed, 19 Nov 2025 14:09:52 +0100
Subject: [PATCH 03/10] fix index error

---
 .../dataset_metadata_define_dataset_builder.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
index f80337306..efe568136 100644
--- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
+++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
@@ -1,3 +1,4 @@
+from cdisc_rules_engine.models.dataset import DatasetInterface
 from cdisc_rules_engine.services import logger
 from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
 import os
 import numpy as np
@@ -55,9 +56,14 @@ def build(self):
             if self.dataset_metadata.full_path
             else None
         )
-        matching_row = merged_cleaned[
+        matching_row: DatasetInterface = merged_cleaned[
             merged_cleaned["dataset_location"].str.lower() == dataset_filename
         ]
+        if matching_row.empty:
+            matching_row: DatasetInterface = merged_cleaned[
+                merged_cleaned["dataset_domain"].str.upper()
+                == self.dataset_metadata.domain.upper()
+            ]
         for column in merged.columns:
             merged[column] = matching_row[column].iloc[0]
         return merged

From fc8f5a2511e6ebc5c4cb9dd2e6f9bc17b6bb7912 Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Wed, 19 Nov 2025 14:10:48 +0100
Subject: [PATCH 04/10] add comment

---
 .../dataset_builders/dataset_metadata_define_dataset_builder.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
index efe568136..49df0dd1d 100644
--- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
+++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
@@ -60,6 +60,7 @@ def build(self):
             merged_cleaned["dataset_location"].str.lower() == dataset_filename
         ]
         if matching_row.empty:
+            # when using DASK dataset_filename refers to temp parquet filename
             matching_row: DatasetInterface = merged_cleaned[
                 merged_cleaned["dataset_domain"].str.upper()
                 == self.dataset_metadata.domain.upper()

From b2e0d52495d5665534431b37312bbb92fb9a0f17 Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Fri, 21 Nov 2025 10:44:25 +0100
Subject: [PATCH 05/10] fix dataset metadata validation in test_rules_engine.py

---
 tests/unit/test_rules_engine.py | 49 +++++++++++++++++----------------
 1 file changed, 25 insertions(+), 24 deletions(-)

diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py
index 24183c0ff..3a6db9361 100644
--- a/tests/unit/test_rules_engine.py
+++ b/tests/unit/test_rules_engine.py
@@ -1278,31 +1278,38 @@ def test_validate_single_dataset_not_equal_to(
         PandasDataset(
             pd.DataFrame.from_dict(
                 {
-                    "dataset_name": [
-                        "AE",
-                    ],
-                    "dataset_label": [
-                        "Adverse Events",
-                    ],
-                    "dataset_location": [
-                        "te.xpt",
-                    ],
+                    "dataset_name": ["AE"],
+                    "dataset_label": ["Adverse Events"],
+                    "dataset_location": ["te.xpt"],
                 }
             )
         ),
         [
             {
-                "executionStatus": "execution_error",
+                "executionStatus": "success",
                 "dataset": "ae.xpt",
                 "domain": "AE",
-                "variables": [],
-                "message": "rule execution error",
+                "variables": ["dataset_label", "dataset_location", "dataset_name"],
+                "message": "Dataset metadata does not correspond to Define XML",
                 "errors": [
                     {
+                        "value": {
+                            "dataset_name": "AE",
+                            "dataset_location": "te.xpt",
+                            "dataset_label": "Adverse Events",
+                        },
                         "dataset": "ae.xpt",
-                        "error": "An unknown exception has occurred",
-                        "message": "single positional indexer is out-of-bounds",
-                    }
+                        "row": 1,
+                    },
+                    {
+                        "value": {
+                            "dataset_name": "AE",
+                            "dataset_location": "te.xpt",
+                            "dataset_label": "Adverse Events",
+                        },
+                        "dataset": "ae.xpt",
+                        "row": 2,
+                    },
                 ],
             }
         ],
@@ -1318,15 +1325,9 @@ def test_validate_single_dataset_not_equal_to(
         PandasDataset(
             pd.DataFrame.from_dict(
                 {
-                    "dataset_name": [
-                        "AE",
-                    ],
-                    "dataset_label": [
-                        "Adverse Events",
-                    ],
-                    "dataset_location": [
-                        "ae.xpt",
-                    ],
+                    "dataset_name": ["AE"],
+                    "dataset_label": ["Adverse Events"],
+                    "dataset_location": ["ae.xpt"],
                 }
             )
         ),

From bde3df2c1cce87965dd0aa5f5dacfe0d6366adfb Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Tue, 25 Nov 2025 18:37:11 +0100
Subject: [PATCH 06/10] use dataset metadata original_path in DatasetMetadataDefineDatasetBuilder

---
 .../dataset_metadata_define_dataset_builder.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
index 49df0dd1d..d36838a5e 100644
--- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
+++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
@@ -62,8 +62,8 @@ def build(self):
         if matching_row.empty:
             # when using DASK dataset_filename refers to temp parquet filename
             matching_row: DatasetInterface = merged_cleaned[
-                merged_cleaned["dataset_domain"].str.upper()
-                == self.dataset_metadata.domain.upper()
+                merged_cleaned["dataset_location"].str.lower()
+                == self.dataset_metadata.original_path.lower()
             ]
         for column in merged.columns:
             merged[column] = matching_row[column].iloc[0]
         return merged

From 7b4e98ba7374d218cbc8caf24073de817bd4cdf5 Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Wed, 26 Nov 2025 14:56:30 +0100
Subject: [PATCH 07/10] fallback to dataset_domain if original_path is none

---
 .../dataset_metadata_define_dataset_builder.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
index d36838a5e..6e9456537 100644
--- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
+++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
@@ -61,10 +61,16 @@ def build(self):
         ]
         if matching_row.empty:
             # when using DASK dataset_filename refers to temp parquet filename
-            matching_row: DatasetInterface = merged_cleaned[
-                merged_cleaned["dataset_location"].str.lower()
-                == self.dataset_metadata.original_path.lower()
-            ]
+            if self.dataset_metadata.original_path:
+                matching_row: DatasetInterface = merged_cleaned[
+                    merged_cleaned["dataset_location"].str.lower()
+                    == self.dataset_metadata.original_path.lower()
+                ]
+            else:
+                matching_row: DatasetInterface = merged_cleaned[
+                    merged_cleaned["dataset_domain"].str.upper()
+                    == self.dataset_metadata.domain.upper()
+                ]
         for column in merged.columns:
             merged[column] = matching_row[column].iloc[0]
         return merged

From fa17093bedb716b2d9f0413bae3ff23f9d86363b Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Wed, 26 Nov 2025 16:44:26 +0100
Subject: [PATCH 08/10] use dataset_metadata.original_path

---
 .../dataset_metadata_define_dataset_builder.py | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
index 6e9456537..d36838a5e 100644
--- a/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
+++ b/cdisc_rules_engine/dataset_builders/dataset_metadata_define_dataset_builder.py
@@ -61,16 +61,10 @@ def build(self):
         ]
         if matching_row.empty:
             # when using DASK dataset_filename refers to temp parquet filename
-            if self.dataset_metadata.original_path:
-                matching_row: DatasetInterface = merged_cleaned[
-                    merged_cleaned["dataset_location"].str.lower()
-                    == self.dataset_metadata.original_path.lower()
-                ]
-            else:
-                matching_row: DatasetInterface = merged_cleaned[
-                    merged_cleaned["dataset_domain"].str.upper()
-                    == self.dataset_metadata.domain.upper()
-                ]
+            matching_row: DatasetInterface = merged_cleaned[
+                merged_cleaned["dataset_location"].str.lower()
+                == self.dataset_metadata.original_path.lower()
+            ]
         for column in merged.columns:
             merged[column] = matching_row[column].iloc[0]
         return merged

From f776200fced6182a904760658df41e1d019c6d79 Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Wed, 26 Nov 2025 16:47:00 +0100
Subject: [PATCH 09/10] fix test_validate_dataset_metadata_against_define_xml

---
 tests/unit/test_rules_engine.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py
index 186ee8ec6..f89f43d65 100644
--- a/tests/unit/test_rules_engine.py
+++ b/tests/unit/test_rules_engine.py
@@ -1371,6 +1371,7 @@ def test_validate_dataset_metadata_against_define_xml(
         first_record={"DOMAIN": "AE"},
         full_path="CDISC01/test/ae.xpt",
         filename="ae.xpt",
+        original_path="ae.xpt",
     )
     validation_result: List[dict] = RulesEngine(
         standard="sdtmig"

From b5d373adb88f1ed8ef910ae9fd6ec148231322dc Mon Sep 17 00:00:00 2001
From: alexfurmenkov
Date: Wed, 26 Nov 2025 17:13:20 +0100
Subject: [PATCH 10/10] fix test_validate_dataset_metadata_against_define_xml

---
 tests/unit/test_rules_engine.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py
index f89f43d65..c27468bfb 100644
--- a/tests/unit/test_rules_engine.py
+++ b/tests/unit/test_rules_engine.py
@@ -1279,8 +1279,8 @@ def test_validate_single_dataset_not_equal_to(
             pd.DataFrame.from_dict(
                 {
                     "dataset_name": ["AE"],
-                    "dataset_label": ["Adverse Events"],
-                    "dataset_location": ["te.xpt"],
+                    "dataset_label": ["Adverse"],
+                    "dataset_location": ["ae.xpt"],
                 }
             )
         ),
         [
             {
                 "executionStatus": "success",
                 "dataset": "ae.xpt",
                 "domain": "AE",
                 "variables": ["dataset_label", "dataset_location", "dataset_name"],
                 "message": "Dataset metadata does not correspond to Define XML",
                 "errors": [
                     {
                         "value": {
                             "dataset_name": "AE",
-                            "dataset_location": "te.xpt",
-                            "dataset_label": "Adverse Events",
+                            "dataset_location": "ae.xpt",
+                            "dataset_label": "Adverse",
                         },
                         "dataset": "ae.xpt",
                         "row": 1,
                     },
-                    {
-                        "value": {
-                            "dataset_name": "AE",
-                            "dataset_location": "te.xpt",
-                            "dataset_label": "Adverse Events",
-                        },
-                        "dataset": "ae.xpt",
-                        "row": 2,
-                    },
                 ],
             }
         ],