From 6ad14ad507c2ade55fbfcd71ff2d2568f8fa7e8e Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 18 Feb 2026 16:29:32 -0500 Subject: [PATCH 01/10] #1616: User-friendly errors for missing/wrong Datasets tab and columns, standard validation, and library metadata lookup failures --- TestRule/__init__.py | 64 +++++-- cdisc_rules_engine/enums/standard_types.py | 11 ++ .../exceptions/custom_exceptions.py | 35 ++++ .../data_services/excel_data_service.py | 78 ++++++--- .../data_services/local_data_service.py | 32 ++-- core.py | 10 ++ scripts/run_validation.py | 66 ++++++-- scripts/script_utils.py | 11 ++ .../unit/test_datasets_payload_validation.py | 160 ++++++++++++++++++ .../test_excel_data_service.py | 90 ++++++++++ .../test_local_data_service.py | 18 ++ 11 files changed, 501 insertions(+), 74 deletions(-) create mode 100644 cdisc_rules_engine/enums/standard_types.py create mode 100644 tests/unit/test_datasets_payload_validation.py diff --git a/TestRule/__init__.py b/TestRule/__init__.py index 0ccac2d0d..fea65e522 100644 --- a/TestRule/__init__.py +++ b/TestRule/__init__.py @@ -6,6 +6,13 @@ from cdisc_rules_engine.services.cdisc_library_service import CDISCLibraryService from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator from scripts.run_validation import run_single_rule_validation +from cdisc_rules_engine.exceptions.custom_exceptions import ( + LibraryMetadataNotFoundError, + library_metadata_not_found_message, +) +from cdisc_library_client.custom_exceptions import ( + ResourceNotFoundException as LibraryResourceNotFoundException, +) import json import os import asyncio @@ -17,11 +24,18 @@ class BadRequestError(Exception): pass +_DATASETS_TAB_GUIDANCE = ( + "Make sure there is a 'Datasets' tab in your test data workbook (name is " + "case-sensitive). The 'Datasets' tab must have column headers: 'Filename', " + "'Label', and 'Dataset Name' (also case-sensitive)." 
+) +_REQUIRED_DATASET_KEYS = {"filename", "label", "domain", "records", "variables"} + + def validate_datasets_payload(datasets): - required_keys = {"filename", "label", "domain", "records", "variables"} missing_keys = set() for dataset in datasets: - for key in required_keys: + for key in _REQUIRED_DATASET_KEYS: if key not in dataset: missing_keys.add(key) @@ -32,19 +46,38 @@ def validate_datasets_payload(datasets): ) if missing_keys: - raise KeyError( - f"one or more datasets missing the following keys {missing_keys}" + missing_list = sorted(missing_keys) + raise BadRequestError( + f"Test data is missing required dataset properties: {missing_list}. " + f"This usually means the 'Datasets' sheet in your Excel file is missing " + f"or has incorrect column headers. {_DATASETS_TAB_GUIDANCE}" ) def handle_exception(e: Exception): - if isinstance(e, KeyError): + if isinstance(e, BadRequestError): + return func.HttpResponse( + json.dumps({"error": "BadRequestError", "message": str(e)}), + status_code=400, + ) + if isinstance(e, LibraryMetadataNotFoundError): + msg = getattr(e, "message", None) or getattr(e, "description", None) or str(e) return func.HttpResponse( - json.dumps({"error": "KeyError", "message": str(e)}), status_code=400 + json.dumps( + { + "error": "LibraryMetadataNotFoundError", + "message": msg, + } + ), + status_code=400, ) - elif isinstance(e, BadRequestError): + if isinstance(e, KeyError): + msg = str(e) + if "rule" in msg.lower() or "datasets" in msg.lower(): + msg = f"{msg} Ensure the request body includes the required JSON keys." 
return func.HttpResponse( - json.dumps({"error": "BadRequestError", "message": str(e)}), status_code=400 + json.dumps({"error": "BadRequestError", "message": msg}), + status_code=400, ) else: return func.HttpResponse( @@ -97,11 +130,18 @@ def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: # asyncio.run(cache_populator.load_available_ct_packages()) if standards_data or codelists: if standards_data: - asyncio.run( - cache_populator.load_standard( - standard, standard_version, standard_substandard + try: + asyncio.run( + cache_populator.load_standard( + standard, standard_version, standard_substandard + ) + ) + except LibraryResourceNotFoundException: + raise LibraryMetadataNotFoundError( + library_metadata_not_found_message( + standard, standard_version, standard_substandard + ) ) - ) asyncio.run(cache_populator.load_codelists(codelists)) if not rule: raise KeyError("'rule' required in request") diff --git a/cdisc_rules_engine/enums/standard_types.py b/cdisc_rules_engine/enums/standard_types.py new file mode 100644 index 000000000..a52333f9c --- /dev/null +++ b/cdisc_rules_engine/enums/standard_types.py @@ -0,0 +1,11 @@ +from cdisc_rules_engine.enums.base_enum import BaseEnum + + +class StandardTypes(BaseEnum): + """Standards supported by CDISC Library; used for CLI validation when not using --custom-standard.""" + + SDTMIG = "sdtmig" + SENDIG = "sendig" + ADAM = "adam" + CDASHIG = "cdashig" + TIG = "tig" diff --git a/cdisc_rules_engine/exceptions/custom_exceptions.py b/cdisc_rules_engine/exceptions/custom_exceptions.py index e3e5b79d5..c7fa7f031 100644 --- a/cdisc_rules_engine/exceptions/custom_exceptions.py +++ b/cdisc_rules_engine/exceptions/custom_exceptions.py @@ -40,6 +40,28 @@ class VariableMetadataNotFoundError(EngineError): ) +LIBRARY_METADATA_NOT_FOUND_HINT = ( + "Check your standard/version (CLI) or Library tab values (editor)." 
+) + + +def library_metadata_not_found_message(standard, version, substandard=None): + version_display = (version or "").replace("-", ".") + sub_part = f" substandard {substandard}" if substandard else "" + return ( + f"No library metadata found for standard '{standard}' " + f"version '{version_display}'{sub_part}. {LIBRARY_METADATA_NOT_FOUND_HINT}" + ) + + +class LibraryMetadataNotFoundError(EngineError): + code = 400 + description = ( + "Library metadata not found for the provided standard and version combination. " + f"{LIBRARY_METADATA_NOT_FOUND_HINT}" + ) + + class DomainNotFoundError(EngineError): """Raised when a required domain is not found in the dataset""" @@ -57,11 +79,24 @@ class InvalidDatasetFormat(EngineError): description = "Dataset data is malformed." +INVALID_DATASET_FORMAT_REASON = ( + "may be corrupted, incorrectly formatted, or encoded with an unexpected encoding." +) + + class InvalidJSONFormat(EngineError): code = 400 description = "JSON data is malformed." +class ExcelTestDataError(EngineError): + code = 400 + description = ( + "Excel test data file is missing required sheets or column headers. " + "Sheet and column names are case-sensitive." 
+ ) + + class NumberOfAttemptsExceeded(EngineError): pass diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index dbfa60567..b4e79354c 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -14,7 +14,7 @@ from cdisc_rules_engine.models.variable_metadata_container import ( VariableMetadataContainer, ) -from cdisc_rules_engine.services import logger +from cdisc_rules_engine.exceptions.custom_exceptions import ExcelTestDataError from cdisc_rules_engine.services.data_readers.data_reader_factory import ( DataReaderFactory, ) @@ -25,6 +25,11 @@ DATASET_LABEL_COLUMN = "Label" DATASET_NAME_COLUMN = "Dataset Name" +DATASETS_SHEET_REQUIRED_COLUMNS = ( + DATASET_FILENAME_COLUMN, + DATASET_LABEL_COLUMN, +) + class ExcelDataService(BaseDataService): _instance = None @@ -125,15 +130,19 @@ def get_raw_dataset_metadata( """ Returns dataset metadata as DatasetMetadata instance. 
""" - datasets_worksheet = pd.read_excel( - self.dataset_path, - sheet_name=DATASETS_SHEET_NAME, - na_values=[""], - keep_default_na=False, - ) - metadata = datasets_worksheet[ - datasets_worksheet[DATASET_FILENAME_COLUMN] == dataset_name - ] + _worksheet = kwargs.get("_worksheet") + if _worksheet is not None: + metadata = _worksheet[_worksheet[DATASET_FILENAME_COLUMN] == dataset_name] + else: + datasets_worksheet = pd.read_excel( + self.dataset_path, + sheet_name=DATASETS_SHEET_NAME, + na_values=[""], + keep_default_na=False, + ) + metadata = datasets_worksheet[ + datasets_worksheet[DATASET_FILENAME_COLUMN] == dataset_name + ] dataset = self.get_dataset(dataset_name=dataset_name) first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {} return SDTMDatasetMetadata( @@ -199,23 +208,42 @@ def read_data(self, file_path: str) -> IOBase: def get_datasets(self) -> List[dict]: try: - worksheet = pd.read_excel( - self.dataset_path, - sheet_name=DATASETS_SHEET_NAME, - na_values=[""], - keep_default_na=False, - ) - except TypeError as e: - logger.error( - f"Failed to read datasets from the Excel file at {self.dataset_path}. " - f"Ensure the file is in the correct format. " - f"Try opening and saving the file in Microsoft Excel. " - f"Error: {str(e)}" - ) + with pd.ExcelFile(self.dataset_path) as xl: + sheet_names = xl.sheet_names + if DATASETS_SHEET_NAME not in sheet_names: + available = ", ".join(repr(s) for s in sheet_names) or "(none)" + raise ExcelTestDataError( + f"The workbook does not contain a sheet named " + f"'{DATASETS_SHEET_NAME}'. Make sure there is a 'Datasets' tab " + f"(case-sensitive). Available sheet names: {available}." + ) + worksheet = xl.parse( + DATASETS_SHEET_NAME, + na_values=[""], + keep_default_na=False, + ) + except ExcelTestDataError: raise + except Exception as e: + raise ExcelTestDataError( + f"Cannot read the Excel file. Ensure it is a valid .xlsx workbook. 
" + f"Details: {e}" + ) from e + + missing_cols = sorted( + set(DATASETS_SHEET_REQUIRED_COLUMNS) - set(worksheet.columns) + ) + if missing_cols: + raise ExcelTestDataError( + f"The '{DATASETS_SHEET_NAME}' sheet is missing required column(s): " + f"{missing_cols}. Column headers are case-sensitive. " + f"Required: '{DATASET_FILENAME_COLUMN}', '{DATASET_LABEL_COLUMN}', " + f"and optionally '{DATASET_NAME_COLUMN}'." + ) + datasets = [ - self.get_raw_dataset_metadata(dataset_name=dataset_filename) - for dataset_filename in worksheet[DATASET_FILENAME_COLUMN] + self.get_raw_dataset_metadata(dataset_name=fn, _worksheet=worksheet) + for fn in worksheet[DATASET_FILENAME_COLUMN] ] return datasets diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index cffb61bd3..6487f11db 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -24,11 +24,14 @@ convert_file_size, extract_file_name_from_path_string, ) +from cdisc_rules_engine.exceptions.custom_exceptions import ( + INVALID_DATASET_FORMAT_REASON, + InvalidDatasetFormat, +) from .base_data_service import BaseDataService, cached_dataset from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from cdisc_rules_engine.models.dataset import PandasDataset -from cdisc_rules_engine.services import logger import re @@ -244,28 +247,13 @@ def get_datasets(self) -> List[dict]: dataset_name=dataset_path ) datasets.append(dataset_metadata) + except InvalidDatasetFormat: + raise except Exception as e: - logger.error( - f"Failed to read metadata for dataset {dataset_path}. " - f"Error: {type(e).__name__}: {e}. Skipping this dataset." 
- ) - file_name = extract_file_name_from_path_string(dataset_path) - datasets.append( - SDTMDatasetMetadata( - name=( - file_name.split(".")[0].upper() - if "." in file_name - else file_name.upper() - ), - first_record={}, - label="", - modification_date="", - filename=file_name, - full_path=dataset_path, - file_size=0, - record_count=0, - ) - ) + raise InvalidDatasetFormat( + f"Your data file could not be read: {dataset_path}. " + f"It {INVALID_DATASET_FORMAT_REASON} Underlying error: {e}" + ) from e return datasets @staticmethod diff --git a/core.py b/core.py index d40f91687..eac04258b 100644 --- a/core.py +++ b/core.py @@ -23,6 +23,7 @@ from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths from cdisc_rules_engine.enums.progress_parameter_options import ProgressParameterOptions from cdisc_rules_engine.enums.report_types import ReportTypes +from cdisc_rules_engine.enums.standard_types import StandardTypes from cdisc_rules_engine.models.external_dictionaries_container import ( DictionaryTypes, ExternalDictionariesContainer, @@ -459,6 +460,15 @@ def validate( # noqa if not custom_standard: standard = standard.lower() + supported_standards = StandardTypes.values() + if standard not in supported_standards: + supported_list = ", ".join(sorted(supported_standards)) + logger.error( + f"Standard '{standard}' is not a supported standard. " + f"Supported standards: {supported_list}. " + f"Use --custom-standard flag for custom standards." 
+ ) + ctx.exit(2) if raw_report is True: if not (len(output_format) == 1 and output_format[0] == ReportTypes.JSON.value): diff --git a/scripts/run_validation.py b/scripts/run_validation.py index fe3573b22..0b99fe245 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -18,6 +18,13 @@ from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.validation_args import Validation_args from cdisc_rules_engine.rules_engine import RulesEngine +from cdisc_rules_engine.exceptions.custom_exceptions import ( + EngineError, + InvalidDatasetFormat, + INVALID_DATASET_FORMAT_REASON, + LibraryMetadataNotFoundError, + library_metadata_not_found_message, +) from cdisc_rules_engine.services import logger as engine_logger from cdisc_rules_engine.services.cache import ( InMemoryCacheService, @@ -123,6 +130,37 @@ def initialize_logger(disabled, log_level): engine_logger.setLevel(log_level) +def _get_datasets_or_raise(data_service): + try: + return data_service.get_datasets() + except EngineError: + raise + except Exception as e: + raise InvalidDatasetFormat( + f"Your data files could not be read. 
They {INVALID_DATASET_FORMAT_REASON} " + f"Underlying error: {e}" + ) from e + + +def _convert_datasets_to_parquet_if_needed( + data_service, datasets, created_files, large_dataset_validation: bool +): + if not (large_dataset_validation and data_service.standard != "usdm"): + return + engine_logger.warning( + "Large datasets must use parquet format, converting all datasets to parquet" + ) + for dataset in datasets: + file_path = dataset.full_path + if file_path.endswith(".parquet"): + continue + num_rows, new_file = data_service.to_parquet(file_path) + created_files.append(new_file) + dataset.full_path = new_file + dataset.record_count = num_rows + dataset.original_path = file_path + + def run_validation(args: Validation_args): set_log_level(args) # fill cache @@ -160,21 +198,13 @@ def run_validation(args: Validation_args): large_dataset_validation: bool = ( data_service.dataset_implementation != PandasDataset ) - datasets = data_service.get_datasets() - if large_dataset_validation and data_service.standard != "usdm": - # convert all files to parquet temp files - engine_logger.warning( - "Large datasets must use parquet format, converting all datasets to parquet" - ) - for dataset in datasets: - file_path = dataset.full_path - if file_path.endswith(".parquet"): - continue - num_rows, new_file = data_service.to_parquet(file_path) - created_files.append(new_file) - dataset.full_path = new_file - dataset.record_count = num_rows - dataset.original_path = file_path + datasets = _get_datasets_or_raise(data_service) + _convert_datasets_to_parquet_if_needed( + data_service, + datasets, + created_files, + large_dataset_validation, + ) engine_logger.info( f"Running {len(rules)} rules against {len(datasets)} datasets" ) @@ -249,6 +279,12 @@ def run_single_rule_validation( standard, standard_version, standard_substandard ) standard_metadata = cache.get(standard_details_cache_key) + if not standard_metadata and standard and standard_version: + raise 
LibraryMetadataNotFoundError( + library_metadata_not_found_message( + standard, standard_version, standard_substandard + ) + ) if standard_metadata: model_cache_key = get_model_details_cache_key_from_ig(standard_metadata) model_metadata = cache.get(model_cache_key) diff --git a/scripts/script_utils.py b/scripts/script_utils.py index b08035d3a..8dbf8efaa 100644 --- a/scripts/script_utils.py +++ b/scripts/script_utils.py @@ -27,6 +27,10 @@ from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( DefineXMLReaderFactory, ) +from cdisc_rules_engine.exceptions.custom_exceptions import ( + LibraryMetadataNotFoundError, + library_metadata_not_found_message, +) def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa @@ -75,6 +79,13 @@ def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa data = pickle.load(f) standard_metadata = data.get(standard_details_cache_key, {}) + if not standard_metadata and not args.custom_standard: + raise LibraryMetadataNotFoundError( + library_metadata_not_found_message( + args.standard, args.version, args.substandard + ) + ) + if standard_metadata: model_cache_key = get_model_details_cache_key_from_ig(standard_metadata) with open(models_file, "rb") as f: diff --git a/tests/unit/test_datasets_payload_validation.py b/tests/unit/test_datasets_payload_validation.py new file mode 100644 index 000000000..9827609f2 --- /dev/null +++ b/tests/unit/test_datasets_payload_validation.py @@ -0,0 +1,160 @@ +""" +Unit tests for datasets payload validation and API error handling. +Covers TestRule Azure function: validate_datasets_payload and handle_exception. 
+""" + +import json +import importlib +import sys +from unittest.mock import MagicMock + +import pytest +from cdisc_rules_engine.exceptions.custom_exceptions import ( + LibraryMetadataNotFoundError, + library_metadata_not_found_message, +) + + +class _MockHttpResponse: + def __init__(self, body, status_code=200): + self.status_code = status_code + self._body = body if isinstance(body, bytes) else body.encode("utf-8") + + def get_body(self): + return self._body + + +_mock_func = MagicMock() +_mock_func.HttpResponse = _MockHttpResponse +_mock_azure = MagicMock() +_mock_azure.functions = _mock_func +sys.modules["azure"] = _mock_azure +sys.modules["azure.functions"] = _mock_func + + +def _get_testrule_module(): + return importlib.import_module("TestRule") + + +class TestValidateDatasetsPayload: + """Test validate_datasets_payload raises clear, actionable errors.""" + + def test_missing_label_raises_bad_request_with_coping_guidance(self): + testrule = _get_testrule_module() + datasets = [ + { + "filename": "dm.xpt", + "domain": "DM", + "records": {"USUBJID": ["1"]}, + "variables": [{"name": "USUBJID"}], + } + ] + with pytest.raises(testrule.BadRequestError) as exc_info: + testrule.validate_datasets_payload(datasets) + msg = str(exc_info.value) + assert "label" in msg + assert "Datasets" in msg + assert "case-sensitive" in msg + assert "Filename" in msg or "Label" in msg + + def test_missing_multiple_keys_includes_all_in_message(self): + testrule = _get_testrule_module() + datasets = [ + { + "filename": "dm.xpt", + } + ] + with pytest.raises(testrule.BadRequestError) as exc_info: + testrule.validate_datasets_payload(datasets) + msg = str(exc_info.value) + assert "label" in msg + assert "domain" in msg + assert "records" in msg + assert "variables" in msg + assert "case-sensitive" in msg + + def test_valid_payload_passes(self): + testrule = _get_testrule_module() + datasets = [ + { + "filename": "dm.xpt", + "label": "Demographics", + "domain": "DM", + "records": 
{"USUBJID": ["1"]}, + "variables": [{"name": "USUBJID"}], + } + ] + testrule.validate_datasets_payload(datasets) + + def test_missing_variable_metadata_raises_bad_request(self): + testrule = _get_testrule_module() + datasets = [ + { + "filename": "dm.xpt", + "label": "Demographics", + "domain": "DM", + "records": {"USUBJID": ["1"]}, + "variables": [None], + } + ] + with pytest.raises(testrule.BadRequestError) as exc_info: + testrule.validate_datasets_payload(datasets) + assert "variable metadata" in str(exc_info.value) + + +class TestHandleException: + """Test that handle_exception returns user-friendly JSON for clients.""" + + def test_bad_request_error_returns_400_with_message(self): + testrule = _get_testrule_module() + e = testrule.BadRequestError( + "Test data is missing required dataset properties: ['label']. " + "Make sure there is a 'Datasets' tab (case-sensitive)." + ) + response = testrule.handle_exception(e) + assert response.status_code == 400 + body = json.loads(response.get_body().decode()) + assert body["error"] == "BadRequestError" + assert "message" in body + assert "Datasets" in body["message"] + assert "case-sensitive" in body["message"] + + def test_key_error_for_rule_returns_400_with_bad_request_error_type(self): + testrule = _get_testrule_module() + e = KeyError("'rule' required in request") + response = testrule.handle_exception(e) + assert response.status_code == 400 + body = json.loads(response.get_body().decode()) + assert body["error"] == "BadRequestError" + assert ( + "rule" in body["message"].lower() or "required" in body["message"].lower() + ) + + def test_key_error_for_datasets_returns_400_with_bad_request_error_type(self): + testrule = _get_testrule_module() + e = KeyError("'datasets' required in request") + response = testrule.handle_exception(e) + assert response.status_code == 400 + body = json.loads(response.get_body().decode()) + assert body["error"] == "BadRequestError" + + def 
test_library_metadata_not_found_error_returns_400_with_message(self): + testrule = _get_testrule_module() + e = LibraryMetadataNotFoundError( + library_metadata_not_found_message("sdtmig", "3-4") + ) + response = testrule.handle_exception(e) + assert response.status_code == 400 + body = json.loads(response.get_body().decode()) + assert body["error"] == "LibraryMetadataNotFoundError" + assert "sdtmig" in body["message"] + assert "3.4" in body["message"] or "version" in body["message"] + + def test_other_exception_returns_400_unknown_exception(self): + testrule = _get_testrule_module() + e = ValueError("Something else went wrong") + response = testrule.handle_exception(e) + assert response.status_code == 400 + body = json.loads(response.get_body().decode()) + assert body["error"] == "Unknown Exception" + assert "Something else went wrong" in body["message"] diff --git a/tests/unit/test_services/test_data_service/test_excel_data_service.py b/tests/unit/test_services/test_data_service/test_excel_data_service.py index 6e2e4c90f..9648e2cd3 100644 --- a/tests/unit/test_services/test_data_service/test_excel_data_service.py +++ b/tests/unit/test_services/test_data_service/test_excel_data_service.py @@ -7,7 +7,11 @@ from openpyxl import Workbook from cdisc_rules_engine.config.config import ConfigService +from cdisc_rules_engine.exceptions.custom_exceptions import ExcelTestDataError from cdisc_rules_engine.services.data_services import ExcelDataService +from cdisc_rules_engine.services.data_services.excel_data_service import ( + DATASETS_SHEET_NAME, +) from cdisc_rules_engine.models.dataset import PandasDataset @@ -175,3 +179,89 @@ def test_na_value_preserved_not_converted_to_nan(): finally: # Cleanup temporary file os.unlink(temp_path) + + +def test_get_datasets_missing_datasets_sheet_raises_friendly_error(): + """ + When the workbook has no 'Datasets' sheet (e.g. 
tab named 'datasets' instead), + get_datasets() raises ExcelTestDataError with message that includes + case-sensitive guidance. + """ + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file: + temp_path = tmp_file.name + + try: + wb = Workbook() + wb.active.title = "datasets" + wb.active.append(["Filename", "Label", "Dataset Name"]) + wb.active.append(["dm.xpt", "Demographics", "DM"]) + wb.create_sheet("dm.xpt") + dm_sheet = wb["dm.xpt"] + dm_sheet.append(["USUBJID", "DOMAIN"]) + dm_sheet.append(["Study ID", "Domain"]) + dm_sheet.append(["Char", "Char"]) + dm_sheet.append(["20", "2"]) + dm_sheet.append(["SUBJ001", "DM"]) + wb.save(temp_path) + wb.close() + + ExcelDataService._instance = None + mock_cache = MagicMock() + mock_cache.get_dataset.return_value = None + + data_service = ExcelDataService( + mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path + ) + + with pytest.raises(ExcelTestDataError) as exc_info: + data_service.get_datasets() + + msg = str(exc_info.value) + assert DATASETS_SHEET_NAME in msg or "Datasets" in msg + assert "case-sensitive" in msg + finally: + os.unlink(temp_path) + + +def test_get_datasets_missing_label_column_raises_friendly_error(): + """ + When the 'Datasets' sheet exists but is missing the 'Label' column, + get_datasets() raises ExcelTestDataError with column names and + case-sensitive guidance. 
+ """ + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as tmp_file: + temp_path = tmp_file.name + + try: + wb = Workbook() + datasets_sheet = wb.active + datasets_sheet.title = DATASETS_SHEET_NAME + datasets_sheet.append(["Filename", "label", "Dataset Name"]) + datasets_sheet.append(["dm.xpt", "Demographics", "DM"]) + wb.create_sheet("dm.xpt") + dm_sheet = wb["dm.xpt"] + dm_sheet.append(["USUBJID", "DOMAIN"]) + dm_sheet.append(["Study ID", "Domain"]) + dm_sheet.append(["Char", "Char"]) + dm_sheet.append(["20", "2"]) + dm_sheet.append(["SUBJ001", "DM"]) + wb.save(temp_path) + wb.close() + + ExcelDataService._instance = None + mock_cache = MagicMock() + mock_cache.get_dataset.return_value = None + + data_service = ExcelDataService( + mock_cache, MagicMock(), MagicMock(), dataset_path=temp_path + ) + + with pytest.raises(ExcelTestDataError) as exc_info: + data_service.get_datasets() + + msg = str(exc_info.value) + assert "Label" in msg + assert "case-sensitive" in msg + assert "column" in msg.lower() + finally: + os.unlink(temp_path) diff --git a/tests/unit/test_services/test_data_service/test_local_data_service.py b/tests/unit/test_services/test_data_service/test_local_data_service.py index 82b30744c..1282ad99b 100644 --- a/tests/unit/test_services/test_data_service/test_local_data_service.py +++ b/tests/unit/test_services/test_data_service/test_local_data_service.py @@ -3,6 +3,7 @@ import pytest from cdisc_rules_engine.config.config import ConfigService +from cdisc_rules_engine.exceptions.custom_exceptions import InvalidDatasetFormat from cdisc_rules_engine.services.data_services import LocalDataService from cdisc_rules_engine.models.dataset import PandasDataset @@ -85,3 +86,20 @@ def test_get_variables_metdata(dataset_implementation): ] for key in expected_keys: assert key in data + + +def test_get_datasets_raises_invalid_dataset_format_when_file_cannot_be_read(): + """get_datasets() raises InvalidDatasetFormat with user-friendly message when a file 
cannot be read.""" + mock_cache = MagicMock() + mock_cache.get_dataset.return_value = None + data_service = LocalDataService( + mock_cache, MagicMock(), MagicMock(), dataset_paths=["/bad/path.xpt"] + ) + with pytest.raises(InvalidDatasetFormat) as exc_info: + data_service.get_datasets() + assert "Your data file could not be read" in str(exc_info.value) + assert "/bad/path.xpt" in str(exc_info.value) + assert ( + "corrupted" in str(exc_info.value).lower() + or "formatted" in str(exc_info.value).lower() + ) From b2c8c880f26e52143b7d9d0adab90f00565bf7d5 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 18 Feb 2026 16:55:16 -0500 Subject: [PATCH 02/10] Add USDM to StandardTypes enum --- cdisc_rules_engine/enums/standard_types.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cdisc_rules_engine/enums/standard_types.py b/cdisc_rules_engine/enums/standard_types.py index a52333f9c..ab7570d9b 100644 --- a/cdisc_rules_engine/enums/standard_types.py +++ b/cdisc_rules_engine/enums/standard_types.py @@ -9,3 +9,4 @@ class StandardTypes(BaseEnum): ADAM = "adam" CDASHIG = "cdashig" TIG = "tig" + USDM = "usdm" From b6205fe77405a6fd3b53adfe16977f42ac2c16c8 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 18 Feb 2026 17:15:56 -0500 Subject: [PATCH 03/10] Do not raise LibraryMetadataNotFoundError for USDM when standard metadata is absent in cache --- scripts/script_utils.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/scripts/script_utils.py b/scripts/script_utils.py index 8dbf8efaa..c4d117cf7 100644 --- a/scripts/script_utils.py +++ b/scripts/script_utils.py @@ -80,11 +80,12 @@ def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa standard_metadata = data.get(standard_details_cache_key, {}) if not standard_metadata and not args.custom_standard: - raise LibraryMetadataNotFoundError( - library_metadata_not_found_message( - args.standard, args.version, args.substandard + if args.standard and args.standard.lower() != "usdm": + raise 
LibraryMetadataNotFoundError( + library_metadata_not_found_message( + args.standard, args.version, args.substandard + ) ) - ) if standard_metadata: model_cache_key = get_model_details_cache_key_from_ig(standard_metadata) From dc76434c72cbc27a23a8b1eb07248b37a0a0bc3d Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 24 Feb 2026 11:50:40 -0500 Subject: [PATCH 04/10] Remove datasets key reporting, inline Datasets guidance, add CT package validation (script_utils + run_single_rule_validation) --- TestRule/__init__.py | 22 +++++++------ .../exceptions/custom_exceptions.py | 11 +++++++ scripts/run_validation.py | 14 +++++++++ scripts/script_utils.py | 16 ++++++++++ .../unit/test_datasets_payload_validation.py | 31 ++++++++++++++----- 5 files changed, 77 insertions(+), 17 deletions(-) diff --git a/TestRule/__init__.py b/TestRule/__init__.py index fea65e522..fe18396a7 100644 --- a/TestRule/__init__.py +++ b/TestRule/__init__.py @@ -7,6 +7,7 @@ from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator from scripts.run_validation import run_single_rule_validation from cdisc_rules_engine.exceptions.custom_exceptions import ( + CTPackageNotFoundError, LibraryMetadataNotFoundError, library_metadata_not_found_message, ) @@ -24,11 +25,6 @@ class BadRequestError(Exception): pass -_DATASETS_TAB_GUIDANCE = ( - "Make sure there is a 'Datasets' tab in your test data workbook (name is " - "case-sensitive). The 'Datasets' tab must have column headers: 'Filename', " - "'Label', and 'Dataset Name' (also case-sensitive)." -) _REQUIRED_DATASET_KEYS = {"filename", "label", "domain", "records", "variables"} @@ -46,11 +42,13 @@ def validate_datasets_payload(datasets): ) if missing_keys: - missing_list = sorted(missing_keys) raise BadRequestError( - f"Test data is missing required dataset properties: {missing_list}. " - f"This usually means the 'Datasets' sheet in your Excel file is missing " - f"or has incorrect column headers. 
{_DATASETS_TAB_GUIDANCE}" + "Test data is missing required dataset properties. " + "This usually means the 'Datasets' sheet in your Excel file is missing " + "or has incorrect column headers. " + "Make sure there is a 'Datasets' tab in your test data workbook (name is " + "case-sensitive). The 'Datasets' tab must have column headers: 'Filename', " + "'Label', and 'Dataset Name' (also case-sensitive)." ) @@ -71,6 +69,12 @@ def handle_exception(e: Exception): ), status_code=400, ) + if isinstance(e, CTPackageNotFoundError): + msg = getattr(e, "message", None) or getattr(e, "description", None) or str(e) + return func.HttpResponse( + json.dumps({"error": "CTPackageNotFoundError", "message": msg}), + status_code=400, + ) if isinstance(e, KeyError): msg = str(e) if "rule" in msg.lower() or "datasets" in msg.lower(): diff --git a/cdisc_rules_engine/exceptions/custom_exceptions.py b/cdisc_rules_engine/exceptions/custom_exceptions.py index c7fa7f031..bde0b203a 100644 --- a/cdisc_rules_engine/exceptions/custom_exceptions.py +++ b/cdisc_rules_engine/exceptions/custom_exceptions.py @@ -97,6 +97,17 @@ class ExcelTestDataError(EngineError): ) +CT_PACKAGE_NOT_FOUND_PREFIX = "Controlled terminology package(s) not found" + + +class CTPackageNotFoundError(EngineError): + code = 400 + description = ( + f"{CT_PACKAGE_NOT_FOUND_PREFIX} in cache. " + "Check package names and run 'core.py update-cache' if needed." 
+ ) + + class NumberOfAttemptsExceeded(EngineError): pass diff --git a/scripts/run_validation.py b/scripts/run_validation.py index 0b99fe245..d7b0b1e7d 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -19,6 +19,8 @@ from cdisc_rules_engine.models.validation_args import Validation_args from cdisc_rules_engine.rules_engine import RulesEngine from cdisc_rules_engine.exceptions.custom_exceptions import ( + CT_PACKAGE_NOT_FOUND_PREFIX, + CTPackageNotFoundError, EngineError, InvalidDatasetFormat, INVALID_DATASET_FORMAT_REASON, @@ -295,8 +297,20 @@ def run_single_rule_validation( ) ct_package_metadata = {} + codelists = codelists or [] for codelist in codelists: ct_package_metadata[codelist] = cache.get(codelist) + if codelists: + missing_ct = [c for c in codelists if ct_package_metadata.get(c) is None] + if missing_ct: + sorted_missing = sorted( + missing_ct, key=lambda x: (x is None, str(x) if x is not None else "") + ) + raise CTPackageNotFoundError( + f"{CT_PACKAGE_NOT_FOUND_PREFIX}: " + f"{', '.join(str(c) for c in sorted_missing)}. " + "Check Library tab or request codelist names." 
+ ) library_metadata = LibraryMetadataContainer( standard_metadata=standard_metadata, diff --git a/scripts/script_utils.py b/scripts/script_utils.py index c4d117cf7..62b4aed2f 100644 --- a/scripts/script_utils.py +++ b/scripts/script_utils.py @@ -28,6 +28,8 @@ DefineXMLReaderFactory, ) from cdisc_rules_engine.exceptions.custom_exceptions import ( + CTPackageNotFoundError, + CT_PACKAGE_NOT_FOUND_PREFIX, LibraryMetadataNotFoundError, library_metadata_not_found_message, ) @@ -129,6 +131,20 @@ def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa with open(os.path.join(args.cache, file_name), "rb") as f: data = pickle.load(f) ct_package_data[ct_version] = data + if args.controlled_terminology_package: + requested = set(args.controlled_terminology_package) + missing = requested - published_ct_packages + if missing: + sorted_missing = sorted( + missing, key=lambda x: (x is None, str(x) if x is not None else "") + ) + available = ", ".join(sorted(published_ct_packages)) or "(none)" + raise CTPackageNotFoundError( + f"{CT_PACKAGE_NOT_FOUND_PREFIX} in cache: " + f"{', '.join(str(c) for c in sorted_missing)}. " + f"Available packages: {available}. " + "Run 'core.py update-cache' to refresh the cache." 
+ ) if args.define_xml_path and define_version.model_package == "define_2_1": ( standards, diff --git a/tests/unit/test_datasets_payload_validation.py b/tests/unit/test_datasets_payload_validation.py index 9827609f2..1a875a3e1 100644 --- a/tests/unit/test_datasets_payload_validation.py +++ b/tests/unit/test_datasets_payload_validation.py @@ -10,6 +10,7 @@ import pytest from cdisc_rules_engine.exceptions.custom_exceptions import ( + CTPackageNotFoundError, LibraryMetadataNotFoundError, library_metadata_not_found_message, ) @@ -39,7 +40,9 @@ def _get_testrule_module(): class TestValidateDatasetsPayload: """Test validate_datasets_payload raises clear, actionable errors.""" - def test_missing_label_raises_bad_request_with_coping_guidance(self): + def test_missing_required_properties_raises_bad_request_with_datasets_guidance( + self, + ): testrule = _get_testrule_module() datasets = [ { @@ -52,12 +55,12 @@ def test_missing_label_raises_bad_request_with_coping_guidance(self): with pytest.raises(testrule.BadRequestError) as exc_info: testrule.validate_datasets_payload(datasets) msg = str(exc_info.value) - assert "label" in msg + assert "missing required dataset properties" in msg assert "Datasets" in msg assert "case-sensitive" in msg assert "Filename" in msg or "Label" in msg - def test_missing_multiple_keys_includes_all_in_message(self): + def test_missing_multiple_required_properties_raises_with_datasets_guidance(self): testrule = _get_testrule_module() datasets = [ { @@ -67,11 +70,10 @@ def test_missing_multiple_keys_includes_all_in_message(self): with pytest.raises(testrule.BadRequestError) as exc_info: testrule.validate_datasets_payload(datasets) msg = str(exc_info.value) - assert "label" in msg - assert "domain" in msg - assert "records" in msg - assert "variables" in msg + assert "missing required dataset properties" in msg + assert "Datasets" in msg assert "case-sensitive" in msg + assert "Filename" in msg or "Label" in msg def test_valid_payload_passes(self): 
testrule = _get_testrule_module() @@ -108,7 +110,7 @@ class TestHandleException: def test_bad_request_error_returns_400_with_message(self): testrule = _get_testrule_module() e = testrule.BadRequestError( - "Test data is missing required dataset properties: ['label']. " + "Test data is missing required dataset properties. " "Make sure there is a 'Datasets' tab (case-sensitive)." ) response = testrule.handle_exception(e) @@ -150,6 +152,19 @@ def test_library_metadata_not_found_error_returns_400_with_message(self): assert "sdtmig" in body["message"] assert "3.4" in body["message"] or "version" in body["message"] + def test_ct_package_not_found_error_returns_400_with_message(self): + testrule = _get_testrule_module() + e = CTPackageNotFoundError( + "Controlled terminology package(s) not found: bad-ct-pkg. " + "Check Library tab or request codelist names." + ) + response = testrule.handle_exception(e) + assert response.status_code == 400 + body = json.loads(response.get_body().decode()) + assert body["error"] == "CTPackageNotFoundError" + assert "not found" in body["message"] + assert "bad-ct-pkg" in body["message"] + def test_other_exception_returns_400_unknown_exception(self): testrule = _get_testrule_module() e = ValueError("Something else went wrong") From cc22934e10e3d3e334aeb106120f0689a23a15e6 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Thu, 26 Feb 2026 14:14:02 -0500 Subject: [PATCH 05/10] Improve Library/Datasets/CT validation errors; remove hint text, move _get_datasets_or_raise to script_utils, unify CT check after Define --- TestRule/__init__.py | 18 ++++---- .../exceptions/custom_exceptions.py | 20 ++------- .../data_services/local_data_service.py | 8 +--- scripts/run_validation.py | 21 ++-------- scripts/script_utils.py | 42 ++++++++++++------- .../unit/test_datasets_payload_validation.py | 21 ++++------ .../test_local_data_service.py | 4 -- 7 files changed, 52 insertions(+), 82 deletions(-) diff --git a/TestRule/__init__.py b/TestRule/__init__.py index 
fe18396a7..a290d24b8 100644 --- a/TestRule/__init__.py +++ b/TestRule/__init__.py @@ -7,6 +7,7 @@ from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator from scripts.run_validation import run_single_rule_validation from cdisc_rules_engine.exceptions.custom_exceptions import ( + CT_PACKAGE_NOT_FOUND_PREFIX, CTPackageNotFoundError, LibraryMetadataNotFoundError, library_metadata_not_found_message, @@ -42,14 +43,7 @@ def validate_datasets_payload(datasets): ) if missing_keys: - raise BadRequestError( - "Test data is missing required dataset properties. " - "This usually means the 'Datasets' sheet in your Excel file is missing " - "or has incorrect column headers. " - "Make sure there is a 'Datasets' tab in your test data workbook (name is " - "case-sensitive). The 'Datasets' tab must have column headers: 'Filename', " - "'Label', and 'Dataset Name' (also case-sensitive)." - ) + raise BadRequestError("Test data is incorrect and missing required formatting.") def handle_exception(e: Exception): @@ -146,7 +140,13 @@ def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: # standard, standard_version, standard_substandard ) ) - asyncio.run(cache_populator.load_codelists(codelists)) + try: + asyncio.run(cache_populator.load_codelists(codelists or [])) + except LibraryResourceNotFoundException: + raise CTPackageNotFoundError( + f"{CT_PACKAGE_NOT_FOUND_PREFIX}: " + f"{', '.join(str(c) for c in (codelists or []))}." 
+ ) if not rule: raise KeyError("'rule' required in request") datasets = json_data.get("datasets") diff --git a/cdisc_rules_engine/exceptions/custom_exceptions.py b/cdisc_rules_engine/exceptions/custom_exceptions.py index bde0b203a..a29d791c9 100644 --- a/cdisc_rules_engine/exceptions/custom_exceptions.py +++ b/cdisc_rules_engine/exceptions/custom_exceptions.py @@ -40,25 +40,19 @@ class VariableMetadataNotFoundError(EngineError): ) -LIBRARY_METADATA_NOT_FOUND_HINT = ( - "Check your standard/version (CLI) or Library tab values (editor)." -) - - def library_metadata_not_found_message(standard, version, substandard=None): version_display = (version or "").replace("-", ".") sub_part = f" substandard {substandard}" if substandard else "" return ( f"No library metadata found for standard '{standard}' " - f"version '{version_display}'{sub_part}. {LIBRARY_METADATA_NOT_FOUND_HINT}" + f"version '{version_display}'{sub_part}." ) class LibraryMetadataNotFoundError(EngineError): code = 400 description = ( - "Library metadata not found for the provided standard and version combination. " - f"{LIBRARY_METADATA_NOT_FOUND_HINT}" + "Library metadata not found for the provided standard and version combination." ) @@ -79,11 +73,6 @@ class InvalidDatasetFormat(EngineError): description = "Dataset data is malformed." -INVALID_DATASET_FORMAT_REASON = ( - "may be corrupted, incorrectly formatted, or encoded with an unexpected encoding." -) - - class InvalidJSONFormat(EngineError): code = 400 description = "JSON data is malformed." @@ -102,10 +91,7 @@ class ExcelTestDataError(EngineError): class CTPackageNotFoundError(EngineError): code = 400 - description = ( - f"{CT_PACKAGE_NOT_FOUND_PREFIX} in cache. " - "Check package names and run 'core.py update-cache' if needed." - ) + description = f"{CT_PACKAGE_NOT_FOUND_PREFIX}." 
class NumberOfAttemptsExceeded(EngineError): diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 6487f11db..6f2408a51 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -24,10 +24,7 @@ convert_file_size, extract_file_name_from_path_string, ) -from cdisc_rules_engine.exceptions.custom_exceptions import ( - INVALID_DATASET_FORMAT_REASON, - InvalidDatasetFormat, -) +from cdisc_rules_engine.exceptions.custom_exceptions import InvalidDatasetFormat from .base_data_service import BaseDataService, cached_dataset from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface @@ -251,8 +248,7 @@ def get_datasets(self) -> List[dict]: raise except Exception as e: raise InvalidDatasetFormat( - f"Your data file could not be read: {dataset_path}. " - f"It {INVALID_DATASET_FORMAT_REASON} Underlying error: {e}" + f"Your data file could not be read: {dataset_path}." 
) from e return datasets diff --git a/scripts/run_validation.py b/scripts/run_validation.py index d7b0b1e7d..109b651ff 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -21,9 +21,6 @@ from cdisc_rules_engine.exceptions.custom_exceptions import ( CT_PACKAGE_NOT_FOUND_PREFIX, CTPackageNotFoundError, - EngineError, - InvalidDatasetFormat, - INVALID_DATASET_FORMAT_REASON, LibraryMetadataNotFoundError, library_metadata_not_found_message, ) @@ -52,8 +49,9 @@ fill_cache_with_dictionaries, get_cache_service, get_library_metadata_from_cache, - get_rules, get_max_dataset_size, + get_rules, + _get_datasets_or_raise, ) from cdisc_rules_engine.services.reporting import BaseReport, ReportFactory from cdisc_rules_engine.utilities.progress_displayers import get_progress_displayer @@ -132,18 +130,6 @@ def initialize_logger(disabled, log_level): engine_logger.setLevel(log_level) -def _get_datasets_or_raise(data_service): - try: - return data_service.get_datasets() - except EngineError: - raise - except Exception as e: - raise InvalidDatasetFormat( - f"Your data files could not be read. They {INVALID_DATASET_FORMAT_REASON} " - f"Underlying error: {e}" - ) from e - - def _convert_datasets_to_parquet_if_needed( data_service, datasets, created_files, large_dataset_validation: bool ): @@ -308,8 +294,7 @@ def run_single_rule_validation( ) raise CTPackageNotFoundError( f"{CT_PACKAGE_NOT_FOUND_PREFIX}: " - f"{', '.join(str(c) for c in sorted_missing)}. " - "Check Library tab or request codelist names." + f"{', '.join(str(c) for c in sorted_missing)}." 
) library_metadata = LibraryMetadataContainer( diff --git a/scripts/script_utils.py b/scripts/script_utils.py index 62b4aed2f..fb95e5579 100644 --- a/scripts/script_utils.py +++ b/scripts/script_utils.py @@ -30,11 +30,22 @@ from cdisc_rules_engine.exceptions.custom_exceptions import ( CTPackageNotFoundError, CT_PACKAGE_NOT_FOUND_PREFIX, + EngineError, + InvalidDatasetFormat, LibraryMetadataNotFoundError, library_metadata_not_found_message, ) +def _get_datasets_or_raise(data_service): + try: + return data_service.get_datasets() + except EngineError: + raise + except Exception as e: + raise InvalidDatasetFormat("Your data files could not be read.") from e + + def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa if args.custom_standard: check = check_custom_standard(args) @@ -118,6 +129,7 @@ def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa variables_metadata = data.get(cache_key) ct_package_data = {} + define_referenced_ct = set() cache_files = next(os.walk(args.cache), (None, None, []))[2] ct_files = [file_name for file_name in cache_files if "ct-" in file_name] published_ct_packages = set() @@ -131,20 +143,6 @@ def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa with open(os.path.join(args.cache, file_name), "rb") as f: data = pickle.load(f) ct_package_data[ct_version] = data - if args.controlled_terminology_package: - requested = set(args.controlled_terminology_package) - missing = requested - published_ct_packages - if missing: - sorted_missing = sorted( - missing, key=lambda x: (x is None, str(x) if x is not None else "") - ) - available = ", ".join(sorted(published_ct_packages)) or "(none)" - raise CTPackageNotFoundError( - f"{CT_PACKAGE_NOT_FOUND_PREFIX} in cache: " - f"{', '.join(str(c) for c in sorted_missing)}. " - f"Available packages: {available}. " - "Run 'core.py update-cache' to refresh the cache." 
- ) if args.define_xml_path and define_version.model_package == "define_2_1": ( standards, @@ -152,6 +150,10 @@ def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa extensible, merged_flag, ) = define_xml_reader.get_ct_standards_metadata() + define_referenced_ct = { + f"{standard.publishing_set.lower()}ct-{standard.version}" + for standard in standards + } for standard in standards: pickle_filename = ( f"{standard.publishing_set.lower()}ct-{standard.version}.pkl" @@ -169,6 +171,18 @@ def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa if args.define_xml_path: extensible_terms = define_xml_reader.get_extensible_codelist_mappings() ct_package_data["extensible"] = extensible_terms + requested_ct = set(args.controlled_terminology_package or []) + if args.define_xml_path and define_version.model_package == "define_2_1": + requested_ct |= define_referenced_ct + missing_ct = requested_ct - published_ct_packages + if missing_ct: + sorted_missing = sorted( + missing_ct, key=lambda x: (x is None, str(x) if x is not None else "") + ) + raise CTPackageNotFoundError( + f"{CT_PACKAGE_NOT_FOUND_PREFIX} in cache: " + f"{', '.join(str(c) for c in sorted_missing)}." 
+ ) return LibraryMetadataContainer( standard_metadata=standard_metadata, standard_schema_definition=standard_schema_definition, diff --git a/tests/unit/test_datasets_payload_validation.py b/tests/unit/test_datasets_payload_validation.py index 1a875a3e1..a37729421 100644 --- a/tests/unit/test_datasets_payload_validation.py +++ b/tests/unit/test_datasets_payload_validation.py @@ -55,10 +55,7 @@ def test_missing_required_properties_raises_bad_request_with_datasets_guidance( with pytest.raises(testrule.BadRequestError) as exc_info: testrule.validate_datasets_payload(datasets) msg = str(exc_info.value) - assert "missing required dataset properties" in msg - assert "Datasets" in msg - assert "case-sensitive" in msg - assert "Filename" in msg or "Label" in msg + assert "Test data is incorrect and missing required formatting" in msg def test_missing_multiple_required_properties_raises_with_datasets_guidance(self): testrule = _get_testrule_module() @@ -70,10 +67,7 @@ def test_missing_multiple_required_properties_raises_with_datasets_guidance(self with pytest.raises(testrule.BadRequestError) as exc_info: testrule.validate_datasets_payload(datasets) msg = str(exc_info.value) - assert "missing required dataset properties" in msg - assert "Datasets" in msg - assert "case-sensitive" in msg - assert "Filename" in msg or "Label" in msg + assert "Test data is incorrect and missing required formatting" in msg def test_valid_payload_passes(self): testrule = _get_testrule_module() @@ -110,16 +104,16 @@ class TestHandleException: def test_bad_request_error_returns_400_with_message(self): testrule = _get_testrule_module() e = testrule.BadRequestError( - "Test data is missing required dataset properties. " - "Make sure there is a 'Datasets' tab (case-sensitive)." + "Test data is incorrect and missing required formatting." 
) response = testrule.handle_exception(e) assert response.status_code == 400 body = json.loads(response.get_body().decode()) assert body["error"] == "BadRequestError" assert "message" in body - assert "Datasets" in body["message"] - assert "case-sensitive" in body["message"] + assert ( + "Test data is incorrect and missing required formatting" in body["message"] + ) def test_key_error_for_rule_returns_400_with_bad_request_error_type(self): testrule = _get_testrule_module() @@ -155,8 +149,7 @@ def test_library_metadata_not_found_error_returns_400_with_message(self): def test_ct_package_not_found_error_returns_400_with_message(self): testrule = _get_testrule_module() e = CTPackageNotFoundError( - "Controlled terminology package(s) not found: bad-ct-pkg. " - "Check Library tab or request codelist names." + "Controlled terminology package(s) not found: bad-ct-pkg." ) response = testrule.handle_exception(e) assert response.status_code == 400 diff --git a/tests/unit/test_services/test_data_service/test_local_data_service.py b/tests/unit/test_services/test_data_service/test_local_data_service.py index 1282ad99b..4dcfe8eed 100644 --- a/tests/unit/test_services/test_data_service/test_local_data_service.py +++ b/tests/unit/test_services/test_data_service/test_local_data_service.py @@ -99,7 +99,3 @@ def test_get_datasets_raises_invalid_dataset_format_when_file_cannot_be_read(): data_service.get_datasets() assert "Your data file could not be read" in str(exc_info.value) assert "/bad/path.xpt" in str(exc_info.value) - assert ( - "corrupted" in str(exc_info.value).lower() - or "formatted" in str(exc_info.value).lower() - ) From 90ecaba9aa8519d124af7c414b253c376b0875b7 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Thu, 5 Mar 2026 12:07:52 -0500 Subject: [PATCH 06/10] Remove dataset wrapper, centralize CT check, add cached_worksheet to get_raw_dataset_metadata --- .../data_services/excel_data_service.py | 19 +++++++++++++------ scripts/run_validation.py | 15 +-------------- 
scripts/script_utils.py | 11 ----------- 3 files changed, 14 insertions(+), 31 deletions(-) diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index b4e79354c..ab01a79e9 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -1,6 +1,6 @@ import os from io import IOBase -from typing import List, Sequence +from typing import List, Optional, Sequence from datetime import datetime import re import pandas as pd @@ -125,14 +125,21 @@ def _get_dataset_name( @cached_dataset(DatasetTypes.RAW_METADATA.value) def get_raw_dataset_metadata( - self, dataset_name: str, **kwargs + self, + dataset_name: str, + *, + cached_worksheet: Optional[pd.DataFrame] = None, + **kwargs, ) -> SDTMDatasetMetadata: """ Returns dataset metadata as DatasetMetadata instance. + Pass cached_worksheet to avoid re-reading the Datasets sheet when + calling repeatedly (e.g. from get_datasets()). 
""" - _worksheet = kwargs.get("_worksheet") - if _worksheet is not None: - metadata = _worksheet[_worksheet[DATASET_FILENAME_COLUMN] == dataset_name] + if cached_worksheet is not None: + metadata = cached_worksheet[ + cached_worksheet[DATASET_FILENAME_COLUMN] == dataset_name + ] else: datasets_worksheet = pd.read_excel( self.dataset_path, @@ -242,7 +249,7 @@ def get_datasets(self) -> List[dict]: ) datasets = [ - self.get_raw_dataset_metadata(dataset_name=fn, _worksheet=worksheet) + self.get_raw_dataset_metadata(dataset_name=fn, cached_worksheet=worksheet) for fn in worksheet[DATASET_FILENAME_COLUMN] ] return datasets diff --git a/scripts/run_validation.py b/scripts/run_validation.py index 109b651ff..6ef3e4040 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -19,8 +19,6 @@ from cdisc_rules_engine.models.validation_args import Validation_args from cdisc_rules_engine.rules_engine import RulesEngine from cdisc_rules_engine.exceptions.custom_exceptions import ( - CT_PACKAGE_NOT_FOUND_PREFIX, - CTPackageNotFoundError, LibraryMetadataNotFoundError, library_metadata_not_found_message, ) @@ -51,7 +49,6 @@ get_library_metadata_from_cache, get_max_dataset_size, get_rules, - _get_datasets_or_raise, ) from cdisc_rules_engine.services.reporting import BaseReport, ReportFactory from cdisc_rules_engine.utilities.progress_displayers import get_progress_displayer @@ -186,7 +183,7 @@ def run_validation(args: Validation_args): large_dataset_validation: bool = ( data_service.dataset_implementation != PandasDataset ) - datasets = _get_datasets_or_raise(data_service) + datasets = data_service.get_datasets() _convert_datasets_to_parquet_if_needed( data_service, datasets, @@ -286,16 +283,6 @@ def run_single_rule_validation( codelists = codelists or [] for codelist in codelists: ct_package_metadata[codelist] = cache.get(codelist) - if codelists: - missing_ct = [c for c in codelists if ct_package_metadata.get(c) is None] - if missing_ct: - sorted_missing = sorted( - 
missing_ct, key=lambda x: (x is None, str(x) if x is not None else "") - ) - raise CTPackageNotFoundError( - f"{CT_PACKAGE_NOT_FOUND_PREFIX}: " - f"{', '.join(str(c) for c in sorted_missing)}." - ) library_metadata = LibraryMetadataContainer( standard_metadata=standard_metadata, diff --git a/scripts/script_utils.py b/scripts/script_utils.py index fb95e5579..808f24f1f 100644 --- a/scripts/script_utils.py +++ b/scripts/script_utils.py @@ -30,22 +30,11 @@ from cdisc_rules_engine.exceptions.custom_exceptions import ( CTPackageNotFoundError, CT_PACKAGE_NOT_FOUND_PREFIX, - EngineError, - InvalidDatasetFormat, LibraryMetadataNotFoundError, library_metadata_not_found_message, ) -def _get_datasets_or_raise(data_service): - try: - return data_service.get_datasets() - except EngineError: - raise - except Exception as e: - raise InvalidDatasetFormat("Your data files could not be read.") from e - - def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa if args.custom_standard: check = check_custom_standard(args) From a8d4c4cddcf43a6b7f2f92829ba569c09b7c2e06 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 10 Mar 2026 13:20:44 -0400 Subject: [PATCH 07/10] cache & cdash --- cdisc_rules_engine/enums/standard_types.py | 1 - .../data_services/excel_data_service.py | 36 +++++++++---------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/cdisc_rules_engine/enums/standard_types.py b/cdisc_rules_engine/enums/standard_types.py index ab7570d9b..f787d4afd 100644 --- a/cdisc_rules_engine/enums/standard_types.py +++ b/cdisc_rules_engine/enums/standard_types.py @@ -7,6 +7,5 @@ class StandardTypes(BaseEnum): SDTMIG = "sdtmig" SENDIG = "sendig" ADAM = "adam" - CDASHIG = "cdashig" TIG = "tig" USDM = "usdm" diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index ab01a79e9..0c7903b55 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py 
+++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -1,6 +1,7 @@ import os from io import IOBase -from typing import List, Optional, Sequence +import functools +from typing import List, Sequence from datetime import datetime import re import pandas as pd @@ -123,33 +124,28 @@ def _get_dataset_name( return first_record.get("instanceType", dataset_filename.split(".")[0]) return dataset_filename.split(".")[0].upper() + @functools.lru_cache(maxsize=None) + def _get_datasets_worksheet(self) -> pd.DataFrame: + return pd.read_excel( + self.dataset_path, + sheet_name=DATASETS_SHEET_NAME, + na_values=[""], + keep_default_na=False, + ) + @cached_dataset(DatasetTypes.RAW_METADATA.value) def get_raw_dataset_metadata( self, dataset_name: str, - *, - cached_worksheet: Optional[pd.DataFrame] = None, **kwargs, ) -> SDTMDatasetMetadata: """ Returns dataset metadata as DatasetMetadata instance. - Pass cached_worksheet to avoid re-reading the Datasets sheet when - calling repeatedly (e.g. from get_datasets()). 
""" - if cached_worksheet is not None: - metadata = cached_worksheet[ - cached_worksheet[DATASET_FILENAME_COLUMN] == dataset_name - ] - else: - datasets_worksheet = pd.read_excel( - self.dataset_path, - sheet_name=DATASETS_SHEET_NAME, - na_values=[""], - keep_default_na=False, - ) - metadata = datasets_worksheet[ - datasets_worksheet[DATASET_FILENAME_COLUMN] == dataset_name - ] + datasets_worksheet = self._get_datasets_worksheet() + metadata = datasets_worksheet[ + datasets_worksheet[DATASET_FILENAME_COLUMN] == dataset_name + ] dataset = self.get_dataset(dataset_name=dataset_name) first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {} return SDTMDatasetMetadata( @@ -249,7 +245,7 @@ def get_datasets(self) -> List[dict]: ) datasets = [ - self.get_raw_dataset_metadata(dataset_name=fn, cached_worksheet=worksheet) + self.get_raw_dataset_metadata(dataset_name=fn) for fn in worksheet[DATASET_FILENAME_COLUMN] ] return datasets From aef4e6d9e370c81768cb6274dd0ebdf5980b788c Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 10 Mar 2026 13:46:27 -0400 Subject: [PATCH 08/10] moved script out of exceptions --- TestRule/__init__.py | 5 ++--- .../exceptions/custom_exceptions.py | 14 +------------- scripts/run_validation.py | 2 +- scripts/script_utils.py | 17 +++++++++++------ tests/unit/test_datasets_payload_validation.py | 2 +- 5 files changed, 16 insertions(+), 24 deletions(-) diff --git a/TestRule/__init__.py b/TestRule/__init__.py index a290d24b8..d0f88256d 100644 --- a/TestRule/__init__.py +++ b/TestRule/__init__.py @@ -7,11 +7,10 @@ from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator from scripts.run_validation import run_single_rule_validation from cdisc_rules_engine.exceptions.custom_exceptions import ( - CT_PACKAGE_NOT_FOUND_PREFIX, CTPackageNotFoundError, LibraryMetadataNotFoundError, - library_metadata_not_found_message, ) +from scripts.script_utils import library_metadata_not_found_message from 
cdisc_library_client.custom_exceptions import ( ResourceNotFoundException as LibraryResourceNotFoundException, ) @@ -144,7 +143,7 @@ def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: # asyncio.run(cache_populator.load_codelists(codelists or [])) except LibraryResourceNotFoundException: raise CTPackageNotFoundError( - f"{CT_PACKAGE_NOT_FOUND_PREFIX}: " + "Controlled terminology package(s) not found: " f"{', '.join(str(c) for c in (codelists or []))}." ) if not rule: diff --git a/cdisc_rules_engine/exceptions/custom_exceptions.py b/cdisc_rules_engine/exceptions/custom_exceptions.py index a29d791c9..669cb719c 100644 --- a/cdisc_rules_engine/exceptions/custom_exceptions.py +++ b/cdisc_rules_engine/exceptions/custom_exceptions.py @@ -40,15 +40,6 @@ class VariableMetadataNotFoundError(EngineError): ) -def library_metadata_not_found_message(standard, version, substandard=None): - version_display = (version or "").replace("-", ".") - sub_part = f" substandard {substandard}" if substandard else "" - return ( - f"No library metadata found for standard '{standard}' " - f"version '{version_display}'{sub_part}." - ) - - class LibraryMetadataNotFoundError(EngineError): code = 400 description = ( @@ -86,12 +77,9 @@ class ExcelTestDataError(EngineError): ) -CT_PACKAGE_NOT_FOUND_PREFIX = "Controlled terminology package(s) not found" - - class CTPackageNotFoundError(EngineError): code = 400 - description = f"{CT_PACKAGE_NOT_FOUND_PREFIX}." 
+ description = "Controlled terminology package(s) not found" class NumberOfAttemptsExceeded(EngineError): diff --git a/scripts/run_validation.py b/scripts/run_validation.py index 6ef3e4040..3b38028d3 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -20,7 +20,6 @@ from cdisc_rules_engine.rules_engine import RulesEngine from cdisc_rules_engine.exceptions.custom_exceptions import ( LibraryMetadataNotFoundError, - library_metadata_not_found_message, ) from cdisc_rules_engine.services import logger as engine_logger from cdisc_rules_engine.services.cache import ( @@ -44,6 +43,7 @@ set_max_errors_per_rule, ) from scripts.script_utils import ( + library_metadata_not_found_message, fill_cache_with_dictionaries, get_cache_service, get_library_metadata_from_cache, diff --git a/scripts/script_utils.py b/scripts/script_utils.py index 808f24f1f..fc5f1d4bd 100644 --- a/scripts/script_utils.py +++ b/scripts/script_utils.py @@ -29,9 +29,7 @@ ) from cdisc_rules_engine.exceptions.custom_exceptions import ( CTPackageNotFoundError, - CT_PACKAGE_NOT_FOUND_PREFIX, LibraryMetadataNotFoundError, - library_metadata_not_found_message, ) @@ -160,16 +158,14 @@ def get_library_metadata_from_cache(args) -> LibraryMetadataContainer: # noqa if args.define_xml_path: extensible_terms = define_xml_reader.get_extensible_codelist_mappings() ct_package_data["extensible"] = extensible_terms - requested_ct = set(args.controlled_terminology_package or []) - if args.define_xml_path and define_version.model_package == "define_2_1": - requested_ct |= define_referenced_ct + requested_ct = set(args.controlled_terminology_package or []) | define_referenced_ct missing_ct = requested_ct - published_ct_packages if missing_ct: sorted_missing = sorted( missing_ct, key=lambda x: (x is None, str(x) if x is not None else "") ) raise CTPackageNotFoundError( - f"{CT_PACKAGE_NOT_FOUND_PREFIX} in cache: " + "Controlled terminology package(s) not found in cache: " f"{', '.join(str(c) for c in 
sorted_missing)}." ) return LibraryMetadataContainer( @@ -616,3 +612,12 @@ def replace_yml_spaces(data): return [replace_yml_spaces(item) for item in data] else: return data + + +def library_metadata_not_found_message(standard, version, substandard=None): + version_display = (version or "").replace("-", ".") + sub_part = f" substandard {substandard}" if substandard else "" + return ( + f"No library metadata found for standard '{standard}' " + f"version '{version_display}'{sub_part}." + ) diff --git a/tests/unit/test_datasets_payload_validation.py b/tests/unit/test_datasets_payload_validation.py index a37729421..95e3bf626 100644 --- a/tests/unit/test_datasets_payload_validation.py +++ b/tests/unit/test_datasets_payload_validation.py @@ -12,8 +12,8 @@ from cdisc_rules_engine.exceptions.custom_exceptions import ( CTPackageNotFoundError, LibraryMetadataNotFoundError, - library_metadata_not_found_message, ) +from scripts.script_utils import library_metadata_not_found_message class _MockHttpResponse: From d879fcb7d39ab529598ef93ec30f9e1cc48057e0 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 10 Mar 2026 14:28:09 -0400 Subject: [PATCH 09/10] error text --- .../services/data_services/excel_data_service.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index 0c7903b55..b4c1c4f3e 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -216,9 +216,8 @@ def get_datasets(self) -> List[dict]: if DATASETS_SHEET_NAME not in sheet_names: available = ", ".join(repr(s) for s in sheet_names) or "(none)" raise ExcelTestDataError( - f"The workbook does not contain a sheet named " - f"'{DATASETS_SHEET_NAME}'. Make sure there is a 'Datasets' tab " - f"(case-sensitive). Available sheet names: {available}." 
+ f"The workbook does not contain a '{DATASETS_SHEET_NAME}' sheet. " + f"Submitted sheet names: {available}." ) worksheet = xl.parse( DATASETS_SHEET_NAME, From 55f8290283e5005e3e6fcbfeeecece67a8c7e37d Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Tue, 10 Mar 2026 15:03:45 -0400 Subject: [PATCH 10/10] test --- cdisc_rules_engine/enums/excel_test_sheets.py | 8 ++++ .../data_services/excel_data_service.py | 40 +++++++++---------- .../test_excel_data_service.py | 10 ++--- 3 files changed, 30 insertions(+), 28 deletions(-) create mode 100644 cdisc_rules_engine/enums/excel_test_sheets.py diff --git a/cdisc_rules_engine/enums/excel_test_sheets.py b/cdisc_rules_engine/enums/excel_test_sheets.py new file mode 100644 index 000000000..4e6d90581 --- /dev/null +++ b/cdisc_rules_engine/enums/excel_test_sheets.py @@ -0,0 +1,8 @@ +from cdisc_rules_engine.enums.base_enum import BaseEnum + + +class ExcelDataSheets(BaseEnum): + DATASETS_SHEET_NAME = "Datasets" + DATASET_FILENAME_COLUMN = "Filename" + DATASET_LABEL_COLUMN = "Label" + DATASETS_SHEET_REQUIRED_COLUMNS = ("Filename", "Label") diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index b4c1c4f3e..9127aeb89 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -20,15 +20,8 @@ DataReaderFactory, ) from .base_data_service import BaseDataService, cached_dataset - -DATASETS_SHEET_NAME = "Datasets" -DATASET_FILENAME_COLUMN = "Filename" -DATASET_LABEL_COLUMN = "Label" -DATASET_NAME_COLUMN = "Dataset Name" - -DATASETS_SHEET_REQUIRED_COLUMNS = ( - DATASET_FILENAME_COLUMN, - DATASET_LABEL_COLUMN, +from cdisc_rules_engine.enums.excel_test_sheets import ( + ExcelDataSheets, ) @@ -118,8 +111,6 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: def _get_dataset_name( self, metadata: pd.DataFrame, first_record: dict, 
dataset_filename: str ) -> str: - if DATASET_NAME_COLUMN in metadata.columns and not metadata.empty: - return metadata[DATASET_NAME_COLUMN].iloc[0] if self.standard == "usdm": return first_record.get("instanceType", dataset_filename.split(".")[0]) return dataset_filename.split(".")[0].upper() @@ -128,7 +119,7 @@ def _get_dataset_name( def _get_datasets_worksheet(self) -> pd.DataFrame: return pd.read_excel( self.dataset_path, - sheet_name=DATASETS_SHEET_NAME, + sheet_name=ExcelDataSheets.DATASETS_SHEET_NAME.value, na_values=[""], keep_default_na=False, ) @@ -144,14 +135,19 @@ def get_raw_dataset_metadata( """ datasets_worksheet = self._get_datasets_worksheet() metadata = datasets_worksheet[ - datasets_worksheet[DATASET_FILENAME_COLUMN] == dataset_name + datasets_worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value] + == dataset_name ] dataset = self.get_dataset(dataset_name=dataset_name) first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {} return SDTMDatasetMetadata( name=self._get_dataset_name(metadata, first_record, dataset_name), first_record=first_record, - label=metadata[DATASET_LABEL_COLUMN].iloc[0] if not metadata.empty else "", + label=( + metadata[ExcelDataSheets.DATASET_LABEL_COLUMN.value].iloc[0] + if not metadata.empty + else "" + ), modification_date=datetime.fromtimestamp( os.path.getmtime(self.dataset_path) ).isoformat(), @@ -213,14 +209,14 @@ def get_datasets(self) -> List[dict]: try: with pd.ExcelFile(self.dataset_path) as xl: sheet_names = xl.sheet_names - if DATASETS_SHEET_NAME not in sheet_names: + if ExcelDataSheets.DATASETS_SHEET_NAME.value not in sheet_names: available = ", ".join(repr(s) for s in sheet_names) or "(none)" raise ExcelTestDataError( - f"The workbook does not contain a '{DATASETS_SHEET_NAME}' sheet. " + f"The workbook does not contain a '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet. " f"Submitted sheet names: {available}." 
                )
            worksheet = xl.parse(
-                DATASETS_SHEET_NAME,
+                ExcelDataSheets.DATASETS_SHEET_NAME.value,
                na_values=[""],
                keep_default_na=False,
            )
@@ -233,19 +229,19 @@ def get_datasets(self) -> List[dict]:
            ) from e
 
        missing_cols = sorted(
-            set(DATASETS_SHEET_REQUIRED_COLUMNS) - set(worksheet.columns)
+            set(ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value)
+            - set(worksheet.columns)
        )
        if missing_cols:
            raise ExcelTestDataError(
-                f"The '{DATASETS_SHEET_NAME}' sheet is missing required column(s): "
+                f"The '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet is missing required "
+                f"column(s) from {ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value}: "
                f"{missing_cols}. Column headers are case-sensitive. "
-                f"Required: '{DATASET_FILENAME_COLUMN}', '{DATASET_LABEL_COLUMN}', "
-                f"and optionally '{DATASET_NAME_COLUMN}'."
            )
 
        datasets = [
            self.get_raw_dataset_metadata(dataset_name=fn)
-            for fn in worksheet[DATASET_FILENAME_COLUMN]
+            for fn in worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value]
        ]
        return datasets
diff --git a/tests/unit/test_services/test_data_service/test_excel_data_service.py b/tests/unit/test_services/test_data_service/test_excel_data_service.py
index 9648e2cd3..a75ca30d5 100644
--- a/tests/unit/test_services/test_data_service/test_excel_data_service.py
+++ b/tests/unit/test_services/test_data_service/test_excel_data_service.py
@@ -9,8 +9,8 @@
 from cdisc_rules_engine.config.config import ConfigService
 from cdisc_rules_engine.exceptions.custom_exceptions import ExcelTestDataError
 from cdisc_rules_engine.services.data_services import ExcelDataService
-from cdisc_rules_engine.services.data_services.excel_data_service import (
-    DATASETS_SHEET_NAME,
+from cdisc_rules_engine.enums.excel_test_sheets import (
+    ExcelDataSheets,
 )
 from cdisc_rules_engine.models.dataset import PandasDataset
 
@@ -217,8 +217,7 @@ def test_get_datasets_missing_datasets_sheet_raises_friendly_error():
             data_service.get_datasets()
 
         msg = str(exc_info.value)
-        assert DATASETS_SHEET_NAME in
msg or "Datasets" in msg - assert "case-sensitive" in msg + assert ExcelDataSheets.DATASETS_SHEET_NAME.value in msg finally: os.unlink(temp_path) @@ -235,7 +234,7 @@ def test_get_datasets_missing_label_column_raises_friendly_error(): try: wb = Workbook() datasets_sheet = wb.active - datasets_sheet.title = DATASETS_SHEET_NAME + datasets_sheet.title = ExcelDataSheets.DATASETS_SHEET_NAME.value datasets_sheet.append(["Filename", "label", "Dataset Name"]) datasets_sheet.append(["dm.xpt", "Demographics", "DM"]) wb.create_sheet("dm.xpt") @@ -261,7 +260,6 @@ def test_get_datasets_missing_label_column_raises_friendly_error(): msg = str(exc_info.value) assert "Label" in msg - assert "case-sensitive" in msg assert "column" in msg.lower() finally: os.unlink(temp_path)