Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
6ad14ad
#1616: User-friendly errors for missing/wrong Datasets tab and column…
RakeshBobba03 Feb 18, 2026
b2c8c88
Add USDM to StandardTypes enum
RakeshBobba03 Feb 18, 2026
b6205fe
Do not raise LibraryMetadataNotFoundError for USDM when standard meta…
RakeshBobba03 Feb 18, 2026
8daff55
Merge branch 'main' into 1616-datasets-library-validation-errors
RakeshBobba03 Feb 24, 2026
dc76434
Remove datasets key reporting, inline Datasets guidance, add CT packa…
RakeshBobba03 Feb 24, 2026
14f1395
Merge branch 'main' into 1616-datasets-library-validation-errors
RakeshBobba03 Feb 25, 2026
34286bc
Merge branch 'main' into 1616-datasets-library-validation-errors
RakeshBobba03 Feb 26, 2026
cc22934
Improve Library/Datasets/CT validation errors; remove hint text, move…
RakeshBobba03 Feb 26, 2026
1d5f6d2
Merge branch '1616-datasets-library-validation-errors' of https://git…
SFJohnson24 Mar 2, 2026
9208207
Merge branch 'main' into 1616-datasets-library-validation-errors
RakeshBobba03 Mar 5, 2026
90ecaba
Remove dataset wrapper, centralize CT check, add cached_worksheet to …
RakeshBobba03 Mar 5, 2026
b4d92b1
Merge branch '1616-datasets-library-validation-errors' of https://git…
SFJohnson24 Mar 5, 2026
f8b087b
Merge branch 'main' into 1616-datasets-library-validation-errors
SFJohnson24 Mar 5, 2026
a8d4c4c
cache & cdash
SFJohnson24 Mar 10, 2026
aef4e6d
moved script out of exceptions
SFJohnson24 Mar 10, 2026
d879fcb
error text
SFJohnson24 Mar 10, 2026
55f8290
test
SFJohnson24 Mar 10, 2026
80bfe5d
Merge branch 'main' into 1616-datasets-library-validation-errors
SFJohnson24 Mar 10, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 56 additions & 13 deletions TestRule/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,14 @@
from cdisc_rules_engine.services.cdisc_library_service import CDISCLibraryService
from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator
from scripts.run_validation import run_single_rule_validation
from cdisc_rules_engine.exceptions.custom_exceptions import (
CTPackageNotFoundError,
LibraryMetadataNotFoundError,
)
from scripts.script_utils import library_metadata_not_found_message
from cdisc_library_client.custom_exceptions import (
ResourceNotFoundException as LibraryResourceNotFoundException,
)
import json
import os
import asyncio
Expand All @@ -17,11 +25,13 @@ class BadRequestError(Exception):
pass


_REQUIRED_DATASET_KEYS = {"filename", "label", "domain", "records", "variables"}


def validate_datasets_payload(datasets):
required_keys = {"filename", "label", "domain", "records", "variables"}
missing_keys = set()
for dataset in datasets:
for key in required_keys:
for key in _REQUIRED_DATASET_KEYS:
if key not in dataset:
missing_keys.add(key)

Expand All @@ -32,19 +42,39 @@ def validate_datasets_payload(datasets):
)

if missing_keys:
raise KeyError(
f"one or more datasets missing the following keys {missing_keys}"
)
raise BadRequestError("Test data is incorrect and missing required formatting.")


def handle_exception(e: Exception):
if isinstance(e, KeyError):
if isinstance(e, BadRequestError):
return func.HttpResponse(
json.dumps({"error": "BadRequestError", "message": str(e)}),
status_code=400,
)
if isinstance(e, LibraryMetadataNotFoundError):
msg = getattr(e, "message", None) or getattr(e, "description", None) or str(e)
return func.HttpResponse(
json.dumps(
{
"error": "LibraryMetadataNotFoundError",
"message": msg,
}
),
status_code=400,
)
if isinstance(e, CTPackageNotFoundError):
msg = getattr(e, "message", None) or getattr(e, "description", None) or str(e)
return func.HttpResponse(
json.dumps({"error": "KeyError", "message": str(e)}), status_code=400
json.dumps({"error": "CTPackageNotFoundError", "message": msg}),
status_code=400,
)
elif isinstance(e, BadRequestError):
if isinstance(e, KeyError):
msg = str(e)
if "rule" in msg.lower() or "datasets" in msg.lower():
msg = f"{msg} Ensure the request body includes the required JSON keys."
return func.HttpResponse(
json.dumps({"error": "BadRequestError", "message": str(e)}), status_code=400
json.dumps({"error": "BadRequestError", "message": msg}),
status_code=400,
)
else:
return func.HttpResponse(
Expand Down Expand Up @@ -97,12 +127,25 @@ def main(req: func.HttpRequest, context: func.Context) -> func.HttpResponse: #
asyncio.run(cache_populator.load_available_ct_packages())
if standards_data or codelists:
if standards_data:
asyncio.run(
cache_populator.load_standard(
standard, standard_version, standard_substandard
try:
asyncio.run(
cache_populator.load_standard(
standard, standard_version, standard_substandard
)
)
except LibraryResourceNotFoundException:
raise LibraryMetadataNotFoundError(
library_metadata_not_found_message(
standard, standard_version, standard_substandard
)
)
try:
asyncio.run(cache_populator.load_codelists(codelists or []))
except LibraryResourceNotFoundException:
raise CTPackageNotFoundError(
"Controlled terminology package(s) not found: "
f"{', '.join(str(c) for c in (codelists or []))}."
)
asyncio.run(cache_populator.load_codelists(codelists))
if not rule:
raise KeyError("'rule' required in request")
datasets = json_data.get("datasets")
Expand Down
8 changes: 8 additions & 0 deletions cdisc_rules_engine/enums/excel_test_sheets.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from cdisc_rules_engine.enums.base_enum import BaseEnum


class ExcelDataSheets(BaseEnum):
    """Sheet and column names expected in an Excel test-data workbook.

    Names are case-sensitive and must match the workbook exactly
    (see ExcelTestDataError, raised when they are missing).
    """

    # Worksheet listing the datasets contained in the workbook.
    DATASETS_SHEET_NAME = "Datasets"
    # Column holding each dataset's file name; rows are looked up by it.
    DATASET_FILENAME_COLUMN = "Filename"
    # Column holding each dataset's human-readable label.
    DATASET_LABEL_COLUMN = "Label"
    # Column headers that must all be present on the "Datasets" sheet.
    DATASETS_SHEET_REQUIRED_COLUMNS = ("Filename", "Label")
11 changes: 11 additions & 0 deletions cdisc_rules_engine/enums/standard_types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from cdisc_rules_engine.enums.base_enum import BaseEnum


class StandardTypes(BaseEnum):
    """Standards supported by CDISC Library; used for CLI validation when not using --custom-standard."""

    # Values are lowercase: the CLI lowercases the user-supplied standard
    # before checking membership against StandardTypes.values().
    SDTMIG = "sdtmig"
    SENDIG = "sendig"
    ADAM = "adam"
    TIG = "tig"
    USDM = "usdm"
20 changes: 20 additions & 0 deletions cdisc_rules_engine/exceptions/custom_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,13 @@ class VariableMetadataNotFoundError(EngineError):
)


class LibraryMetadataNotFoundError(EngineError):
    """Raised when CDISC Library has no metadata for the requested
    standard/version (e.g. the Library returns a resource-not-found
    response while loading a standard)."""

    # Error code surfaced to callers; presumably mapped to HTTP 400 — confirm with API layer.
    code = 400
    # Default human-readable description when no specific message is supplied.
    description = (
        "Library metadata not found for the provided standard and version combination."
    )


class DomainNotFoundError(EngineError):
"""Raised when a required domain is not found in the dataset"""

Expand All @@ -62,6 +69,19 @@ class InvalidJSONFormat(EngineError):
description = "JSON data is malformed."


class ExcelTestDataError(EngineError):
    """Raised when an Excel test-data workbook cannot be read or lacks the
    required 'Datasets' sheet / column headers."""

    # Error code surfaced to callers; presumably mapped to HTTP 400 — confirm with API layer.
    code = 400
    # Default human-readable description when no specific message is supplied.
    description = (
        "Excel test data file is missing required sheets or column headers. "
        "Sheet and column names are case-sensitive."
    )


class CTPackageNotFoundError(EngineError):
    """Raised when one or more requested controlled-terminology packages
    cannot be found (e.g. while loading codelists from CDISC Library)."""

    # Error code surfaced to callers; presumably mapped to HTTP 400 — confirm with API layer.
    code = 400
    # Default human-readable description when no specific message is supplied.
    description = "Controlled terminology package(s) not found"


class NumberOfAttemptsExceeded(EngineError):
pass

Expand Down
90 changes: 58 additions & 32 deletions cdisc_rules_engine/services/data_services/excel_data_service.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from io import IOBase
import functools
from typing import List, Sequence
from datetime import datetime
import re
Expand All @@ -14,16 +15,14 @@
from cdisc_rules_engine.models.variable_metadata_container import (
VariableMetadataContainer,
)
from cdisc_rules_engine.services import logger
from cdisc_rules_engine.exceptions.custom_exceptions import ExcelTestDataError
from cdisc_rules_engine.services.data_readers.data_reader_factory import (
DataReaderFactory,
)
from .base_data_service import BaseDataService, cached_dataset

DATASETS_SHEET_NAME = "Datasets"
DATASET_FILENAME_COLUMN = "Filename"
DATASET_LABEL_COLUMN = "Label"
DATASET_NAME_COLUMN = "Dataset Name"
from cdisc_rules_engine.enums.excel_test_sheets import (
ExcelDataSheets,
)


class ExcelDataService(BaseDataService):
Expand Down Expand Up @@ -112,34 +111,43 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface:
def _get_dataset_name(
self, metadata: pd.DataFrame, first_record: dict, dataset_filename: str
) -> str:
if DATASET_NAME_COLUMN in metadata.columns and not metadata.empty:
return metadata[DATASET_NAME_COLUMN].iloc[0]
if self.standard == "usdm":
return first_record.get("instanceType", dataset_filename.split(".")[0])
return dataset_filename.split(".")[0].upper()

@functools.lru_cache(maxsize=None)
def _get_datasets_worksheet(self) -> pd.DataFrame:
return pd.read_excel(
self.dataset_path,
sheet_name=ExcelDataSheets.DATASETS_SHEET_NAME.value,
na_values=[""],
keep_default_na=False,
)

@cached_dataset(DatasetTypes.RAW_METADATA.value)
def get_raw_dataset_metadata(
self, dataset_name: str, **kwargs
self,
dataset_name: str,
**kwargs,
) -> SDTMDatasetMetadata:
"""
Returns dataset metadata as DatasetMetadata instance.
"""
datasets_worksheet = pd.read_excel(
self.dataset_path,
sheet_name=DATASETS_SHEET_NAME,
na_values=[""],
keep_default_na=False,
)
datasets_worksheet = self._get_datasets_worksheet()
metadata = datasets_worksheet[
datasets_worksheet[DATASET_FILENAME_COLUMN] == dataset_name
datasets_worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value]
== dataset_name
]
dataset = self.get_dataset(dataset_name=dataset_name)
first_record = dataset.data.iloc[0].to_dict() if not dataset.empty else {}
return SDTMDatasetMetadata(
name=self._get_dataset_name(metadata, first_record, dataset_name),
first_record=first_record,
label=metadata[DATASET_LABEL_COLUMN].iloc[0] if not metadata.empty else "",
label=(
metadata[ExcelDataSheets.DATASET_LABEL_COLUMN.value].iloc[0]
if not metadata.empty
else ""
),
modification_date=datetime.fromtimestamp(
os.path.getmtime(self.dataset_path)
).isoformat(),
Expand Down Expand Up @@ -199,23 +207,41 @@ def read_data(self, file_path: str) -> IOBase:

def get_datasets(self) -> List[dict]:
try:
worksheet = pd.read_excel(
self.dataset_path,
sheet_name=DATASETS_SHEET_NAME,
na_values=[""],
keep_default_na=False,
)
except TypeError as e:
logger.error(
f"Failed to read datasets from the Excel file at {self.dataset_path}. "
f"Ensure the file is in the correct format. "
f"Try opening and saving the file in Microsoft Excel. "
f"Error: {str(e)}"
)
with pd.ExcelFile(self.dataset_path) as xl:
sheet_names = xl.sheet_names
if ExcelDataSheets.DATASETS_SHEET_NAME.value not in sheet_names:
available = ", ".join(repr(s) for s in sheet_names) or "(none)"
raise ExcelTestDataError(
f"The workbook does not contain a '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet. "
f"Submitted sheet names: {available}."
)
worksheet = xl.parse(
ExcelDataSheets.DATASETS_SHEET_NAME.value,
na_values=[""],
keep_default_na=False,
)
except ExcelTestDataError:
raise
except Exception as e:
raise ExcelTestDataError(
f"Cannot read the Excel file. Ensure it is a valid .xlsx workbook. "
f"Details: {e}"
) from e

missing_cols = sorted(
set(ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value)
- set(worksheet.columns)
)
if missing_cols:
raise ExcelTestDataError(
f"The '{ExcelDataSheets.DATASETS_SHEET_NAME.value}' sheet is missing a "
f"required {ExcelDataSheets.DATASETS_SHEET_REQUIRED_COLUMNS.value} column(s): "
f"{missing_cols}. Column headers are case-sensitive. "
)

datasets = [
self.get_raw_dataset_metadata(dataset_name=dataset_filename)
for dataset_filename in worksheet[DATASET_FILENAME_COLUMN]
self.get_raw_dataset_metadata(dataset_name=fn)
for fn in worksheet[ExcelDataSheets.DATASET_FILENAME_COLUMN.value]
]
return datasets

Expand Down
28 changes: 6 additions & 22 deletions cdisc_rules_engine/services/data_services/local_data_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,11 @@
convert_file_size,
extract_file_name_from_path_string,
)
from cdisc_rules_engine.exceptions.custom_exceptions import InvalidDatasetFormat
from .base_data_service import BaseDataService, cached_dataset
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface
from cdisc_rules_engine.models.dataset import PandasDataset
from cdisc_rules_engine.services import logger
import re


Expand Down Expand Up @@ -244,28 +244,12 @@ def get_datasets(self) -> List[dict]:
dataset_name=dataset_path
)
datasets.append(dataset_metadata)
except InvalidDatasetFormat:
raise
except Exception as e:
logger.error(
f"Failed to read metadata for dataset {dataset_path}. "
f"Error: {type(e).__name__}: {e}. Skipping this dataset."
)
file_name = extract_file_name_from_path_string(dataset_path)
datasets.append(
SDTMDatasetMetadata(
name=(
file_name.split(".")[0].upper()
if "." in file_name
else file_name.upper()
),
first_record={},
label="",
modification_date="",
filename=file_name,
full_path=dataset_path,
file_size=0,
record_count=0,
)
)
raise InvalidDatasetFormat(
f"Your data file could not be read: {dataset_path}."
) from e
return datasets

@staticmethod
Expand Down
10 changes: 10 additions & 0 deletions core.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths
from cdisc_rules_engine.enums.progress_parameter_options import ProgressParameterOptions
from cdisc_rules_engine.enums.report_types import ReportTypes
from cdisc_rules_engine.enums.standard_types import StandardTypes
from cdisc_rules_engine.models.external_dictionaries_container import (
DictionaryTypes,
ExternalDictionariesContainer,
Expand Down Expand Up @@ -478,6 +479,15 @@ def validate( # noqa

if not custom_standard:
standard = standard.lower()
supported_standards = StandardTypes.values()
if standard not in supported_standards:
supported_list = ", ".join(sorted(supported_standards))
logger.error(
f"Standard '{standard}' is not a supported standard. "
f"Supported standards: {supported_list}. "
f"Use --custom-standard flag for custom standards."
)
ctx.exit(2)

if raw_report is True:
if not (len(output_format) == 1 and output_format[0] == ReportTypes.JSON.value):
Expand Down
Loading
Loading