Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,20 @@ To validate a folder using rules for SDTM-IG version 3.4 use the following comma

**_NOTE:_** Before running a validation in the CLI, you must first populate the cache with rules to validate against. See the update-cache command below.

#### Supported Dataset Formats

CORE supports the following dataset file formats for validation:

- **XPT** - SAS Transport Format (version 5)
- **JSON** - Dataset-JSON (CDISC standard format)
- **NDJSON** - Newline Delimited JSON datasets
- **XLSX** - Excel format (Microsoft Excel files)

**Important Notes:**

- Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset arguments (`-d`/`--data` for a directory or `-dp`/`--dataset-path` for individual files).
- If you point to a folder that contains no files in a supported format, CORE will display an error message listing the supported formats. Unsupported files found alongside supported ones are skipped with a warning.

##### **Validate single rule**

`python core.py validate -s sdtmig -v 3-4 -dp <path to dataset json file> -lr <path to rule json file> --meddra ./meddra/ --whodrug ./whodrug/`
Expand Down
12 changes: 11 additions & 1 deletion cdisc_rules_engine/services/data_services/local_data_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,17 @@ def read_metadata(
DataFormatTypes.JSON.value: DatasetJSONMetadataReader,
DataFormatTypes.NDJSON.value: DatasetNDJSONMetadataReader,
}
contents_metadata = _metadata_reader_map[file_name.split(".")[1].upper()](

file_extension = file_name.split(".")[1].upper()
if file_extension not in _metadata_reader_map:
supported_formats = ", ".join(_metadata_reader_map.keys())
raise ValueError(
f"Unsupported file format '{file_extension}' in file '{file_name}'.\n"
f"Supported formats: {supported_formats}\n"
f"Please provide dataset files in SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX) format."
)

contents_metadata = _metadata_reader_map[file_extension](
file_metadata["path"], file_name
).read()
return {
Expand Down
135 changes: 112 additions & 23 deletions core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths
from cdisc_rules_engine.enums.progress_parameter_options import ProgressParameterOptions
from cdisc_rules_engine.enums.report_types import ReportTypes
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
from cdisc_rules_engine.models.validation_args import Validation_args
from scripts.run_validation import run_validation
from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator
Expand All @@ -28,30 +27,130 @@
generate_report_filename,
get_rules_cache_key,
)
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
from scripts.list_dataset_metadata_handler import list_dataset_metadata_handler
from version import __version__

# Human-readable list of dataset formats accepted for validation,
# interpolated into CLI help text and error messages.
VALIDATION_FORMATS_MESSAGE = "SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)"


def valid_data_file(data_path: list) -> tuple[list, set]:
    """Filter *data_path* down to files in a supported dataset format.

    Args:
        data_path: candidate file paths (absolute or relative).

    Returns:
        Tuple of (accepted file paths, set of uppercase extensions found).
        The file list is empty when no supported files were found, or when
        the XLSX single-file constraint is violated (XLSX mixed with other
        formats, or more than one XLSX file); in those cases the returned
        format set lets callers report a specific error.
    """
    allowed_formats = [
        DataFormatTypes.XPT.value,
        DataFormatTypes.JSON.value,
        DataFormatTypes.NDJSON.value,
        DataFormatTypes.XLSX.value,
    ]
    found_formats = set()
    file_list = []
    ignored_files = []

    for file in data_path:
        # splitext keeps the leading dot; strip it and normalize case.
        file_extension = os.path.splitext(file)[1][1:].upper()
        if file_extension in allowed_formats:
            found_formats.add(file_extension)
            file_list.append(file)
        elif file_extension:
            # Unsupported extension: collect so we can warn once below.
            ignored_files.append(os.path.basename(file))

    if ignored_files:
        logger = logging.getLogger("validator")
        # Cap the listing at five names to keep the warning readable.
        logger.warning(
            f"Ignoring {len(ignored_files)} file(s) with unsupported formats: {', '.join(ignored_files[:5])}"
            + ("..." if len(ignored_files) > 5 else "")
        )

    # Excel validation only supports a single file: reject mixed formats
    # and multi-XLSX inputs, returning the formats so callers can say why.
    if DataFormatTypes.XLSX.value in found_formats:
        if len(found_formats) > 1:
            return [], found_formats
        elif len(file_list) > 1:
            return [], found_formats
        else:
            return file_list, found_formats
    if len(found_formats) >= 1:
        return file_list, found_formats
    else:
        return [], set()


@click.group()
def cli():
    # Root Click command group for the CORE CLI; subcommands (e.g. the
    # `validate` command below) are registered onto this entry point.
    # Intentionally empty: a group body runs before any subcommand, and
    # no shared setup is needed here. Left as a bare comment (not a
    # docstring) so click's generated help text is unchanged.
    pass


def _validate_data_directory(data: str, logger) -> tuple[list, set]:
    """Validate data directory and return dataset paths and found formats.

    Logs a descriptive error and returns ``(None, None)`` when the
    directory violates the XLSX single-file constraint or contains no
    supported dataset files.
    """
    candidate_files = [
        str(entry) for entry in Path(data).rglob("*") if entry.is_file()
    ]
    dataset_paths, found_formats = valid_data_file(candidate_files)
    has_xlsx = DataFormatTypes.XLSX.value in found_formats

    # XLSX mixed with any other format is rejected outright.
    if has_xlsx and len(found_formats) > 1:
        logger.error(
            f"Argument --data contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n"
            f"Excel format (XLSX) validation only supports single files.\n"
            f"Please provide either a single XLSX file or use other supported formats: "
            f"{VALIDATION_FORMATS_MESSAGE}"
        )
        return None, None

    if dataset_paths:
        return dataset_paths, found_formats

    # Nothing usable: distinguish "too many XLSX files" from "no
    # supported files at all" so the user gets an actionable message.
    if has_xlsx and len(found_formats) == 1:
        logger.error(
            f"Multiple XLSX files found in directory: {data}\n"
            f"Excel format (XLSX) validation only supports single files.\n"
            f"Please provide either a single XLSX file or use other supported formats: "
            f"{VALIDATION_FORMATS_MESSAGE}"
        )
    else:
        logger.error(
            f"No valid dataset files found in directory: {data}\n"
            f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n"
            f"Please ensure your directory contains files in one of these formats."
        )
    return None, None


def _validate_dataset_paths(dataset_path: tuple[str], logger) -> tuple[list, set]:
    """Validate dataset paths and return dataset paths and found formats.

    Args:
        dataset_path: file paths given via repeated ``--dataset-path`` options.
        logger: logger used to report validation errors to the user.

    Returns:
        ``(dataset_paths, found_formats)`` on success, or ``(None, None)``
        after logging an error when the files violate the XLSX single-file
        constraint or none of them are in a supported format.
    """
    # C416: identity comprehension replaced with list() over the tuple.
    dataset_paths, found_formats = valid_data_file(list(dataset_path))

    # XLSX mixed with any other format is rejected outright.
    if DataFormatTypes.XLSX.value in found_formats and len(found_formats) > 1:
        logger.error(
            f"Argument --dataset-path contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n"
            f"Excel format (XLSX) validation only supports single files.\n"
            f"Please provide either a single XLSX file or use other supported formats: "
            f"{VALIDATION_FORMATS_MESSAGE}"
        )
        return None, None

    if not dataset_paths:
        # Distinguish "too many XLSX files" from "no supported files".
        if DataFormatTypes.XLSX.value in found_formats and len(found_formats) == 1:
            logger.error(
                f"Multiple XLSX files provided.\n"
                f"Excel format (XLSX) validation only supports single files.\n"
                f"Please provide either a single XLSX file or use other supported formats: "
                f"{VALIDATION_FORMATS_MESSAGE}"
            )
        else:
            logger.error(
                f"No valid dataset files provided.\n"
                f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n"
                f"Please ensure your files are in one of these formats."
            )
        return None, None

    return dataset_paths, found_formats


def _validate_no_arguments(logger) -> None:
"""Validate that at least one dataset argument is provided."""
logger.error("You must pass one of the following arguments: --dataset-path, --data")


@click.command()
@click.option(
"-ca",
Expand All @@ -70,14 +169,14 @@ def cli():
"-d",
"--data",
required=False,
help="Path to directory containing data files",
help=f"Path to directory containing data files ({VALIDATION_FORMATS_MESSAGE})",
)
@click.option(
"-dp",
"--dataset-path",
required=False,
multiple=True,
help="Absolute path to dataset file",
help=f"Absolute path to dataset file ({VALIDATION_FORMATS_MESSAGE})",
)
@click.option(
"-l",
Expand Down Expand Up @@ -323,32 +422,22 @@ def validate(
},
}
)
# Validate dataset arguments
if data:
if dataset_path:
logger.error(
"Argument --dataset-path cannot be used together with argument --data"
)
ctx.exit(2)
dataset_paths, found_formats = valid_data_file(
[str(p) for p in Path(data).rglob("*") if p.is_file()]
)
if len(found_formats) > 1:
logger.error(
f"Argument --data contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501
)
dataset_paths, found_formats = _validate_data_directory(data, logger)
if dataset_paths is None:
ctx.exit(2)
elif dataset_path:
dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path])
if len(found_formats) > 1:
logger.error(
f"Argument --dataset-path contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501
)
dataset_paths, found_formats = _validate_dataset_paths(dataset_path, logger)
if dataset_paths is None:
ctx.exit(2)
else:
logger.error(
"You must pass one of the following arguments: --dataset-path, --data"
)
# no need to define dataset_paths here, the program execution will stop
_validate_no_arguments(logger)
ctx.exit(2)
validate_xml_bool = True if validate_xml.lower() in ("y", "yes") else False
run_validation(
Expand Down
20 changes: 20 additions & 0 deletions scripts/list_dataset_metadata_handler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Tuple, List

from cdisc_rules_engine.config import config
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata
from cdisc_rules_engine.serializers import DatasetMetadataSerializer
from cdisc_rules_engine.services.cache import CacheServiceFactory
Expand Down Expand Up @@ -30,6 +31,25 @@ def list_dataset_metadata_handler(dataset_paths: Tuple[str]) -> List[dict]:
...
]
"""
invalid_files = []

for path in dataset_paths:
file_ext = path.split(".")[-1].upper()
if file_ext not in [
DataFormatTypes.XPT.value,
DataFormatTypes.JSON.value,
DataFormatTypes.NDJSON.value,
DataFormatTypes.XLSX.value,
]:
invalid_files.append((path, file_ext))

if invalid_files:
error_msg = "Unsupported file format(s) detected:\n"
for file, ext in invalid_files:
error_msg += f" - {file} (format: {ext})\n"
error_msg += "\nSupported formats: SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)"
raise ValueError(error_msg)

cache_service = CacheServiceFactory(config).get_service()
data_service = DataServiceFactory(config, cache_service).get_service()
metadata: List[SDTMDatasetMetadata] = [
Expand Down
Loading
Loading