diff --git a/README.md b/README.md index 3223d7c3b..b2070e719 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,20 @@ To validate a folder using rules for SDTM-IG version 3.4 use the following comma **_NOTE:_** Before running a validation in the CLI, you must first populate the cache with rules to validate against. See the update-cache command below. +#### Supported Dataset Formats + +CORE supports the following dataset file formats for validation: + +- **XPT** - SAS Transport Format (version 5) +- **JSON** - Dataset-JSON (CDISC standard format) +- **NDJSON** - Newline Delimited JSON datasets +- **XLSX** - Excel format (Microsoft Excel files) + +**Important Notes:** + +- Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset directory (`-d` or `-dp`). +- If you point to a folder containing unsupported file formats, CORE will display an error message indicating which formats are supported. + ##### **Validate single rule** `python core.py validate -s sdtmig -v 3-4 -dp -lr --meddra ./meddra/ --whodrug ./whodrug/` diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 0de48abdb..ace053e2d 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -184,7 +184,17 @@ def read_metadata( DataFormatTypes.JSON.value: DatasetJSONMetadataReader, DataFormatTypes.NDJSON.value: DatasetNDJSONMetadataReader, } - contents_metadata = _metadata_reader_map[file_name.split(".")[1].upper()]( + + file_extension = file_name.split(".")[1].upper() + if file_extension not in _metadata_reader_map: + supported_formats = ", ".join(_metadata_reader_map.keys()) + raise ValueError( + f"Unsupported file format '{file_extension}' in file '{file_name}'.\n" + f"Supported formats: {supported_formats}\n" + f"Please provide dataset files in SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX) format." + ) + + contents_metadata = _metadata_reader_map[file_extension]( file_metadata["path"], file_name ).read() return { diff --git a/core.py b/core.py index 55ec341e7..5dfc84bcb 100644 --- a/core.py +++ b/core.py @@ -14,7 +14,6 @@ from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths from cdisc_rules_engine.enums.progress_parameter_options import ProgressParameterOptions from cdisc_rules_engine.enums.report_types import ReportTypes -from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.validation_args import Validation_args from scripts.run_validation import run_validation from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator @@ -28,23 +27,52 @@ generate_report_filename, get_rules_cache_key, ) +from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from scripts.list_dataset_metadata_handler import list_dataset_metadata_handler from version import __version__ +VALIDATION_FORMATS_MESSAGE = ( + "SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)" +) + def valid_data_file(data_path: list) -> tuple[list, set]: - allowed_formats = [format.value for format in DataFormatTypes] + allowed_formats = [ + DataFormatTypes.XPT.value, + DataFormatTypes.JSON.value, + DataFormatTypes.NDJSON.value, + DataFormatTypes.XLSX.value, + ] found_formats = set() file_list = [] + ignored_files = [] + for file in data_path: file_extension = os.path.splitext(file)[1][1:].upper() if file_extension in allowed_formats: found_formats.add(file_extension) file_list.append(file) - if len(found_formats) > 1: - return [], found_formats - elif len(found_formats) == 1: + elif file_extension: + ignored_files.append(os.path.basename(file)) + + if ignored_files: + logger = logging.getLogger("validator") + logger.warning( + f"Ignoring {len(ignored_files)} file(s) with unsupported formats: {', '.join(ignored_files[:5])}" + + ("..." if len(ignored_files) > 5 else "") + ) + + if DataFormatTypes.XLSX.value in found_formats: + if len(found_formats) > 1: + return [], found_formats + elif len(file_list) > 1: + return [], found_formats + else: + return file_list, found_formats + if len(found_formats) >= 1: return file_list, found_formats + else: + return [], set() @click.group() @@ -52,6 +80,77 @@ def cli(): pass +def _validate_data_directory(data: str, logger) -> tuple[list, set]: + """Validate data directory and return dataset paths and found formats.""" + dataset_paths, found_formats = valid_data_file( + [str(p) for p in Path(data).rglob("*") if p.is_file()] + ) + + if DataFormatTypes.XLSX.value in found_formats and len(found_formats) > 1: + logger.error( + f"Argument --data contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + return None, None + + if not dataset_paths: + if DataFormatTypes.XLSX.value in found_formats and len(found_formats) == 1: + logger.error( + f"Multiple XLSX files found in directory: {data}\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + else: + logger.error( + f"No valid dataset files found in directory: {data}\n" + f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" + f"Please ensure your directory contains files in one of these formats." + ) + return None, None + + return dataset_paths, found_formats + + +def _validate_dataset_paths(dataset_path: tuple[str], logger) -> tuple[list, set]: + """Validate dataset paths and return dataset paths and found formats.""" + dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path]) + + if DataFormatTypes.XLSX.value in found_formats and len(found_formats) > 1: + logger.error( + f"Argument --dataset-path contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + return None, None + + if not dataset_paths: + if DataFormatTypes.XLSX.value in found_formats and len(found_formats) == 1: + logger.error( + f"Multiple XLSX files provided.\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + else: + logger.error( + f"No valid dataset files provided.\n" + f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" + f"Please ensure your files are in one of these formats." + ) + return None, None + + return dataset_paths, found_formats + + +def _validate_no_arguments(logger) -> None: + """Validate that at least one dataset argument is provided.""" + logger.error("You must pass one of the following arguments: --dataset-path, --data") + + @click.command() @click.option( "-ca", @@ -70,14 +169,14 @@ def cli(): "-d", "--data", required=False, - help="Path to directory containing data files", + help=f"Path to directory containing data files ({VALIDATION_FORMATS_MESSAGE})", ) @click.option( "-dp", "--dataset-path", required=False, multiple=True, - help="Absolute path to dataset file", + help=f"Absolute path to dataset file ({VALIDATION_FORMATS_MESSAGE})", ) @click.option( "-l", @@ -323,32 +422,22 @@ def validate( }, } ) + # Validate dataset arguments if data: if dataset_path: logger.error( "Argument --dataset-path cannot be used together with argument --data" ) ctx.exit(2) - dataset_paths, found_formats = valid_data_file( - [str(p) for p in Path(data).rglob("*") if p.is_file()] - ) - if len(found_formats) > 1: - logger.error( - f"Argument --data contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 - ) + dataset_paths, found_formats = _validate_data_directory(data, logger) + if dataset_paths is None: ctx.exit(2) elif dataset_path: - dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path]) - if len(found_formats) > 1: - logger.error( - f"Argument --dataset-path contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 - ) + dataset_paths, found_formats = _validate_dataset_paths(dataset_path, logger) + if dataset_paths is None: ctx.exit(2) else: - logger.error( - "You must pass one of the following arguments: --dataset-path, --data" - ) - # no need to define dataset_paths here, the program execution will stop + _validate_no_arguments(logger) ctx.exit(2) validate_xml_bool = True if validate_xml.lower() in ("y", "yes") else False run_validation( diff --git a/scripts/list_dataset_metadata_handler.py b/scripts/list_dataset_metadata_handler.py index 94b316126..053713a5d 100644 --- a/scripts/list_dataset_metadata_handler.py +++ b/scripts/list_dataset_metadata_handler.py @@ -1,6 +1,7 @@ from typing import Tuple, List from cdisc_rules_engine.config import config +from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.serializers import DatasetMetadataSerializer from cdisc_rules_engine.services.cache import CacheServiceFactory @@ -30,6 +31,25 @@ def list_dataset_metadata_handler(dataset_paths: Tuple[str]) -> List[dict]: ... ] """ + invalid_files = [] + + for path in dataset_paths: + file_ext = path.split(".")[-1].upper() + if file_ext not in [ + DataFormatTypes.XPT.value, + DataFormatTypes.JSON.value, + DataFormatTypes.NDJSON.value, + DataFormatTypes.XLSX.value, + ]: + invalid_files.append((path, file_ext)) + + if invalid_files: + error_msg = "Unsupported file format(s) detected:\n" + for file, ext in invalid_files: + error_msg += f" - {file} (format: {ext})\n" + error_msg += "\nSupported formats: SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)" + raise ValueError(error_msg) + cache_service = CacheServiceFactory(config).get_service() data_service = DataServiceFactory(config, cache_service).get_service() metadata: List[SDTMDatasetMetadata] = [ diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py new file mode 100644 index 000000000..3c94be9e9 --- /dev/null +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py @@ -0,0 +1,240 @@ +import subprocess +import os +import tempfile +import shutil +import pytest +from conftest import get_python_executable + + +@pytest.mark.regression +def test_multiple_xlsx_files_shows_helpful_error(): + """Test that multiple XLSX files show a helpful error message about single file limitation""" + with tempfile.TemporaryDirectory() as temp_dir: + xlsx_files = ["ae.xlsx", "dm.xlsx"] + + for filename in xlsx_files: + open(os.path.join(temp_dir, filename), "w").close() + + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + assert ( + process.returncode != 0 + ), "Expected non-zero exit code for multiple XLSX files" + assert "KeyError" not in stderr_text, "Should not show KeyError crash" + assert ( + "Failed to execute script" not in stderr_text + ), "Should not show script execution failure" + assert ( + "Multiple XLSX files found" in stderr_text + or "Multiple XLSX files found" in stdout_text + ), f"Expected helpful error message about multiple XLSX files. stderr: {stderr_text}, stdout: {stdout_text}" + assert ( + "Excel format (XLSX) validation only supports single files" in stderr_text + or "Excel format (XLSX) validation only supports single files" + in stdout_text + ), "Expected explanation of XLSX limitation" + + +@pytest.mark.regression +def test_folder_with_xlsx_files_works_with_excel_service(): + """Test that a folder with a single XLSX file now works with ExcelDataService""" + with tempfile.TemporaryDirectory() as temp_dir: + xlsx_files = ["ae.xlsx"] + + for filename in xlsx_files: + open(os.path.join(temp_dir, filename), "w").close() + + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + assert "KeyError: 'XLSX'" not in stderr_text, "Should not show KeyError crash" + assert ( + "Failed to execute script" not in stderr_text + ), "Should not show script execution failure" + + assert ( + "No valid dataset files found" not in stderr_text + and "No valid dataset files found" not in stdout_text + ), "XLSX files should now be recognized as valid" + + +@pytest.mark.regression +def test_folder_with_unsupported_formats_shows_helpful_error(): + """Test that folders with truly unsupported formats (like PDF) show helpful error messages""" + with tempfile.TemporaryDirectory() as temp_dir: + unsupported_files = ["ae.pdf", "dm.txt"] + + for filename in unsupported_files: + open(os.path.join(temp_dir, filename), "w").close() + + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + assert ( + process.returncode != 0 + ), "Expected non-zero exit code for unsupported files" + assert "KeyError" not in stderr_text, "Should not show KeyError crash" + assert ( + "Failed to execute script" not in stderr_text + ), "Should not show script execution failure" + assert ( + "No valid dataset files found" in stderr_text + or "No valid dataset files found" in stdout_text + ), f"Expected helpful error message. stderr: {stderr_text}, stdout: {stdout_text}" + assert ( + "SAS V5 XPT, Dataset-JSON" in stderr_text + or "SAS V5 XPT, Dataset-JSON" in stdout_text + ), "Expected format guidance in error message" + + +@pytest.mark.regression +def test_empty_folder_shows_helpful_error(): + with tempfile.TemporaryDirectory() as temp_dir: + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + assert process.returncode != 0, "Expected non-zero exit code for empty folder" + assert ( + "No valid dataset files found" in stderr_text + or "No valid dataset files found" in stdout_text + ), "Expected helpful error message for empty folder" + + +@pytest.mark.regression +def test_valid_xpt_files_work_normally(): + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-dp", + os.path.join("tests", "resources", "test_dataset.xpt"), + ] + + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + + assert "KeyError: 'XLSX'" not in stderr_text, "Should not show KeyError" + assert "Failed to execute script" not in stderr_text, "Should not crash" + assert ( + "No valid dataset files found" not in stderr_text + ), "Should find valid XPT file" + + +@pytest.mark.regression +def test_mixed_folder_processes_valid_files(): + with tempfile.TemporaryDirectory() as temp_dir: + valid_xpt = os.path.join("tests", "resources", "test_dataset.xpt") + if os.path.exists(valid_xpt): + shutil.copy(valid_xpt, os.path.join(temp_dir, "ae.xpt")) + + xlsx_file = os.path.join(temp_dir, "dm.xlsx") + open(xlsx_file, "w").close() + + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + assert "KeyError" not in stderr_text, "Should not crash on any files" + assert "Failed to execute script" not in stderr_text, "Should not crash" + + assert ( + "No valid dataset files found" not in stderr_text + and "No valid dataset files found" not in stdout_text + ), "Should find the valid XPT file"