From 966d2bd909d2d6bcecbb691070dbe93cb565b1d3 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 21 Oct 2025 17:34:38 -0400 Subject: [PATCH 1/6] #1387 clearer error when -d folder has no datasets --- README.md | 14 ++ .../data_services/local_data_service.py | 12 +- core.py | 45 ++++- scripts/list_dataset_metadata_handler.py | 16 ++ .../test_Issues/test_CoreIssue1387.py | 181 ++++++++++++++++++ 5 files changed, 261 insertions(+), 7 deletions(-) create mode 100644 tests/QARegressionTests/test_Issues/test_CoreIssue1387.py diff --git a/README.md b/README.md index 948654624..42a0383cc 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,20 @@ To validate a folder using rules for SDTM-IG version 3.4 use the following comma **_NOTE:_** Before running a validation in the CLI, you must first populate the cache with rules to validate against. See the update-cache command below. +#### Supported Dataset Formats + +CORE supports the following dataset file formats for validation: + +- **XPT** - SAS Transport Format (version 5) +- **JSON** - Dataset-JSON (CDISC standard format) +- **NDJSON** - Newline Delimited JSON datasets + +**Important Notes:** + +- XLSX files are used for report output templates but are **not supported** as input datasets for validation. +- Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset directory (`-d` or `-dp`). +- If you point to a folder containing unsupported file formats, CORE will display an error message indicating which formats are supported. + ##### **Validate single rule** `python core.py validate -s sdtmig -v 3-4 -dp -lr --meddra ./meddra/ --whodrug ./whodrug/` diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 0de48abdb..388b542a9 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -184,7 +184,17 @@ def read_metadata( DataFormatTypes.JSON.value: DatasetJSONMetadataReader, DataFormatTypes.NDJSON.value: DatasetNDJSONMetadataReader, } - contents_metadata = _metadata_reader_map[file_name.split(".")[1].upper()]( + + file_extension = file_name.split(".")[1].upper() + if file_extension not in _metadata_reader_map: + supported_formats = ", ".join(_metadata_reader_map.keys()) + raise ValueError( + f"Unsupported file format '{file_extension}' in file '{file_name}'.\n" + f"Supported formats: {supported_formats}\n" + f"Please provide dataset files in SAS V5 XPT or Dataset-JSON (JSON or NDJSON) format." + ) + + contents_metadata = _metadata_reader_map[file_extension]( file_metadata["path"], file_name ).read() return { diff --git a/core.py b/core.py index 58a6f0a1f..f3f439430 100644 --- a/core.py +++ b/core.py @@ -14,7 +14,6 @@ from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths from cdisc_rules_engine.enums.progress_parameter_options import ProgressParameterOptions from cdisc_rules_engine.enums.report_types import ReportTypes -from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.validation_args import Validation_args from scripts.run_validation import run_validation from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator @@ -31,20 +30,40 @@ from scripts.list_dataset_metadata_handler import list_dataset_metadata_handler from version import __version__ +# Formats supported for validation (must have metadata readers in local_data_service.py) +VALIDATION_SUPPORTED_FORMATS = ["XPT", "JSON", "NDJSON"] +# User-friendly format description +VALIDATION_FORMATS_MESSAGE = "SAS V5 XPT or Dataset-JSON (JSON or NDJSON)" + def valid_data_file(data_path: list) -> tuple[list, set]: - allowed_formats = [format.value for format in DataFormatTypes] + allowed_formats = VALIDATION_SUPPORTED_FORMATS found_formats = set() file_list = [] + ignored_files = [] + for file in data_path: file_extension = os.path.splitext(file)[1][1:].upper() if file_extension in allowed_formats: found_formats.add(file_extension) file_list.append(file) + elif file_extension: # Has an extension but not supported + ignored_files.append(os.path.basename(file)) + + # Log ignored files if any + if ignored_files: + logger = logging.getLogger("validator") + logger.warning( + f"Ignoring {len(ignored_files)} file(s) with unsupported formats: {', '.join(ignored_files[:5])}" + + ("..." if len(ignored_files) > 5 else "") + ) + if len(found_formats) > 1: return [], found_formats - elif len(found_formats) == 1: + elif len(found_formats) >= 1: return file_list, found_formats + else: + return [], set() @click.group() @@ -70,14 +89,14 @@ def cli(): "-d", "--data", required=False, - help="Path to directory containing data files", + help=f"Path to directory containing data files ({VALIDATION_FORMATS_MESSAGE})", ) @click.option( "-dp", "--dataset-path", required=False, multiple=True, - help="Absolute path to dataset file", + help=f"Absolute path to dataset file ({VALIDATION_FORMATS_MESSAGE})", ) @click.option( "-l", @@ -243,7 +262,7 @@ def cli(): ), ) @click.pass_context -def validate( +def validate( # noqa: C901 ctx, cache: str, pool_size: int, @@ -336,6 +355,13 @@ def validate( f"Argument --data contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 ) ctx.exit(2) + if not dataset_paths: + logger.error( + f"No valid dataset files found in directory: {data}\n" + f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" + f"Please ensure your directory contains files in one of these formats." + ) + ctx.exit(2) elif dataset_path: dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path]) if len(found_formats) > 1: @@ -343,6 +369,13 @@ def validate( f"Argument --dataset-path contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 ) ctx.exit(2) + if not dataset_paths: + logger.error( + f"No valid dataset files provided.\n" + f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" + f"Please ensure your files are in one of these formats." + ) + ctx.exit(2) else: logger.error( "You must pass one of the following arguments: --dataset-path, --data" diff --git a/scripts/list_dataset_metadata_handler.py b/scripts/list_dataset_metadata_handler.py index 94b316126..1ad1d7b44 100644 --- a/scripts/list_dataset_metadata_handler.py +++ b/scripts/list_dataset_metadata_handler.py @@ -30,6 +30,22 @@ def list_dataset_metadata_handler(dataset_paths: Tuple[str]) -> List[dict]: ... ] """ + # Validate file formats before processing + supported_formats = ["XPT", "JSON", "NDJSON"] + invalid_files = [] + + for path in dataset_paths: + file_ext = path.split(".")[-1].upper() + if file_ext not in supported_formats: + invalid_files.append((path, file_ext)) + + if invalid_files: + error_msg = "Unsupported file format(s) detected:\n" + for file, ext in invalid_files: + error_msg += f" - {file} (format: {ext})\n" + error_msg += "\nSupported formats: SAS V5 XPT or Dataset-JSON (JSON or NDJSON)" + raise ValueError(error_msg) + cache_service = CacheServiceFactory(config).get_service() data_service = DataServiceFactory(config, cache_service).get_service() metadata: List[SDTMDatasetMetadata] = [ diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py new file mode 100644 index 000000000..84c06209c --- /dev/null +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py @@ -0,0 +1,181 @@ +import subprocess +import os +import tempfile +import pytest +from conftest import get_python_executable + +""" +Test for Issue #1387: The error message shown when the -d switch points +to a folder without datasets is not user-friendly. + +These tests validate that: +1. Folders with only unsupported formats (XLSX) show helpful error messages +2. Empty folders show helpful error messages +3. Valid XPT files continue to work normally +4. Mixed folders (valid + invalid) process valid files with warnings +""" + + +@pytest.mark.regression +def test_folder_with_xlsx_files_shows_helpful_error(): + """Test that a folder with only XLSX files shows a user-friendly error message""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create dummy XLSX files + xlsx_file1 = os.path.join(temp_dir, "ae.xlsx") + xlsx_file2 = os.path.join(temp_dir, "dm.xlsx") + + # Create empty XLSX files + open(xlsx_file1, "w").close() + open(xlsx_file2, "w").close() + + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + # Should exit with error code + assert process.returncode != 0, "Expected non-zero exit code for invalid files" + + # Should NOT contain the old KeyError crash + assert "KeyError: 'XLSX'" not in stderr_text, "Should not show KeyError crash" + assert ( + "Failed to execute script" not in stderr_text + ), "Should not show script execution failure" + + # Should contain helpful error message + assert ( + "No valid dataset files found" in stderr_text + or "No valid dataset files found" in stdout_text + ), f"Expected helpful error message. stderr: {stderr_text}, stdout: {stdout_text}" + assert ( + "SAS V5 XPT or Dataset-JSON" in stderr_text + or "SAS V5 XPT or Dataset-JSON" in stdout_text + ), "Expected format guidance in error message" + + +@pytest.mark.regression +def test_empty_folder_shows_helpful_error(): + """Test that an empty folder shows a user-friendly error message""" + with tempfile.TemporaryDirectory() as temp_dir: + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + # Should exit with error code + assert process.returncode != 0, "Expected non-zero exit code for empty folder" + + # Should contain helpful error message + assert ( + "No valid dataset files found" in stderr_text + or "No valid dataset files found" in stdout_text + ), "Expected helpful error message for empty folder" + + +@pytest.mark.regression +def test_valid_xpt_files_work_normally(): + """Test that folders with valid XPT files continue to work normally""" + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-dp", + os.path.join("tests", "resources", "test_dataset.xpt"), + ] + + process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + + # Should succeed or fail validation (but not crash with format error) + assert "KeyError: 'XLSX'" not in stderr_text, "Should not show KeyError" + assert "Failed to execute script" not in stderr_text, "Should not crash" + assert ( + "No valid dataset files found" not in stderr_text + ), "Should find valid XPT file" + + +@pytest.mark.regression +def test_mixed_folder_processes_valid_files(): + """Test that a folder with both valid and invalid files processes the valid ones""" + with tempfile.TemporaryDirectory() as temp_dir: + # Copy a valid test XPT file to temp directory + import shutil + + valid_xpt = os.path.join("tests", "resources", "test_dataset.xpt") + if os.path.exists(valid_xpt): + shutil.copy(valid_xpt, os.path.join(temp_dir, "ae.xpt")) + + # Create an invalid XLSX file + xlsx_file = os.path.join(temp_dir, "dm.xlsx") + open(xlsx_file, "w").close() + + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + # Should process successfully (found the XPT file) + assert "KeyError: 'XLSX'" not in stderr_text, "Should not crash on XLSX" + assert "Failed to execute script" not in stderr_text, "Should not crash" + + # May contain warning about ignored files (optional - our enhancement) + # But should NOT fail completely + assert ( + "No valid dataset files found" not in stderr_text + and "No valid dataset files found" not in stdout_text + ), "Should find the valid XPT file" From 5ba6703b934142925a76c2fe8925a55591297527 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 28 Oct 2025 10:27:31 -0400 Subject: [PATCH 2/6] crashes to helpful troubleshooting guidance fix --- core.py | 5 +-- scripts/list_dataset_metadata_handler.py | 28 ++----------- .../test_Issues/test_CoreIssue1387.py | 39 ++----------------- 3 files changed, 7 insertions(+), 65 deletions(-) diff --git a/core.py b/core.py index f3f439430..1433668ef 100644 --- a/core.py +++ b/core.py @@ -30,9 +30,7 @@ from scripts.list_dataset_metadata_handler import list_dataset_metadata_handler from version import __version__ -# Formats supported for validation (must have metadata readers in local_data_service.py) VALIDATION_SUPPORTED_FORMATS = ["XPT", "JSON", "NDJSON"] -# User-friendly format description VALIDATION_FORMATS_MESSAGE = "SAS V5 XPT or Dataset-JSON (JSON or NDJSON)" @@ -47,10 +45,9 @@ def valid_data_file(data_path: list) -> tuple[list, set]: if file_extension in allowed_formats: found_formats.add(file_extension) file_list.append(file) - elif file_extension: # Has an extension but not supported + elif file_extension: ignored_files.append(os.path.basename(file)) - # Log ignored files if any if ignored_files: logger = logging.getLogger("validator") logger.warning( diff --git a/scripts/list_dataset_metadata_handler.py b/scripts/list_dataset_metadata_handler.py index 1ad1d7b44..6c51f8f79 100644 --- a/scripts/list_dataset_metadata_handler.py +++ b/scripts/list_dataset_metadata_handler.py @@ -8,35 +8,13 @@ def list_dataset_metadata_handler(dataset_paths: Tuple[str]) -> List[dict]: - """ - Lists metadata of given datasets like: - [ - { - "domain":"AE", - "filename":"ae.xpt", - "full_path":"/Users/Aleksei_Furmenkov/PycharmProjects/cdisc-rules-engine/resources/data/ae.xpt", - "file_size":"38000", - "label":"Adverse Events", - "modification_date":"2020-08-21T09:14:26" - }, - { - "domain":"EX", - "filename":"ex.xpt", - "full_path":"/Users/Aleksei_Furmenkov/PycharmProjects/cdisc-rules-engine/resources/data/ex.xpt", - "file_size":"78050", - "label":"Exposure", - "modification_date":"2021-09-17T09:23:22" - }, - ... - ] - """ - # Validate file formats before processing - supported_formats = ["XPT", "JSON", "NDJSON"] + from core import VALIDATION_SUPPORTED_FORMATS + invalid_files = [] for path in dataset_paths: file_ext = path.split(".")[-1].upper() - if file_ext not in supported_formats: + if file_ext not in VALIDATION_SUPPORTED_FORMATS: invalid_files.append((path, file_ext)) if invalid_files: diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py index 84c06209c..634e4c215 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py @@ -4,29 +4,14 @@ import pytest from conftest import get_python_executable -""" -Test for Issue #1387: The error message shown when the -d switch points -to a folder without datasets is not user-friendly. - -These tests validate that: -1. Folders with only unsupported formats (XLSX) show helpful error messages -2. Empty folders show helpful error messages -3. Valid XPT files continue to work normally -4. Mixed folders (valid + invalid) process valid files with warnings -""" - @pytest.mark.regression def test_folder_with_xlsx_files_shows_helpful_error(): - """Test that a folder with only XLSX files shows a user-friendly error message""" with tempfile.TemporaryDirectory() as temp_dir: - # Create dummy XLSX files - xlsx_file1 = os.path.join(temp_dir, "ae.xlsx") - xlsx_file2 = os.path.join(temp_dir, "dm.xlsx") + xlsx_files = ["ae.xlsx", "dm.xlsx"] - # Create empty XLSX files - open(xlsx_file1, "w").close() - open(xlsx_file2, "w").close() + for filename in xlsx_files: + open(os.path.join(temp_dir, filename), "w").close() command = [ get_python_executable(), @@ -49,16 +34,11 @@ def test_folder_with_xlsx_files_shows_helpful_error(): stderr_text = stderr.decode() stdout_text = stdout.decode() - # Should exit with error code assert process.returncode != 0, "Expected non-zero exit code for invalid files" - - # Should NOT contain the old KeyError crash assert "KeyError: 'XLSX'" not in stderr_text, "Should not show KeyError crash" assert ( "Failed to execute script" not in stderr_text ), "Should not show script execution failure" - - # Should contain helpful error message assert ( "No valid dataset files found" in stderr_text or "No valid dataset files found" in stdout_text @@ -71,7 +51,6 @@ def test_folder_with_xlsx_files_shows_helpful_error(): @pytest.mark.regression def test_empty_folder_shows_helpful_error(): - """Test that an empty folder shows a user-friendly error message""" with tempfile.TemporaryDirectory() as temp_dir: command = [ get_python_executable(), @@ -94,10 +73,7 @@ def test_empty_folder_shows_helpful_error(): stderr_text = stderr.decode() stdout_text = stdout.decode() - # Should exit with error code assert process.returncode != 0, "Expected non-zero exit code for empty folder" - - # Should contain helpful error message assert ( "No valid dataset files found" in stderr_text or "No valid dataset files found" in stdout_text @@ -106,7 +82,6 @@ def test_empty_folder_shows_helpful_error(): @pytest.mark.regression def test_valid_xpt_files_work_normally(): - """Test that folders with valid XPT files continue to work normally""" command = [ get_python_executable(), "-m", @@ -125,7 +100,6 @@ def test_valid_xpt_files_work_normally(): stderr_text = stderr.decode() - # Should succeed or fail validation (but not crash with format error) assert "KeyError: 'XLSX'" not in stderr_text, "Should not show KeyError" assert "Failed to execute script" not in stderr_text, "Should not crash" assert ( @@ -135,16 +109,13 @@ def test_valid_xpt_files_work_normally(): @pytest.mark.regression def test_mixed_folder_processes_valid_files(): - """Test that a folder with both valid and invalid files processes the valid ones""" with tempfile.TemporaryDirectory() as temp_dir: - # Copy a valid test XPT file to temp directory import shutil valid_xpt = os.path.join("tests", "resources", "test_dataset.xpt") if os.path.exists(valid_xpt): shutil.copy(valid_xpt, os.path.join(temp_dir, "ae.xpt")) - # Create an invalid XLSX file xlsx_file = os.path.join(temp_dir, "dm.xlsx") open(xlsx_file, "w").close() @@ -169,12 +140,8 @@ def test_mixed_folder_processes_valid_files(): stderr_text = stderr.decode() stdout_text = stdout.decode() - # Should process successfully (found the XPT file) assert "KeyError: 'XLSX'" not in stderr_text, "Should not crash on XLSX" assert "Failed to execute script" not in stderr_text, "Should not crash" - - # May contain warning about ignored files (optional - our enhancement) - # But should NOT fail completely assert ( "No valid dataset files found" not in stderr_text and "No valid dataset files found" not in stdout_text From 4b4b5da0c7b6813e9383f34a87a9e5faf4c29ba1 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 28 Oct 2025 17:57:34 -0400 Subject: [PATCH 3/6] add XLSX support, use DataFormatTypes enum, and refactor validate function complexity --- README.md | 2 +- core.py | 123 +++++++++++++----- scripts/list_dataset_metadata_handler.py | 34 ++++- .../test_Issues/test_CoreIssue1387.py | 103 ++++++++++++++- 4 files changed, 217 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 42a0383cc..958b4d98c 100644 --- a/README.md +++ b/README.md @@ -236,10 +236,10 @@ CORE supports the following dataset file formats for validation: - **XPT** - SAS Transport Format (version 5) - **JSON** - Dataset-JSON (CDISC standard format) - **NDJSON** - Newline Delimited JSON datasets +- **XLSX** - Excel format (Microsoft Excel files) **Important Notes:** -- XLSX files are used for report output templates but are **not supported** as input datasets for validation. - Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset directory (`-d` or `-dp`). - If you point to a folder containing unsupported file formats, CORE will display an error message indicating which formats are supported. diff --git a/core.py b/core.py index 08db616b1..af441a835 100644 --- a/core.py +++ b/core.py @@ -27,15 +27,22 @@ generate_report_filename, get_rules_cache_key, ) +from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from scripts.list_dataset_metadata_handler import list_dataset_metadata_handler from version import __version__ -VALIDATION_SUPPORTED_FORMATS = ["XPT", "JSON", "NDJSON"] -VALIDATION_FORMATS_MESSAGE = "SAS V5 XPT or Dataset-JSON (JSON or NDJSON)" +VALIDATION_FORMATS_MESSAGE = ( + "SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)" +) def valid_data_file(data_path: list) -> tuple[list, set]: - allowed_formats = VALIDATION_SUPPORTED_FORMATS + allowed_formats = [ + DataFormatTypes.XPT.value, + DataFormatTypes.JSON.value, + DataFormatTypes.NDJSON.value, + DataFormatTypes.XLSX.value, + ] found_formats = set() file_list = [] ignored_files = [] @@ -55,7 +62,11 @@ def valid_data_file(data_path: list) -> tuple[list, set]: + ("..." if len(ignored_files) > 5 else "") ) - if len(found_formats) > 1: + if "XLSX" in found_formats and len(found_formats) > 1: + return [], found_formats + elif "XLSX" in found_formats and len(file_list) > 1: + return [], found_formats + elif len(found_formats) > 1: return [], found_formats elif len(found_formats) >= 1: return file_list, found_formats @@ -260,8 +271,74 @@ def cli(): "If true, limits reported issues per dataset per rule." ), ) +def _validate_data_directory(data: str, logger) -> tuple[list, set]: + """Validate data directory and return dataset paths and found formats.""" + dataset_paths, found_formats = valid_data_file( + [str(Path(data).joinpath(fn)) for fn in os.listdir(data)] + ) + + if len(found_formats) > 1: + logger.error( + f"Argument --data contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 + ) + return None, None + + if not dataset_paths: + if "XLSX" in found_formats and len(found_formats) == 1: + logger.error( + f"Multiple XLSX files found in directory: {data}\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + else: + logger.error( + f"No valid dataset files found in directory: {data}\n" + f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" + f"Please ensure your directory contains files in one of these formats." + ) + return None, None + + return dataset_paths, found_formats + + +def _validate_dataset_paths(dataset_path: tuple[str], logger) -> tuple[list, set]: + """Validate dataset paths and return dataset paths and found formats.""" + dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path]) + + if len(found_formats) > 1: + logger.error( + f"Argument --dataset-path contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 + ) + return None, None + + if not dataset_paths: + if "XLSX" in found_formats and len(found_formats) == 1: + logger.error( + f"Multiple XLSX files provided.\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + else: + logger.error( + f"No valid dataset files provided.\n" + f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" + f"Please ensure your files are in one of these formats." + ) + return None, None + + return dataset_paths, found_formats + + +def _validate_no_arguments(logger) -> None: + """Validate that at least one dataset argument is provided.""" + logger.error("You must pass one of the following arguments: --dataset-path, --data") + + +@click.command() @click.pass_context -def validate( # noqa: C901 +def validate( ctx, cache: str, pool_size: int, @@ -340,46 +417,22 @@ def validate( # noqa: C901 }, } ) + # Validate dataset arguments if data: if dataset_path: logger.error( "Argument --dataset-path cannot be used together with argument --data" ) ctx.exit(2) - dataset_paths, found_formats = valid_data_file( - [str(Path(data).joinpath(fn)) for fn in os.listdir(data)] - ) - if len(found_formats) > 1: - logger.error( - f"Argument --data contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 - ) - ctx.exit(2) - if not dataset_paths: - logger.error( - f"No valid dataset files found in directory: {data}\n" - f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" - f"Please ensure your directory contains files in one of these formats." - ) + dataset_paths, found_formats = _validate_data_directory(data, logger) + if dataset_paths is None: ctx.exit(2) elif dataset_path: - dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path]) - if len(found_formats) > 1: - logger.error( - f"Argument --dataset-path contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 - ) - ctx.exit(2) - if not dataset_paths: - logger.error( - f"No valid dataset files provided.\n" - f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" - f"Please ensure your files are in one of these formats." - ) + dataset_paths, found_formats = _validate_dataset_paths(dataset_path, logger) + if dataset_paths is None: ctx.exit(2) else: - logger.error( - "You must pass one of the following arguments: --dataset-path, --data" - ) - # no need to define dataset_paths here, the program execution will stop + _validate_no_arguments(logger) ctx.exit(2) validate_xml_bool = True if validate_xml.lower() in ("y", "yes") else False run_validation( diff --git a/scripts/list_dataset_metadata_handler.py b/scripts/list_dataset_metadata_handler.py index 6c51f8f79..053713a5d 100644 --- a/scripts/list_dataset_metadata_handler.py +++ b/scripts/list_dataset_metadata_handler.py @@ -1,6 +1,7 @@ from typing import Tuple, List from cdisc_rules_engine.config import config +from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.serializers import DatasetMetadataSerializer from cdisc_rules_engine.services.cache import CacheServiceFactory @@ -8,20 +9,45 @@ def list_dataset_metadata_handler(dataset_paths: Tuple[str]) -> List[dict]: - from core import VALIDATION_SUPPORTED_FORMATS - + """ + Lists metadata of given datasets like: + [ + { + "domain":"AE", + "filename":"ae.xpt", + "full_path":"/Users/Aleksei_Furmenkov/PycharmProjects/cdisc-rules-engine/resources/data/ae.xpt", + "file_size":"38000", + "label":"Adverse Events", + "modification_date":"2020-08-21T09:14:26" + }, + { + "domain":"EX", + "filename":"ex.xpt", + "full_path":"/Users/Aleksei_Furmenkov/PycharmProjects/cdisc-rules-engine/resources/data/ex.xpt", + "file_size":"78050", + "label":"Exposure", + "modification_date":"2021-09-17T09:23:22" + }, + ... + ] + """ invalid_files = [] for path in dataset_paths: file_ext = path.split(".")[-1].upper() - if file_ext not in VALIDATION_SUPPORTED_FORMATS: + if file_ext not in [ + DataFormatTypes.XPT.value, + DataFormatTypes.JSON.value, + DataFormatTypes.NDJSON.value, + DataFormatTypes.XLSX.value, + ]: invalid_files.append((path, file_ext)) if invalid_files: error_msg = "Unsupported file format(s) detected:\n" for file, ext in invalid_files: error_msg += f" - {file} (format: {ext})\n" - error_msg += "\nSupported formats: SAS V5 XPT or Dataset-JSON (JSON or NDJSON)" + error_msg += "\nSupported formats: SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)" raise ValueError(error_msg) cache_service = CacheServiceFactory(config).get_service() diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py index 634e4c215..2e3a348bd 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py @@ -6,7 +6,8 @@ @pytest.mark.regression -def test_folder_with_xlsx_files_shows_helpful_error(): +def test_multiple_xlsx_files_shows_helpful_error(): + """Test that multiple XLSX files show a helpful error message about single file limitation""" with tempfile.TemporaryDirectory() as temp_dir: xlsx_files = ["ae.xlsx", "dm.xlsx"] @@ -34,18 +35,109 @@ def test_folder_with_xlsx_files_shows_helpful_error(): stderr_text = stderr.decode() stdout_text = stdout.decode() - assert process.returncode != 0, "Expected non-zero exit code for invalid files" + assert ( + process.returncode != 0 + ), "Expected non-zero exit code for multiple XLSX files" + assert "KeyError" not in stderr_text, "Should not show KeyError crash" + assert ( + "Failed to execute script" not in stderr_text + ), "Should not show script execution failure" + assert ( + "Multiple XLSX files found" in stderr_text + or "Multiple XLSX files found" in stdout_text + ), f"Expected helpful error message about multiple XLSX files. stderr: {stderr_text}, stdout: {stdout_text}" + assert ( + "Excel format (XLSX) validation only supports single files" in stderr_text + or "Excel format (XLSX) validation only supports single files" + in stdout_text + ), "Expected explanation of XLSX limitation" + + +@pytest.mark.regression +def test_folder_with_xlsx_files_works_with_excel_service(): + """Test that a folder with a single XLSX file now works with ExcelDataService""" + with tempfile.TemporaryDirectory() as temp_dir: + xlsx_files = ["ae.xlsx"] + + for filename in xlsx_files: + open(os.path.join(temp_dir, filename), "w").close() + + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + assert "KeyError: 'XLSX'" not in stderr_text, "Should not show KeyError crash" assert ( "Failed to execute script" not in stderr_text ), "Should not show script execution failure" + + assert ( + "No valid dataset files found" not in stderr_text + and "No valid dataset files found" not in stdout_text + ), "XLSX files should now be recognized as valid" + + +@pytest.mark.regression +def test_folder_with_unsupported_formats_shows_helpful_error(): + """Test that folders with truly unsupported formats (like PDF) show helpful error messages""" + with tempfile.TemporaryDirectory() as temp_dir: + unsupported_files = ["ae.pdf", "dm.txt"] + + for filename in unsupported_files: + open(os.path.join(temp_dir, filename), "w").close() + + command = [ + get_python_executable(), + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-v", + "3.4", + "-d", + temp_dir, + ] + + process = subprocess.Popen( + command, stdout=subprocess.PIPE, stderr=subprocess.PIPE + ) + stdout, stderr = process.communicate() + + stderr_text = stderr.decode() + stdout_text = stdout.decode() + + assert ( + process.returncode != 0 + ), "Expected non-zero exit code for unsupported files" + assert "KeyError" not in stderr_text, "Should not show KeyError crash" + assert ( + "Failed to execute script" not in stderr_text + ), "Should not show script execution failure" assert ( "No valid dataset files found" in stderr_text or "No valid dataset files found" in stdout_text ), f"Expected helpful error message. stderr: {stderr_text}, stdout: {stdout_text}" assert ( - "SAS V5 XPT or Dataset-JSON" in stderr_text - or "SAS V5 XPT or Dataset-JSON" in stdout_text + "SAS V5 XPT, Dataset-JSON" in stderr_text + or "SAS V5 XPT, Dataset-JSON" in stdout_text ), "Expected format guidance in error message" @@ -140,8 +232,9 @@ def test_mixed_folder_processes_valid_files(): stderr_text = stderr.decode() stdout_text = stdout.decode() - assert "KeyError: 'XLSX'" not in stderr_text, "Should not crash on XLSX" + assert "KeyError" not in stderr_text, "Should not crash on any files" assert "Failed to execute script" not in stderr_text, "Should not crash" + assert ( "No valid dataset files found" not in stderr_text and "No valid dataset files found" not in stdout_text From 040c51b43bfb21b69c04687667fabfb7fb4887fd Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 28 Oct 2025 19:42:17 -0400 Subject: [PATCH 4/6] CI fixes --- core.py | 152 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 79 insertions(+), 73 deletions(-) diff --git a/core.py b/core.py index af441a835..e88966124 100644 --- a/core.py +++ b/core.py @@ -62,13 +62,14 @@ def valid_data_file(data_path: list) -> tuple[list, set]: + ("..." if len(ignored_files) > 5 else "") ) - if "XLSX" in found_formats and len(found_formats) > 1: - return [], found_formats - elif "XLSX" in found_formats and len(file_list) > 1: - return [], found_formats - elif len(found_formats) > 1: - return [], found_formats - elif len(found_formats) >= 1: + if "XLSX" in found_formats: + if len(found_formats) > 1: + return [], found_formats + elif len(file_list) > 1: + return [], found_formats + else: + return file_list, found_formats + if len(found_formats) >= 1: return file_list, found_formats else: return [], set() @@ -79,6 +80,77 @@ def cli(): pass +def _validate_data_directory(data: str, logger) -> tuple[list, set]: + """Validate data directory and return dataset paths and found formats.""" + dataset_paths, found_formats = valid_data_file( + [str(Path(data).joinpath(fn)) for fn in os.listdir(data)] + ) + + if "XLSX" in found_formats and len(found_formats) > 1: + logger.error( + f"Argument --data contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + return None, None + + if not dataset_paths: + if "XLSX" in found_formats and len(found_formats) == 1: + logger.error( + f"Multiple XLSX files found in directory: {data}\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + else: + logger.error( + f"No valid dataset files found in directory: {data}\n" + f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" + f"Please ensure your directory contains files in one of these formats." + ) + return None, None + + return dataset_paths, found_formats + + +def _validate_dataset_paths(dataset_path: tuple[str], logger) -> tuple[list, set]: + """Validate dataset paths and return dataset paths and found formats.""" + dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path]) + + if "XLSX" in found_formats and len(found_formats) > 1: + logger.error( + f"Argument --dataset-path contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + return None, None + + if not dataset_paths: + if "XLSX" in found_formats and len(found_formats) == 1: + logger.error( + f"Multiple XLSX files provided.\n" + f"Excel format (XLSX) validation only supports single files.\n" + f"Please provide either a single XLSX file or use other supported formats: " + f"{VALIDATION_FORMATS_MESSAGE}" + ) + else: + logger.error( + f"No valid dataset files provided.\n" + f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" + f"Please ensure your files are in one of these formats." + ) + return None, None + + return dataset_paths, found_formats + + +def _validate_no_arguments(logger) -> None: + """Validate that at least one dataset argument is provided.""" + logger.error("You must pass one of the following arguments: --dataset-path, --data") + + @click.command() @click.option( "-ca", @@ -271,72 +343,6 @@ def cli(): "If true, limits reported issues per dataset per rule." ), ) -def _validate_data_directory(data: str, logger) -> tuple[list, set]: - """Validate data directory and return dataset paths and found formats.""" - dataset_paths, found_formats = valid_data_file( - [str(Path(data).joinpath(fn)) for fn in os.listdir(data)] - ) - - if len(found_formats) > 1: - logger.error( - f"Argument --data contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 - ) - return None, None - - if not dataset_paths: - if "XLSX" in found_formats and len(found_formats) == 1: - logger.error( - f"Multiple XLSX files found in directory: {data}\n" - f"Excel format (XLSX) validation only supports single files.\n" - f"Please provide either a single XLSX file or use other supported formats: " - f"{VALIDATION_FORMATS_MESSAGE}" - ) - else: - logger.error( - f"No valid dataset files found in directory: {data}\n" - f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" - f"Please ensure your directory contains files in one of these formats." - ) - return None, None - - return dataset_paths, found_formats - - -def _validate_dataset_paths(dataset_path: tuple[str], logger) -> tuple[list, set]: - """Validate dataset paths and return dataset paths and found formats.""" - dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path]) - - if len(found_formats) > 1: - logger.error( - f"Argument --dataset-path contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501 - ) - return None, None - - if not dataset_paths: - if "XLSX" in found_formats and len(found_formats) == 1: - logger.error( - f"Multiple XLSX files provided.\n" - f"Excel format (XLSX) validation only supports single files.\n" - f"Please provide either a single XLSX file or use other supported formats: " - f"{VALIDATION_FORMATS_MESSAGE}" - ) - else: - logger.error( - f"No valid dataset files provided.\n" - f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n" - f"Please ensure your files are in one of these formats." - ) - return None, None - - return dataset_paths, found_formats - - -def _validate_no_arguments(logger) -> None: - """Validate that at least one dataset argument is provided.""" - logger.error("You must pass one of the following arguments: --dataset-path, --data") - - -@click.command() @click.pass_context def validate( ctx, From 15a98a2823dba9c207aa856851f5f2a3cdaab59d Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 29 Oct 2025 08:25:27 -0400 Subject: [PATCH 5/6] local data service error update --- cdisc_rules_engine/services/data_services/local_data_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 388b542a9..ace053e2d 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -191,7 +191,7 @@ def read_metadata( raise ValueError( f"Unsupported file format '{file_extension}' in file '{file_name}'.\n" f"Supported formats: {supported_formats}\n" - f"Please provide dataset files in SAS V5 XPT or Dataset-JSON (JSON or NDJSON) format." + f"Please provide dataset files in SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX) format." ) contents_metadata = _metadata_reader_map[file_extension]( From 8a87c1b07ddc072e09d5e99d33dcbb4080c4ccbc Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 5 Nov 2025 10:32:19 -0500 Subject: [PATCH 6/6] use DataFormatTypes.XLSX.value enum and move shutil import to top --- core.py | 12 ++++++------ .../test_Issues/test_CoreIssue1387.py | 3 +-- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/core.py b/core.py index 65391ec97..5dfc84bcb 100644 --- a/core.py +++ b/core.py @@ -62,7 +62,7 @@ def valid_data_file(data_path: list) -> tuple[list, set]: + ("..." if len(ignored_files) > 5 else "") ) - if "XLSX" in found_formats: + if DataFormatTypes.XLSX.value in found_formats: if len(found_formats) > 1: return [], found_formats elif len(file_list) > 1: @@ -83,10 +83,10 @@ def cli(): def _validate_data_directory(data: str, logger) -> tuple[list, set]: """Validate data directory and return dataset paths and found formats.""" dataset_paths, found_formats = valid_data_file( - [str(Path(data).joinpath(fn)) for fn in os.listdir(data)] + [str(p) for p in Path(data).rglob("*") if p.is_file()] ) - if "XLSX" in found_formats and len(found_formats) > 1: + if DataFormatTypes.XLSX.value in found_formats and len(found_formats) > 1: logger.error( f"Argument --data contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n" f"Excel format (XLSX) validation only supports single files.\n" @@ -96,7 +96,7 @@ def _validate_data_directory(data: str, logger) -> tuple[list, set]: return None, None if not dataset_paths: - if "XLSX" in found_formats and len(found_formats) == 1: + if DataFormatTypes.XLSX.value in found_formats and len(found_formats) == 1: logger.error( f"Multiple XLSX files found in directory: {data}\n" f"Excel format (XLSX) validation only supports single files.\n" @@ -118,7 +118,7 @@ def _validate_dataset_paths(dataset_path: tuple[str], logger) -> tuple[list, set """Validate dataset paths and return dataset paths and found formats.""" dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path]) - if "XLSX" in found_formats and len(found_formats) > 1: + if DataFormatTypes.XLSX.value in found_formats and len(found_formats) > 1: logger.error( f"Argument --dataset-path contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n" f"Excel format (XLSX) validation only supports single files.\n" @@ -128,7 +128,7 @@ def _validate_dataset_paths(dataset_path: tuple[str], logger) -> tuple[list, set return None, None if not dataset_paths: - if "XLSX" in found_formats and len(found_formats) == 1: + if DataFormatTypes.XLSX.value in found_formats and len(found_formats) == 1: logger.error( f"Multiple XLSX files provided.\n" f"Excel format (XLSX) validation only supports single files.\n" diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py index 2e3a348bd..3c94be9e9 100644 --- a/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1387.py @@ -1,6 +1,7 @@ import subprocess import os import tempfile +import shutil import pytest from conftest import get_python_executable @@ -202,8 +203,6 @@ def test_valid_xpt_files_work_normally(): @pytest.mark.regression def test_mixed_folder_processes_valid_files(): with tempfile.TemporaryDirectory() as temp_dir: - import shutil - valid_xpt = os.path.join("tests", "resources", "test_dataset.xpt") if os.path.exists(valid_xpt): shutil.copy(valid_xpt, os.path.join(temp_dir, "ae.xpt"))