Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,20 @@ To validate a folder using rules for SDTM-IG version 3.4 use the following comma

**_NOTE:_** Before running a validation in the CLI, you must first populate the cache with rules to validate against. See the update-cache command below.

#### Supported Dataset Formats

CORE supports the following dataset file formats for validation:

- **XPT** - SAS Transport Format (version 5)
- **JSON** - Dataset-JSON (CDISC standard format)
- **NDJSON** - Newline Delimited JSON datasets
- **XLSX** - Excel format (Microsoft Excel files)

**Important Notes:**

- Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset arguments (`-d`/`--data` for a directory or `-dp`/`--dataset-path` for individual files).
- If you point to a folder that contains no files in a supported format, CORE will display an error message listing the supported formats. Unsupported files found alongside supported ones are skipped with a warning.

##### **Validate single rule**

`python core.py validate -s sdtmig -v 3-4 -dp <path to dataset json file> -lr <path to rule json file> --meddra ./meddra/ --whodrug ./whodrug/`
Expand Down
12 changes: 11 additions & 1 deletion cdisc_rules_engine/services/data_services/local_data_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,17 @@ def read_metadata(
DataFormatTypes.JSON.value: DatasetJSONMetadataReader,
DataFormatTypes.NDJSON.value: DatasetNDJSONMetadataReader,
}
contents_metadata = _metadata_reader_map[file_name.split(".")[1].upper()](

file_extension = file_name.split(".")[1].upper()
if file_extension not in _metadata_reader_map:
supported_formats = ", ".join(_metadata_reader_map.keys())
raise ValueError(
f"Unsupported file format '{file_extension}' in file '{file_name}'.\n"
f"Supported formats: {supported_formats}\n"
f"Please provide dataset files in SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX) format."
)

contents_metadata = _metadata_reader_map[file_extension](
file_metadata["path"], file_name
).read()
return {
Expand Down
135 changes: 112 additions & 23 deletions core.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
from cdisc_rules_engine.enums.default_file_paths import DefaultFilePaths
from cdisc_rules_engine.enums.progress_parameter_options import ProgressParameterOptions
from cdisc_rules_engine.enums.report_types import ReportTypes
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
from cdisc_rules_engine.models.validation_args import Validation_args
from scripts.run_validation import run_validation
from cdisc_rules_engine.services.cache.cache_populator_service import CachePopulator
Expand All @@ -28,30 +27,130 @@
generate_report_filename,
get_rules_cache_key,
)
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
from scripts.list_dataset_metadata_handler import list_dataset_metadata_handler
from version import __version__

# Human-readable list of dataset formats accepted for validation,
# interpolated into CLI help text and error messages.
VALIDATION_FORMATS_MESSAGE = "SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)"


def valid_data_file(data_path: list) -> tuple[list, set]:
    """Filter *data_path* down to files in a supported dataset format.

    Args:
        data_path: candidate file paths (absolute or relative).

    Returns:
        Tuple of (accepted file paths, set of uppercase extensions found).
        The file list is empty when no supported files were found, or when
        the XLSX single-file constraint is violated (XLSX mixed with other
        formats, or more than one XLSX file); in those cases the returned
        format set lets callers report a specific error.
    """
    allowed_formats = [
        DataFormatTypes.XPT.value,
        DataFormatTypes.JSON.value,
        DataFormatTypes.NDJSON.value,
        DataFormatTypes.XLSX.value,
    ]
    found_formats = set()
    file_list = []
    ignored_files = []

    for file in data_path:
        # splitext keeps the leading dot; strip it and normalize case.
        file_extension = os.path.splitext(file)[1][1:].upper()
        if file_extension in allowed_formats:
            found_formats.add(file_extension)
            file_list.append(file)
        elif file_extension:
            # Unsupported extension: collect so we can warn once below.
            ignored_files.append(os.path.basename(file))

    if ignored_files:
        logger = logging.getLogger("validator")
        # Cap the listing at five names to keep the warning readable.
        logger.warning(
            f"Ignoring {len(ignored_files)} file(s) with unsupported formats: {', '.join(ignored_files[:5])}"
            + ("..." if len(ignored_files) > 5 else "")
        )

    # Excel validation only supports a single file: reject mixed formats
    # and multi-XLSX inputs, returning the formats so callers can say why.
    if DataFormatTypes.XLSX.value in found_formats:
        if len(found_formats) > 1:
            return [], found_formats
        elif len(file_list) > 1:
            return [], found_formats
        else:
            return file_list, found_formats
    if len(found_formats) >= 1:
        return file_list, found_formats
    else:
        return [], set()


@click.group()
def cli():
    # Root Click command group for the CORE CLI; subcommands (e.g. the
    # `validate` command below) are registered onto this entry point.
    # Intentionally empty: a group body runs before any subcommand, and
    # no shared setup is needed here. Left as a bare comment (not a
    # docstring) so click's generated help text is unchanged.
    pass


def _validate_data_directory(data: str, logger) -> tuple[list, set]:
    """Validate data directory and return dataset paths and found formats.

    Logs a descriptive error and returns ``(None, None)`` when the
    directory violates the XLSX single-file constraint or contains no
    supported dataset files.
    """
    candidate_files = [
        str(entry) for entry in Path(data).rglob("*") if entry.is_file()
    ]
    dataset_paths, found_formats = valid_data_file(candidate_files)
    has_xlsx = DataFormatTypes.XLSX.value in found_formats

    # XLSX mixed with any other format is rejected outright.
    if has_xlsx and len(found_formats) > 1:
        logger.error(
            f"Argument --data contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n"
            f"Excel format (XLSX) validation only supports single files.\n"
            f"Please provide either a single XLSX file or use other supported formats: "
            f"{VALIDATION_FORMATS_MESSAGE}"
        )
        return None, None

    if dataset_paths:
        return dataset_paths, found_formats

    # Nothing usable: distinguish "too many XLSX files" from "no
    # supported files at all" so the user gets an actionable message.
    if has_xlsx and len(found_formats) == 1:
        logger.error(
            f"Multiple XLSX files found in directory: {data}\n"
            f"Excel format (XLSX) validation only supports single files.\n"
            f"Please provide either a single XLSX file or use other supported formats: "
            f"{VALIDATION_FORMATS_MESSAGE}"
        )
    else:
        logger.error(
            f"No valid dataset files found in directory: {data}\n"
            f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n"
            f"Please ensure your directory contains files in one of these formats."
        )
    return None, None


def _validate_dataset_paths(dataset_path: tuple[str], logger) -> tuple[list, set]:
    """Validate dataset paths and return dataset paths and found formats.

    Args:
        dataset_path: file paths given via repeated ``--dataset-path`` options.
        logger: logger used to report validation errors to the user.

    Returns:
        ``(dataset_paths, found_formats)`` on success, or ``(None, None)``
        after logging an error when the files violate the XLSX single-file
        constraint or none of them are in a supported format.
    """
    # C416: identity comprehension replaced with list() over the tuple.
    dataset_paths, found_formats = valid_data_file(list(dataset_path))

    # XLSX mixed with any other format is rejected outright.
    if DataFormatTypes.XLSX.value in found_formats and len(found_formats) > 1:
        logger.error(
            f"Argument --dataset-path contains XLSX files mixed with other formats ({', '.join(found_formats)}).\n"
            f"Excel format (XLSX) validation only supports single files.\n"
            f"Please provide either a single XLSX file or use other supported formats: "
            f"{VALIDATION_FORMATS_MESSAGE}"
        )
        return None, None

    if not dataset_paths:
        # Distinguish "too many XLSX files" from "no supported files".
        if DataFormatTypes.XLSX.value in found_formats and len(found_formats) == 1:
            logger.error(
                f"Multiple XLSX files provided.\n"
                f"Excel format (XLSX) validation only supports single files.\n"
                f"Please provide either a single XLSX file or use other supported formats: "
                f"{VALIDATION_FORMATS_MESSAGE}"
            )
        else:
            logger.error(
                f"No valid dataset files provided.\n"
                f"Supported formats: {VALIDATION_FORMATS_MESSAGE}\n"
                f"Please ensure your files are in one of these formats."
            )
        return None, None

    return dataset_paths, found_formats


def _validate_no_arguments(logger) -> None:
"""Validate that at least one dataset argument is provided."""
logger.error("You must pass one of the following arguments: --dataset-path, --data")


@click.command()
@click.option(
"-ca",
Expand All @@ -70,14 +169,14 @@ def cli():
"-d",
"--data",
required=False,
help="Path to directory containing data files",
help=f"Path to directory containing data files ({VALIDATION_FORMATS_MESSAGE})",
)
@click.option(
"-dp",
"--dataset-path",
required=False,
multiple=True,
help="Absolute path to dataset file",
help=f"Absolute path to dataset file ({VALIDATION_FORMATS_MESSAGE})",
)
@click.option(
"-l",
Expand Down Expand Up @@ -323,32 +422,22 @@ def validate(
},
}
)
# Validate dataset arguments
if data:
if dataset_path:
logger.error(
"Argument --dataset-path cannot be used together with argument --data"
)
ctx.exit(2)
dataset_paths, found_formats = valid_data_file(
[str(p) for p in Path(data).rglob("*") if p.is_file()]
)
if len(found_formats) > 1:
logger.error(
f"Argument --data contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501
)
dataset_paths, found_formats = _validate_data_directory(data, logger)
if dataset_paths is None:
ctx.exit(2)
elif dataset_path:
dataset_paths, found_formats = valid_data_file([dp for dp in dataset_path])
if len(found_formats) > 1:
logger.error(
f"Argument --dataset-path contains more than one allowed file format ({', '.join(found_formats)})." # noqa: E501
)
dataset_paths, found_formats = _validate_dataset_paths(dataset_path, logger)
if dataset_paths is None:
ctx.exit(2)
else:
logger.error(
"You must pass one of the following arguments: --dataset-path, --data"
)
# no need to define dataset_paths here, the program execution will stop
_validate_no_arguments(logger)
ctx.exit(2)
validate_xml_bool = True if validate_xml.lower() in ("y", "yes") else False
run_validation(
Expand Down
20 changes: 20 additions & 0 deletions scripts/list_dataset_metadata_handler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from typing import Tuple, List

from cdisc_rules_engine.config import config
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata
from cdisc_rules_engine.serializers import DatasetMetadataSerializer
from cdisc_rules_engine.services.cache import CacheServiceFactory
Expand Down Expand Up @@ -30,6 +31,25 @@ def list_dataset_metadata_handler(dataset_paths: Tuple[str]) -> List[dict]:
...
]
"""
invalid_files = []

for path in dataset_paths:
file_ext = path.split(".")[-1].upper()
if file_ext not in [
DataFormatTypes.XPT.value,
DataFormatTypes.JSON.value,
DataFormatTypes.NDJSON.value,
DataFormatTypes.XLSX.value,
]:
invalid_files.append((path, file_ext))

if invalid_files:
error_msg = "Unsupported file format(s) detected:\n"
for file, ext in invalid_files:
error_msg += f" - {file} (format: {ext})\n"
error_msg += "\nSupported formats: SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)"
raise ValueError(error_msg)

cache_service = CacheServiceFactory(config).get_service()
data_service = DataServiceFactory(config, cache_service).get_service()
metadata: List[SDTMDatasetMetadata] = [
Expand Down
Loading
Loading