diff --git a/README.md b/README.md
index 39883c5c2..d5036907d 100644
--- a/README.md
+++ b/README.md
@@ -138,19 +138,21 @@ This will show the list of validation options.
   -ca, --cache TEXT               Relative path to cache files containing pre-loaded metadata and rules
   -ps, --pool-size INTEGER        Number of parallel processes for validation
-  -d, --data TEXT                 Path to directory containing data files
+  -d, --data TEXT                 Path to directory containing data files.
+                                  The DATA_DIR environment variable can be used to pass the value.
   -dp, --dataset-path TEXT        Absolute path to dataset file. Can be specified multiple times.
-  -dxp, --define-xml-path TEXT    Path to Define-XML
+                                  The DATASET_PATH environment variable can be used to pass values separated by ':' on Unix and ';' on Windows.
+  -dxp, --define-xml-path TEXT    Path to Define-XML. The DEFINE environment variable can be used to pass the value.
   -l, --log-level [info|debug|error|critical|disabled|warn]
                                   Sets log level for engine logs; logs are disabled by default
   -rt, --report-template TEXT     File path of report template to use for Excel output
-  -s, --standard TEXT             CDISC standard to validate against
+  -s, --standard TEXT             CDISC standard to validate against. The STANDARD environment variable can be used to pass the value.
                                   [required]
-  -v, --version TEXT              Standard version to validate against
+  -v, --version TEXT              Standard version to validate against. The VERSION environment variable can be used to pass the value.
                                   [required]
-  -ss, --substandard TEXT         Substandard to validate against
+  -ss, --substandard TEXT         Substandard to validate against: "SDTM", "SEND", "ADaM", or "CDASH". The SUBSTANDARD environment variable can be used to pass the value.
+                                  [required for TIG]
   -uc, --use-case TEXT            Use Case for TIG Validation
@@ -161,7 +163,8 @@ This will show the list of validation options.
                                   against, can provide more than one
                                   NOTE: if a Define-XML is provided and it is
                                   version 2.1, the engine will use the CT laid out in the define. If it is
-                                  version 2.0, -ct is expected to specify the CT package
+                                  version 2.0, -ct is expected to specify the CT package.
+                                  The CONTROLLED_TERMINOLOGY_PACKAGE environment variable can be used to pass values separated by ':' on Unix and ';' on Windows.
   -o, --output TEXT               Report output file destination and name. Path
                                   will be relative to the validation execution
                                   directory and should end in the desired output filename
@@ -204,7 +207,7 @@ This will show the list of validation options.
                                   if both .env and -me are specified, the larger value will be used.
                                   If either sets the per_dataset_flag to true, it will be true.
                                   If limit is set to 0, no maximum will be enforced. No maximum is the default behavior.
-  -dv, --define-version TEXT      Define-XML version used for validation
+  -dv, --define-version TEXT      Define-XML version used for validation. The DEFINE_VERSION environment variable can be used to pass the value.
   -dxp, --define-xml-path         Path to define-xml file.
   -vx, --validate-xml             Enable XML validation (default 'y' to enable, otherwise disable).
   --whodrug TEXT                  Path to directory with WHODrug dictionary
@@ -221,9 +224,12 @@ This will show the list of validation options.
   --snomed-url TEXT               Base URL of the SNOMED API to use. (ex. https://snowstorm.snomedtools.org/snowstorm/snomed-ct)
   --snomed-edition TEXT           Edition of SNOMED to use. (ex. SNOMEDCT-US)
   -r, --rules TEXT                Specify rule core ID ex. CORE-000001. Can be specified multiple times.
+                                  The RULES environment variable can be used to pass values separated by ':' on Unix and ';' on Windows.
   -er, --exclude-rules TEXT       Specify rule core ID to exclude, ex. CORE-000001. Can be specified multiple times.
+                                  The EXCLUDE_RULES environment variable can be used to pass values separated by ':' on Unix and ';' on Windows.
   -lr, --local-rules TEXT         Specify relative path to directory or file containing local rule yml and/or json rule files.
+                                  The LOCAL_RULES environment variable can be used to pass values separated by ':' on Unix and ';' on Windows.
   -cs, --custom-standard          Adding this flag tells the engine to use a custom standard specified with -s and -v that has been uploaded to the cache using update-cache
   -cse, --custom-standard-encoding TEXT
diff --git a/cdisc_rules_engine/enums/dataformat_types.py b/cdisc_rules_engine/enums/dataformat_types.py
index 972d50ce4..6ad55b87d 100644
--- a/cdisc_rules_engine/enums/dataformat_types.py
+++ b/cdisc_rules_engine/enums/dataformat_types.py
@@ -8,3 +8,4 @@ class DataFormatTypes(BaseEnum):
     USDM = "USDM"
     XLSX = "XLSX"
     XPT = "XPT"
+    CSV = "CSV"
diff --git a/cdisc_rules_engine/exceptions/custom_exceptions.py b/cdisc_rules_engine/exceptions/custom_exceptions.py
index 94436108a..a0e3b140f 100644
--- a/cdisc_rules_engine/exceptions/custom_exceptions.py
+++ b/cdisc_rules_engine/exceptions/custom_exceptions.py
@@ -82,6 +82,11 @@ class CTPackageNotFoundError(EngineError):
     description = "Controlled terminology package(s) not found"
 
 
+class InvalidCSVFile(EngineError):
+    code = 400
+    description = "CSV data is malformed."
+
+
 class NumberOfAttemptsExceeded(EngineError):
     pass
 
diff --git a/cdisc_rules_engine/interfaces/data_reader_interface.py b/cdisc_rules_engine/interfaces/data_reader_interface.py
index a21c43416..903d1b511 100644
--- a/cdisc_rules_engine/interfaces/data_reader_interface.py
+++ b/cdisc_rules_engine/interfaces/data_reader_interface.py
@@ -11,8 +11,8 @@ def __init__(
         self, dataset_implementation=PandasDataset, encoding: str = DEFAULT_ENCODING
     ):
         """
-        :param dataset_implementation DatasetInterface: The dataset type to return.
-        :param encoding str: The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8).
+        :param DatasetInterface dataset_implementation: The dataset type to return.
+        :param str encoding: The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8).
""" self.dataset_implementation = dataset_implementation self.encoding = encoding @@ -26,3 +26,7 @@ def read(self, data): def from_file(self, file_path): raise NotImplementedError + + def to_parquet(self, file_path) -> tuple[int, str]: + """Returns number of rows and path to the parquet file""" + raise NotImplementedError diff --git a/cdisc_rules_engine/services/csv_metadata_reader.py b/cdisc_rules_engine/services/csv_metadata_reader.py new file mode 100644 index 000000000..63bb8afa4 --- /dev/null +++ b/cdisc_rules_engine/services/csv_metadata_reader.py @@ -0,0 +1,168 @@ +import logging +from datetime import datetime +from pathlib import Path + +import pandas as pd + +from cdisc_rules_engine.constants import DEFAULT_ENCODING + + +class DatasetCSVMetadataReader: + def __init__( + self, file_path: str, file_name: str, encoding: str = DEFAULT_ENCODING + ): + self.file_path = file_path + self.file_name = file_name + self.encoding = encoding + + def read(self) -> dict: + dataset_name = Path(self.file_name).stem.lower() + variables_file_path = Path(self.file_path).parent / "variables.csv" + + if not variables_file_path.exists(): + logger = logging.getLogger("validator") + logger.info("No variables file found for %s", dataset_name) + variables_meta = {} + else: + variables_meta = self.__get_variable_metadata( + dataset_name, variables_file_path + ) + + metadata = { + "dataset_name": dataset_name.upper(), + "dataset_modification_date": datetime.fromtimestamp( + Path(self.file_path).stat().st_mtime + ).isoformat(), + "adam_info": { + "categorization_scheme": {}, + "w_indexes": {}, + "period": {}, + "selection_algorithm": {}, + }, + } + metadata.update(variables_meta) + metadata.update(self.__data_meta()) + metadata.update(self.__dataset_label()) + return metadata + + def __get_variable_metadata( + self, dataset_name: str, variables_file_path: Path + ) -> dict: + logger = logging.getLogger("validator") + try: + meta_df = pd.read_csv(variables_file_path, encoding=self.encoding) + except (UnicodeDecodeError, UnicodeError) as e: + logger.error( + f"Could not decode CSV file {variables_file_path} with {self.encoding} encoding: {e}. " + f"Please specify the correct encoding using the -e flag." + ) + return {} + except Exception as e: + logger.error("Error reading CSV file %s. 
%s", self.file_path, e) + return {} + + meta_df["dataset"] = meta_df["dataset"].apply( + lambda x: Path(str(x)).stem.lower() + ) + + dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name] + + if dataset_meta_df.empty: + logger = logging.getLogger("validator") + logger.info("No dataset metadata found for %s", dataset_name) + return {} + + variable_names = dataset_meta_df["variable"].tolist() + variable_labels = dataset_meta_df["label"].tolist() + + variable_name_to_label_map = dict(zip(variable_names, variable_labels)) + variable_name_to_data_type_map = dict( + zip(variable_names, dataset_meta_df["type"]) + ) + variable_name_to_size_map = { + var: (int(length) if pd.notna(length) else None) + for var, length in zip(variable_names, dataset_meta_df["length"]) + } + return { + "variable_names": variable_names, + "variable_labels": variable_labels, + "variable_formats": [""] * len(variable_names), + "variable_name_to_label_map": variable_name_to_label_map, + "variable_name_to_data_type_map": variable_name_to_data_type_map, + "variable_name_to_size_map": variable_name_to_size_map, + "number_of_variables": len(variable_names), + } + + def __dataset_label(self) -> dict: + logger = logging.getLogger("validator") + tables_file_path = Path(self.file_path).parent / "tables.csv" + if not tables_file_path.exists(): + return {} + + try: + tables_df = pd.read_csv(tables_file_path, encoding=self.encoding) + except (UnicodeDecodeError, UnicodeError) as e: + logger.error( + f"\n Error reading CSV from: {self.file_path}" + f"\n Failed to decode with {self.encoding} encoding: {e}" + f"\n Please specify the correct encoding using the -e flag." + ) + return {} + except Exception as e: + logger.error("Error reading CSV file %s. %s", self.file_path, e) + return {} + + if "Filename" not in tables_df.columns or "Label" not in tables_df.columns: + return {} + + tables_df["dataset"] = tables_df["Filename"].apply( + lambda x: Path(str(x)).stem.lower() + ) + + current_dataset = Path(self.file_name).stem.lower() + match = tables_df[tables_df["dataset"] == current_dataset] + + if match.empty: + return {} + + return {"dataset_label": str(match.iloc[0]["Label"])} + + def __data_meta(self): + logger = logging.getLogger("validator") + result = { + "dataset_length": 0, + "first_record": {}, + } + try: + first_row_df = pd.read_csv(self.file_path, encoding=self.encoding, nrows=1) + except (UnicodeDecodeError, UnicodeError) as e: + logger.error( + f"\n Error reading CSV from: {self.file_path}" + f"\n Failed to decode with {self.encoding} encoding: {e}" + f"\n Please specify the correct encoding using the -e flag." + ) + return result + except Exception as e: + logger.error("Error reading CSV file %s. %s", self.file_path, e) + return result + + if not first_row_df.empty: + result["first_record"] = ( + first_row_df.iloc[0].fillna("").astype(str).to_dict() + ) + + try: + with open(self.file_path, encoding=self.encoding) as f: + result["dataset_length"] = max( + sum(1 for _ in f) - 1, 0 + ) # subtract header + except (UnicodeDecodeError, UnicodeError) as e: + logger.error( + f"\n Error reading CSV from: {self.file_path}" + f"\n Failed to decode with {self.encoding} encoding: {e}" + f"\n Please specify the correct encoding using the -e flag." + ) + except Exception as e: + logger.error("Error reading CSV file %s. 
%s", self.file_path, e) + + return result diff --git a/cdisc_rules_engine/services/data_readers/csv_reader.py b/cdisc_rules_engine/services/data_readers/csv_reader.py new file mode 100644 index 000000000..675c9ff38 --- /dev/null +++ b/cdisc_rules_engine/services/data_readers/csv_reader.py @@ -0,0 +1,50 @@ +import tempfile + +from cdisc_rules_engine.exceptions.custom_exceptions import InvalidCSVFile +from cdisc_rules_engine.interfaces import DataReaderInterface +import pandas as pd + + +class CSVReader(DataReaderInterface): + def read(self, data): + """ + Function for reading data from a specific file type and returning a + pandas dataframe of the data. + """ + raise NotImplementedError + + def from_file(self, file_path): + try: + with open(file_path, "r", encoding=self.encoding) as fp: + data = pd.read_csv(fp, sep=",", header=0, index_col=False) + return data + except (UnicodeDecodeError, UnicodeError) as e: + raise InvalidCSVFile( + f"\n Error reading CSV from: {file_path}" + f"\n Failed to decode with {self.encoding} encoding: {e}" + f"\n Please specify the correct encoding using the -e flag." + ) + except Exception as e: + raise InvalidCSVFile( + f"\n Error reading CSV from: {file_path}" + f"\n {type(e).__name__}: {e}" + ) + + def to_parquet(self, file_path: str) -> tuple[int, str]: + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") + + dataset = pd.read_csv(file_path, chunksize=20000, encoding=self.encoding) + + created = False + num_rows = 0 + + for chunk in dataset: + num_rows += len(chunk) + + if not created: + chunk.to_parquet(temp_file.name, engine="fastparquet") + created = True + else: + chunk.to_parquet(temp_file.name, engine="fastparquet", append=True) + + return num_rows, temp_file.name diff --git a/cdisc_rules_engine/services/data_readers/data_reader_factory.py b/cdisc_rules_engine/services/data_readers/data_reader_factory.py index 2df492a86..6f835e789 100644 --- a/cdisc_rules_engine/services/data_readers/data_reader_factory.py +++ b/cdisc_rules_engine/services/data_readers/data_reader_factory.py @@ -4,6 +4,7 @@ DataReaderInterface, FactoryInterface, ) +from cdisc_rules_engine.services.data_readers.csv_reader import CSVReader from cdisc_rules_engine.services.data_readers.xpt_reader import XPTReader from cdisc_rules_engine.services.data_readers.dataset_json_reader import ( DatasetJSONReader, @@ -19,12 +20,13 @@ class DataReaderFactory(FactoryInterface): - _reader_map = { + _reader_map: dict[str, Type[DataReaderInterface]] = { DataFormatTypes.XPT.value: XPTReader, DataFormatTypes.PARQUET.value: ParquetReader, DataFormatTypes.JSON.value: DatasetJSONReader, DataFormatTypes.NDJSON.value: DatasetNDJSONReader, DataFormatTypes.USDM.value: JSONReader, + DataFormatTypes.CSV.value: CSVReader, } def __init__( diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 6f2408a51..980d1e6c4 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -20,6 +20,7 @@ from cdisc_rules_engine.services.datasetndjson_metadata_reader import ( DatasetNDJSONMetadataReader, ) +from cdisc_rules_engine.services.csv_metadata_reader import DatasetCSVMetadataReader from cdisc_rules_engine.utilities.utils import ( convert_file_size, extract_file_name_from_path_string, @@ -194,6 +195,7 @@ def read_metadata( DataFormatTypes.XPT.value: DatasetXPTMetadataReader, DataFormatTypes.JSON.value: 
DatasetJSONMetadataReader, DataFormatTypes.NDJSON.value: DatasetNDJSONMetadataReader, + DataFormatTypes.CSV.value: DatasetCSVMetadataReader, } file_extension = file_name.split(".")[1].upper() diff --git a/core.py b/core.py index 2ef8d21ef..a00bc2fa1 100644 --- a/core.py +++ b/core.py @@ -66,6 +66,7 @@ def valid_data_file(data_path: list) -> tuple[list, set]: DataFormatTypes.JSON.value, DataFormatTypes.NDJSON.value, DataFormatTypes.XLSX.value, + DataFormatTypes.CSV.value, ] found_formats = set() file_list = [] @@ -104,6 +105,44 @@ def cli(): pass +def _filter_dataset_paths( + dataset_paths: list[str], encoding: str = DEFAULT_ENCODING +) -> list[str]: + """ + Filters dataset paths based on tables.csv content (if exists). + + Keeps only datasets listed in tables.csv (Filename column). + Always excludes tables.csv and variables.csv from result. + """ + import pandas as pd + + paths = [Path(p) for p in dataset_paths] + tables_path = next((p for p in paths if p.name.lower() == "tables.csv"), None) + + dataset_files = [ + p for p in paths if p.name.lower() not in ("tables.csv", "variables.csv") + ] + + if not tables_path: + return [str(p) for p in dataset_files if p.suffix.lower() == ".csv"] + + tables_df = pd.read_csv(tables_path, encoding=encoding) + + if "Filename" not in tables_df.columns: + return [str(p) for p in dataset_files if p.suffix.lower() == ".csv"] + + allowed_datasets = { + Path(str(name)).stem.lower() for name in tables_df["Filename"].dropna() + } + + filtered = { + str(p) + for p in dataset_files + if p.suffix.lower() == ".csv" and p.stem.lower() in allowed_datasets + } + return list(filtered) + + def _validate_data_directory( data: str, logger, filetype: str = None ) -> tuple[list, set]: @@ -127,7 +166,8 @@ def _validate_data_directory( f"{VALIDATION_FORMATS_MESSAGE}" ) return [], set() - + elif DataFormatTypes.CSV.value in found_formats: + dataset_paths = _filter_dataset_paths(dataset_paths) if not dataset_paths: if DataFormatTypes.XLSX.value in found_formats and len(found_formats) == 1: logger.error( @@ -203,6 +243,20 @@ def _validate_no_arguments(logger) -> None: logger.error("You must pass one of the following arguments: --dataset-path, --data") +def load_custom_dotenv(ctx, param, value): + if not value: + return value + if isinstance(value, str): + env_path = os.path.join(value, ".env") + elif isinstance(value, tuple): + env_path = os.path.join(Path(value[0]).parent, ".env") + else: + return value + if os.path.exists(env_path): + load_dotenv(env_path, override=False) + return value + + @click.command() @click.option( "-ca", @@ -222,6 +276,8 @@ def _validate_no_arguments(logger) -> None: "--data", required=False, help=f"Path to directory containing data files ({VALIDATION_FORMATS_MESSAGE})", + envvar="DATA_DIR", + callback=load_custom_dotenv, ) @click.option( "-ft", @@ -236,6 +292,8 @@ def _validate_no_arguments(logger) -> None: required=False, multiple=True, help=f"Absolute path to dataset file ({VALIDATION_FORMATS_MESSAGE})", + envvar="DATASET_PATH", + callback=load_custom_dotenv, ) @click.option( "-l", @@ -255,6 +313,7 @@ def _validate_no_arguments(logger) -> None: required=True, default=None, help="CDISC standard to validate against", + envvar="STANDARD", ) @click.option( "-v", @@ -262,6 +321,7 @@ def _validate_no_arguments(logger) -> None: required=True, default=None, help="Standard version to validate against", + envvar="VERSION", ) @click.option( "-ss", @@ -269,6 +329,7 @@ def _validate_no_arguments(logger) -> None: default=None, type=click.Choice(["sdtm", "send", 
"adam", "cdash"], case_sensitive=False), help="CDISC Substandard to validate against. Any of SDTM, SEND, ADaM, CDASH", + envvar="SUBSTANDARD", ) @click.option( "-uc", @@ -289,6 +350,7 @@ def _validate_no_arguments(logger) -> None: "Controlled terminology package to validate against, " "can provide more than one" ), + envvar="CONTROLLED_TERMINOLOGY_PACKAGE", ) @click.option( "-o", @@ -320,6 +382,7 @@ def _validate_no_arguments(logger) -> None: "--define-version", type=click.Choice(["2-1", "2-0", "2.0", "2.1"]), help="Define-XML version used for validation", + envvar="DEFINE_VERSION", ) @click.option("--whodrug", help="Path to directory with WHODrug dictionary files") @click.option("--meddra", help="Path to directory with MedDRA dictionary files") @@ -338,12 +401,14 @@ def _validate_no_arguments(logger) -> None: "-r", multiple=True, help="Specify rule core ID ex. CORE-000001. Can be specified multiple times", + envvar="RULES", ) @click.option( "--exclude-rules", "-er", multiple=True, help="Specify rule core ID to exclude, ex. CORE-000001. Can be specified multiple times", + envvar="EXCLUDE_RULES", ) @click.option( "--local-rules", @@ -352,6 +417,7 @@ def _validate_no_arguments(logger) -> None: type=click.Path(exists=True, readable=True, resolve_path=True), help="Path to directory containing local rules.", multiple=True, + envvar="LOCAL_RULES", ) @click.option( "--custom-standard", @@ -372,7 +438,13 @@ def _validate_no_arguments(logger) -> None: "is printed." ), ) -@click.option("-dxp", "--define-xml-path", required=False, help="Path to Define-XML") +@click.option( + "-dxp", + "--define-xml-path", + required=False, + help="Path to Define-XML", + envvar="DEFINE", +) @click.option( "-vx", "--validate-xml", @@ -731,6 +803,7 @@ def update_cache( required=False, help="Rule ID to get rule for.", multiple=True, + envvar="RULEID", ) @click.pass_context def list_rules( diff --git a/tests/QARegressionTests/test_Issues/test_CoreIssue1558.py b/tests/QARegressionTests/test_Issues/test_CoreIssue1558.py new file mode 100644 index 000000000..5ee178901 --- /dev/null +++ b/tests/QARegressionTests/test_Issues/test_CoreIssue1558.py @@ -0,0 +1,58 @@ +import os +import subprocess +import unittest + +import pytest +import json +from conftest import get_python_executable + + +@pytest.mark.regression +class TestCoreIssue1501(unittest.TestCase): + def test_raw_report(self): + # Run the command in the terminal + command = [ + f"{get_python_executable()}", + "-m", + "core", + "validate", + "-s", + "sdtmig", + "-r", + "CORE-000007", + "-v", + "3.4", + "-d", + os.path.join( + "tests", + "resources", + "CoreIssue1558", + "datasets", + ), + "--output-format", + "json", + "-ps", + "1", + ] + subprocess.run(command, check=True) + + files = os.listdir() + json_files = [ + file + for file in files + if file.startswith("CORE-Report-") and file.endswith(".json") + ] + json_report_path = sorted(json_files)[-1] + # Open the JSON report file + json_report = json.load(open(json_report_path)) + assert { + "Conformance_Details", + "Dataset_Details", + "Issue_Summary", + "Issue_Details", + "Rules_Report", + }.issubset(json_report.keys()) + datasets = {x["filename"] for x in json_report["Dataset_Details"]} + assert {"LB", "DM"}.issubset(datasets) + if os.path.exists(json_report_path): + os.remove(json_report_path) diff --git a/tests/resources/CoreIssue1558/datasets/dm.csv b/tests/resources/CoreIssue1558/datasets/dm.csv new file mode 100644 index 000000000..540695d4a --- /dev/null +++ b/tests/resources/CoreIssue1558/datasets/dm.csv @@ 
-0,0 +1,19 @@ +STUDYID,DOMAIN,USUBJID,SUBJID,RFSTDTC,RFENDTC,RFXSTDTC,RFXENDTC,RFICDTC,RFPENDTC,DTHDTC,DTHFL,SITEID,BRTHDTC,AGE,AGEU,SEX,RACE,ETHNIC,ARMCD,ARM,ACTARMCD,ACTARM,ARMNRS,ACTARMUD,COUNTRY +CDISCPILOT01,DM,CDISC001,1115,2012-11-30,2013-01-23,2012-11-30,2013-01-23,2012-11-23,2013-05-20,,,701,1928,84,YEARS,M,WHITE,NOT HISPANIC OR LATINO,ZAN_LOW,Zanomaline Low Dose (54 mg),ZAN_LOW,Zanomaline Low Dose (54 mg),,,USA +CDISCPILOT01,DM,CDISC002,1211,2012-11-15,2013-01-14,2012-11-15,2013-01-12,2012-10-30,2013-01-14,2013-01-14,Y,701,1936,76,YEARS,F,WHITE,NOT HISPANIC OR LATINO,ZAN_LOW,Zanomaline Low Dose (54 mg),ZAN_LOW,Zanomaline Low Dose (54 mg),,,USA +CDISCPILOT01,DM,CDISC003,1302,2013-08-29,2013-11-05,2013-08-29,2013-11-05,2013-08-20,2014-02-13,,,701,1951,61,YEARS,M,WHITE,NOT HISPANIC OR LATINO,ZAN_HIGH,Zanomaline High Dose (81 mg),ZAN_HIGH,Zanomaline High Dose (81 mg),,,USA +CDISCPILOT01,DM,CDISC004,1345,2013-10-08,2014-03-18,2013-10-08,2014-03-18,2013-10-01,2014-03-18,,,701,1950,63,YEARS,F,WHITE,NOT HISPANIC OR LATINO,PLACEBO,Placebo,PLACEBO,Placebo,,,USA +CDISCPILOT01,DM,CDISC005,1383,2013-02-04,2013-08-06,2013-02-04,2013-08-06,2013-01-22,2013-08-06,,,701,1941,72,YEARS,F,WHITE,NOT HISPANIC OR LATINO,ZAN_HIGH,Zanomaline High Dose (81 mg),ZAN_HIGH,Zanomaline High Dose (81 mg),,,USA +CDISCPILOT01,DM,CDISC006,1429,2013-03-19,2013-04-30,2013-03-19,2013-04-30,2013-02-25,2013-04-30,,,701,1929,84,YEARS,F,WHITE,NOT HISPANIC OR LATINO,ZAN_LOW,Zanomaline Low Dose (54 mg),ZAN_LOW,Zanomaline Low Dose (54 mg),,,USA +CDISCPILOT01,DM,CDISC007,1444,2013-01-05,2013-02-13,2013-01-05,2013-02-12,2012-12-31,2013-06-20,,,701,1949,63,YEARS,M,WHITE,HISPANIC OR LATINO,ZAN_HIGH,Zanomaline High Dose (81 mg),ZAN_HIGH,Zanomaline High Dose (81 mg),,,USA +CDISCPILOT01,DM,CDISC008,1445,2014-05-11,2014-11-01,2014-05-11,2014-11-01,2014-05-01,2014-11-01,2014-11-01,Y,704,1939,75,YEARS,M,MULTIPLE,NOT HISPANIC OR LATINO,PLACEBO,Placebo,PLACEBO,Placebo,,,USA +CDISCPILOT01,DM,CDISC009,1087,2012-10-22,2013-04-28,2012-10-22,2013-04-28,2012-10-06,2013-04-28,,,708,1938,74,YEARS,F,WHITE,NOT HISPANIC OR LATINO,PLACEBO,Placebo,PLACEBO,Placebo,,,USA +CDISCPILOT01,DM,CDISC010,1236,2013-09-21,2013-09-26,2013-09-21,2013-09-21,2013-09-08,2013-09-26,,,708,1927,86,YEARS,F,WHITE,NOT HISPANIC OR LATINO,ZAN_HIGH,Zanomaline High Dose (81 mg),ZAN_HIGH,Zanomaline High Dose (81 mg),,,USA +CDISCPILOT01,DM,CDISC011,1336,2012-12-07,2013-06-05,2012-12-07,2013-06-05,2012-11-21,2013-07-05,,,708,1939,73,YEARS,M,WHITE,NOT HISPANIC OR LATINO,ZAN_HIGH,Zanomaline High Dose (81 mg),ZAN_HIGH,Zanomaline High Dose (81 mg),,,USA +CDISCPILOT01,DM,CDISC012,1378,2013-09-03,2014-01-28,2013-09-03,2014-01-28,2013-08-24,2014-01-28,,,708,1946,67,YEARS,M,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,PLACEBO,Placebo,PLACEBO,Placebo,,,USA +CDISCPILOT01,DM,CDISC013,1083,2013-07-22,2013-08-03,2013-07-22,2013-08-01,2013-07-09,2013-08-03,2013-08-03,Y,710,1924,89,YEARS,F,WHITE,NOT HISPANIC OR LATINO,PLACEBO,Placebo,PLACEBO,Placebo,,,USA +CDISCPILOT01,DM,CDISC014,1012,2013-04-03,2013-05-02,2013-04-03,2013-04-29,2013-03-20,2013-09-18,,,711,1945,67,YEARS,F,WHITE,NOT HISPANIC OR LATINO,ZAN_HIGH,Zanomaline High Dose (81 mg),ZAN_HIGH,Zanomaline High Dose (81 mg),,,USA +CDISCPILOT01,DM,CDISC015,1022,,,,,2014-03-17,2014-03-17,,,711,1928,86,YEARS,F,WHITE,NOT HISPANIC OR LATINO,,,,,SCREEN FAILURE,,USA +CDISCPILOT01,DM,CDISC016,1143,2013-04-03,2013-06-01,2013-04-03,2013-05-30,2013-03-30,2013-09-22,,,711,1936,76,YEARS,F,WHITE,NOT HISPANIC OR LATINO,ZAN_LOW,Zanomaline Low Dose (54 
mg),ZAN_LOW,Zanomaline Low Dose (54 mg),,,USA +CDISCPILOT01,DM,CDISC017,1250,2013-09-21,2014-02-08,2013-09-21,2014-01-31,2013-08-21,2014-03-08,,,718,1931,82,YEARS,F,WHITE,HISPANIC OR LATINO,ZAN_LOW,Zanomaline Low Dose (54 mg),ZAN_LOW,Zanomaline Low Dose (54 mg),,,USA +CDISCPILOT01,DM,CDISC018,1427,2012-12-17,2013-02-18,2012-12-17,2013-02-11,2012-12-13,2013-06-03,,,718,1938,74,YEARS,F,BLACK OR AFRICAN AMERICAN,NOT HISPANIC OR LATINO,ZAN_HIGH,Zanomaline High Dose (81 mg),ZAN_HIGH,Zanomaline High Dose (81 mg),,,USA diff --git a/tests/resources/CoreIssue1558/datasets/lb.csv b/tests/resources/CoreIssue1558/datasets/lb.csv new file mode 100644 index 000000000..218d1d4be --- /dev/null +++ b/tests/resources/CoreIssue1558/datasets/lb.csv @@ -0,0 +1,58 @@ +STUDYID,DOMAIN,USUBJID,LBSEQ,LBTESTCD,LBTEST,LBCAT,LBORRES,LBORRESU,LBORNRLO,LBORNRHI,LBSTRESC,LBSTRESN,LBSTRESU,LBSTNRLO,LBSTNRHI,LBNRIND,LBLOBXFL,VISITNUM,VISIT,EPOCH,LBDTC,LBDY +CDISCPILOT01,LB,CDISC001,1,ALB,Albumin,CHEMISTRY,3.9,g/dL,3.5,4.6,39,39.0,g/L,35.0,46.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,2,ALP,Alkaline Phosphatase,CHEMISTRY,93,U/L,35,115,93,93.0,U/L,35.0,115.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,3,ALT,Alanine Aminotransferase,CHEMISTRY,18,U/L,6,35,18,18.0,U/L,6.0,35.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,4,AST,Aspartate Aminotransferase,CHEMISTRY,26,U/L,11,36,26,26.0,U/L,11.0,36.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,5,BASO,Basophils,HEMATOLOGY,0.03,10^9/L,0,0.2,0.03,0.03,10^9/L,0.0,0.2,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,6,BILI,Bilirubin,CHEMISTRY,0.5,mg/dL,0.2,1.2,8.55,8.55,umol/L,3.0,21.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,7,UREAN,Urea Nitrogen,CHEMISTRY,21,mg/dL,4,24,7.497,7.497,mmol/L,1.4,8.6,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,8,CA,Calcium,CHEMISTRY,9.1,mg/dL,8.4,10.3,2.27045,2.27045,mmol/L,2.1,2.57,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,9,CHOL,Cholesterol,CHEMISTRY,254,mg/dL,149,286,6.56844,6.56844,mmol/L,3.85,7.4,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,10,CK,Creatine Kinase,CHEMISTRY,79,U/L,22,198,79,79.0,U/L,22.0,198.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,11,CL,Chloride,CHEMISTRY,102,mEq/L,94,112,102,102.0,mmol/L,94.0,112.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,12,COLOR,Color,URINALYSIS,NORMAL,,,,NORMAL,,,,,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,13,CREAT,Creatinine,CHEMISTRY,1.3,mg/dL,0.8,1.6,114.92,114.92,umol/L,71.0,141.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,14,EOS,Eosinophils,HEMATOLOGY,0.08,10^9/L,0,0.57,0.08,0.08,10^9/L,0.0,0.57,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,15,GGT,Gamma Glutamyl Transferase,CHEMISTRY,31,U/L,10,50,31,31.0,U/L,10.0,50.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,16,GLUC,Glucose,CHEMISTRY,74,mg/dL,50,250,4.10774,4.10774,mmol/L,2.8,13.9,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,17,HCT,Hematocrit,HEMATOLOGY,46.0,%,37,51,0.46,0.46,,0.37,0.51,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 
+CDISCPILOT01,LB,CDISC001,18,HGB,Hemoglobin,HEMATOLOGY,14.9,g/dL,12.5,17,9.24694,9.24694,mmol/L,7.76,10.55,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,19,K,Potassium,CHEMISTRY,4.3,mEq/L,3.4,5.4,4.3,4.3,mmol/L,3.4,5.4,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,20,KETONES,Ketones,URINALYSIS,0,,,,0,0.0,,,,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,21,LYM,Lymphocytes,HEMATOLOGY,1.20,10^9/L,0.8,3,1.2,1.2,10^9/L,0.8,3.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,22,MCH,Ery. Mean Corpuscular Hemoglobin,HEMATOLOGY,30,pg,26,34,1.8618,1.8618,fmol,1.6,2.1,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,23,MCHC,Ery. Mean Corpuscular HGB Concentration,HEMATOLOGY,33,g/dL,31,38,20.4798,20.4798,mmol/L,19.0,24.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,24,MCV,Ery. Mean Corpuscular Volume,HEMATOLOGY,92,fL,80,100,92,92.0,fL,80.0,100.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,25,MONO,Monocytes,HEMATOLOGY,0.38,10^9/L,0.12,0.92,0.38,0.38,10^9/L,0.12,0.92,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,26,SODIUM,Sodium,CHEMISTRY,139,mEq/L,135,145,139,139.0,mmol/L,135.0,145.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,27,PH,pH,URINALYSIS,5,,5,8,5,5.0,,5.0,8.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,28,PHOS,Phosphate,CHEMISTRY,3.4,mg/dL,2.2,5.1,1.09786,1.09786,mmol/L,0.71,1.65,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,29,PLAT,Platelets,HEMATOLOGY,150,10^9/L,130,394,150,150.0,10^9/L,130.0,394.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,30,PROT,Protein,CHEMISTRY,6.7,g/dL,6,8,67,67.0,g/L,60.0,80.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,31,RBC,Erythrocytes,HEMATOLOGY,5.00,10^12/L,4,5.8,5,5.0,10^12/L,4.0,5.8,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,32,SPGRAV,Specific Gravity,URINALYSIS,1.017,,1.006,1.03,1.017,1.017,,1.006,1.03,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,33,TSH,Thyrotropin,OTHER,2.63,mIU/L,.32,5,2.63,2.63,mU/L,0.32,5.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,34,URATE,Urate,CHEMISTRY,4.9,mg/dL,2.5,7.5,291.452,291.452,umol/L,149.0,446.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,35,UROBIL,Urobilinogen,URINALYSIS,0,,,,0,0.0,,,,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,36,VITB12,Vitamin B12,OTHER,641,ng/L,200,900,472.9298,472.9298,pmol/L,148.0,664.0,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,37,WBC,Leukocytes,HEMATOLOGY,4.99,10^9/L,3.8,10.7,4.99,4.99,10^9/L,3.8,10.7,NORMAL,Y,1.0,SCREENING 1,SCREENING,2012-11-23T11:20,-7 +CDISCPILOT01,LB,CDISC001,38,ALB,Albumin,CHEMISTRY,3.6,g/dL,3.5,4.6,36,36.0,g/L,35.0,46.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,39,ALP,Alkaline Phosphatase,CHEMISTRY,88,U/L,35,115,88,88.0,U/L,35.0,115.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,40,ALT,Alanine Aminotransferase,CHEMISTRY,15,U/L,6,35,15,15.0,U/L,6.0,35.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,41,AST,Aspartate 
Aminotransferase,CHEMISTRY,25,U/L,11,36,25,25.0,U/L,11.0,36.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,42,BASO,Basophils,HEMATOLOGY,0.04,10^9/L,0,0.2,0.04,0.04,10^9/L,0.0,0.2,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,43,BILI,Bilirubin,CHEMISTRY,0.4,mg/dL,0.2,1.2,6.84,6.84,umol/L,3.0,21.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,44,UREAN,Urea Nitrogen,CHEMISTRY,22,mg/dL,4,24,7.854,7.854,mmol/L,1.4,8.6,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,45,CA,Calcium,CHEMISTRY,8.9,mg/dL,8.4,10.3,2.22055,2.22055,mmol/L,2.1,2.57,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,46,CHOL,Cholesterol,CHEMISTRY,226,mg/dL,149,286,5.84436,5.84436,mmol/L,3.85,7.4,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,47,CK,Creatine Kinase,CHEMISTRY,73,U/L,22,198,73,73.0,U/L,22.0,198.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,48,CL,Chloride,CHEMISTRY,107,mEq/L,94,112,107,107.0,mmol/L,94.0,112.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,49,COLOR,Color,URINALYSIS,NORMAL,,,,NORMAL,,,,,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,50,CREAT,Creatinine,CHEMISTRY,1.3,mg/dL,0.8,1.6,114.92,114.92,umol/L,71.0,141.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,51,EOS,Eosinophils,HEMATOLOGY,0.08,10^9/L,0,0.57,0.08,0.08,10^9/L,0.0,0.57,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,52,GGT,Gamma Glutamyl Transferase,CHEMISTRY,31,U/L,10,50,31,31.0,U/L,10.0,50.0,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,53,GLUC,Glucose,CHEMISTRY,85,mg/dL,50,250,4.71835,4.71835,mmol/L,2.8,13.9,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,54,HCT,Hematocrit,HEMATOLOGY,45.0,%,37,51,0.45,0.45,,0.37,0.51,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,55,HGB,Hemoglobin,HEMATOLOGY,14.6,g/dL,12.5,17,9.06076,9.06076,mmol/L,7.76,10.55,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,56,K,Potassium,CHEMISTRY,5.1,mEq/L,3.4,5.4,5.1,5.1,mmol/L,3.4,5.4,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 +CDISCPILOT01,LB,CDISC001,57,KETONES,Ketones,URINALYSIS,0,,,,0,0.0,,,,NORMAL,,4.0,WEEK 2,TREATMENT,2012-12-13T14:05,14 diff --git a/tests/resources/CoreIssue1558/datasets/tables.csv b/tests/resources/CoreIssue1558/datasets/tables.csv new file mode 100644 index 000000000..316b3b84c --- /dev/null +++ b/tests/resources/CoreIssue1558/datasets/tables.csv @@ -0,0 +1,4 @@ +Filename,Label +pp.xpt,Pharmacokinetics Parameters +dm.xpt,Demographics +lb.xpt,Some Description \ No newline at end of file diff --git a/tests/resources/CoreIssue1558/datasets/variables.csv b/tests/resources/CoreIssue1558/datasets/variables.csv new file mode 100644 index 000000000..07b372de5 --- /dev/null +++ b/tests/resources/CoreIssue1558/datasets/variables.csv @@ -0,0 +1,13 @@ +dataset,variable,label,type,length +dm.xpt,STUDYID,Study Identifier,Char,200 +dm.xpt,DOMAIN,Domain Abbreviation,Char,2 +dm.xpt,USUBJID,Unique Subject Identifier,Char,200 +dm.xpt,SUBJID,Subject Identifier for the Study,Char,40 +dm.xpt,RFSTDTC,Subject Reference Start Date/Time,Char,20 +pp.xpt,STUDYID,Study Identifier,Char,200 +pp.xpt,DOMAIN,Domain Abbreviation,Char,2 +pp.xpt,USUBJID,Unique Subject Identifier,Char,200 +pp.xpt,PPSEQ,Sequence Number,Num,8 +pp.xpt,PPGRPID,Group ID,Char,40 
+pp.xpt,PPTESTCD,Parameter Short Name,Char,8
+pp.xpt,PPTEST,Parameter Name,Char,40
\ No newline at end of file
diff --git a/tests/unit/test_csv_reader.py b/tests/unit/test_csv_reader.py
new file mode 100644
index 000000000..78a80feea
--- /dev/null
+++ b/tests/unit/test_csv_reader.py
@@ -0,0 +1,478 @@
+import logging
+import tempfile
+import textwrap
+from datetime import datetime
+from pathlib import Path
+from unittest.mock import patch
+
+import pandas as pd
+
+from cdisc_rules_engine.services.csv_metadata_reader import DatasetCSVMetadataReader
+from cdisc_rules_engine.services.data_readers.csv_reader import CSVReader
+from core import _filter_dataset_paths
+
+DEFAULT_ENCODING = "utf-8"
+
+
+def norm(paths):
+    """Normalize path separators for cross-platform comparison."""
+    return [Path(p) for p in paths]
+
+
+class TestNoTablesCsv:
+    def test_excludes_non_csv_files(self):
+        paths = ["/data/dm.csv", "/data/readme.txt", "/data/report.xlsx"]
+        assert norm(_filter_dataset_paths(paths)) == norm(["/data/dm.csv"])
+
+    def test_variables_csv_excluded_without_tables_csv(self):
+        paths = ["/data/variables.csv", "/data/dm.csv"]
+        assert norm(_filter_dataset_paths(paths)) == norm(["/data/dm.csv"])
+
+    def test_returns_all_csv_files(self):
+        paths = ["/data/dm.csv", "/data/customers.csv"]
+        result = _filter_dataset_paths(paths)
+        assert sorted(norm(result)) == sorted(norm(paths))
+
+    def test_only_non_csv_returns_empty(self):
+        assert _filter_dataset_paths(["/data/readme.txt", "/data/image.png"]) == []
+
+
+class TestTablesCsvMissingFilenameColumn:
+    def test_returns_all_csv_dataset_files(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Name": ["dm"]}).to_csv(tables_csv, index=False)
+
+        dm = tmp_path / "dm.csv"
+        customers = tmp_path / "customers.csv"
+        dm.touch()
+        customers.touch()
+
+        paths = [str(tables_csv), str(dm), str(customers)]
+        result = _filter_dataset_paths(paths)
+        assert sorted(result) == sorted([str(dm), str(customers)])
+
+    def test_non_csv_files_excluded_when_no_filename_col(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Name": ["dm"]}).to_csv(tables_csv, index=False)
+
+        dm = tmp_path / "dm.csv"
+        readme = tmp_path / "readme.txt"
+        dm.touch()
+        readme.touch()
+
+        result = _filter_dataset_paths([str(tables_csv), str(dm), str(readme)])
+        assert str(dm) in result
+        assert str(readme) not in result
+
+
+class TestTablesCsvWithFilenameColumn:
+    def test_keeps_only_allowed_datasets(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["dm.csv", "customers.csv"]}).to_csv(
+            tables_csv, index=False
+        )
+
+        dm = tmp_path / "dm.csv"
+        customers = tmp_path / "customers.csv"
+        orders = tmp_path / "orders.csv"
+        for f in (dm, customers, orders):
+            f.touch()
+
+        result = _filter_dataset_paths(
+            [str(tables_csv), str(dm), str(customers), str(orders)]
+        )
+        assert sorted(result) == sorted([str(dm), str(customers)])
+        assert str(orders) not in result
+
+    def test_variables_csv_excluded_even_if_listed(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["variables.csv", "dm.csv"]}).to_csv(
+            tables_csv, index=False
+        )
+        variables = tmp_path / "variables.csv"
+        dm = tmp_path / "dm.csv"
+        variables.touch()
+        dm.touch()
+
+        result = _filter_dataset_paths([str(tables_csv), str(variables), str(dm)])
+        assert str(variables) not in result
+        assert str(dm) in result
+
+    def test_filename_with_path_prefix_uses_stem_matching(self, tmp_path):
+        """Filename 
'subdir/dm.csv' -> stem 'dm' -> matches 'dm.csv' on disk."""
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["subdir/dm.csv"]}).to_csv(tables_csv, index=False)
+        dm = tmp_path / "dm.csv"
+        dm.touch()
+
+        result = _filter_dataset_paths([str(tables_csv), str(dm)])
+        assert str(dm) in result
+
+    def test_nan_filenames_are_ignored(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["dm.csv", None]}).to_csv(tables_csv, index=False)
+        dm = tmp_path / "dm.csv"
+        unknown = tmp_path / "unknown.csv"
+        dm.touch()
+        unknown.touch()
+
+        result = _filter_dataset_paths([str(tables_csv), str(dm), str(unknown)])
+        assert str(dm) in result
+        assert str(unknown) not in result
+
+    def test_no_matching_files_returns_empty(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["nonexistent.csv"]}).to_csv(tables_csv, index=False)
+        dm = tmp_path / "dm.csv"
+        dm.touch()
+
+        assert _filter_dataset_paths([str(tables_csv), str(dm)]) == []
+
+    def test_non_csv_files_excluded_even_if_stem_matches(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["dm.csv"]}).to_csv(tables_csv, index=False)
+
+        dm_xlsx = tmp_path / "dm.xlsx"
+        dm_xlsx.touch()
+
+        assert _filter_dataset_paths([str(tables_csv), str(dm_xlsx)]) == []
+
+    def test_encoding_is_forwarded_to_read_csv(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["dm.csv"]}).to_csv(tables_csv, index=False)
+        (tmp_path / "dm.csv").touch()
+
+        with patch("pandas.read_csv", wraps=pd.read_csv) as mock_read:
+            _filter_dataset_paths(
+                [str(tables_csv), str(tmp_path / "dm.csv")], encoding="latin-1"
+            )
+            mock_read.assert_called_once_with(tables_csv, encoding="latin-1")
+
+
+class TestEdgeCases:
+    def test_only_tables_csv_in_input(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["dm.csv"]}).to_csv(tables_csv, index=False)
+        assert _filter_dataset_paths([str(tables_csv)]) == []
+
+    def test_only_variables_csv_in_input(self):
+        assert _filter_dataset_paths(["/data/variables.csv"]) == []
+
+    def test_empty_filename_column_returns_empty(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": pd.Series([], dtype=str)}).to_csv(
+            tables_csv, index=False
+        )
+        dm = tmp_path / "dm.csv"
+        dm.touch()
+
+        assert _filter_dataset_paths([str(tables_csv), str(dm)]) == []
+
+    def test_all_filename_values_nan_returns_empty(self, tmp_path):
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": [None, None]}).to_csv(tables_csv, index=False)
+        dm = tmp_path / "dm.csv"
+        dm.touch()
+
+        assert _filter_dataset_paths([str(tables_csv), str(dm)]) == []
+
+    def test_duplicate_paths_removed(self, tmp_path):
+        """Duplicate input paths are collapsed to a single entry (the function builds a set)."""
+        tables_csv = tmp_path / "tables.csv"
+        pd.DataFrame({"Filename": ["dm.csv"]}).to_csv(tables_csv, index=False)
+        dm = tmp_path / "dm.csv"
+        dm.touch()
+
+        paths = [str(tables_csv), str(dm), str(dm)]
+        result = _filter_dataset_paths(paths)
+        assert result.count(str(dm)) == 1
+
+
+VARIABLES_CSV = textwrap.dedent(
+    """\
+    dataset,variable,label,type,length
+    patients.csv,id,Patient ID,integer,10
+    patients.csv,name,Patient Name,string,50
+    patients.csv,age,Patient Age,integer,3
+"""
+)
+
+DATA_CSV = textwrap.dedent(
+    """\
+    id,name,age
+    1,Alice,30
+    2,Bob,25
+    3,Carol,40
+"""
+)
+
+TABLES_CSV = textwrap.dedent(
+    """\
+    Filename,Label
+    patients.csv,Patient Dataset
+"""
+)
+
+
+def _write(path: Path, 
content: str) -> None: + path.write_text(content, encoding="utf-8") + + +class TestDatasetCSVMetadataReaderRead: + """Tests for DatasetCSVMetadataReader.read()""" + + def setup_method(self): + self.tmpdir = tempfile.mkdtemp() + self.data_path = Path(self.tmpdir) / "patients.csv" + _write(self.data_path, DATA_CSV) + + def _variables_path(self): + return Path(self.tmpdir) / "variables.csv" + + def test_returns_dict_with_expected_keys(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + result = reader.read() + + expected_keys = { + "dataset_name", + "variable_names", + "variable_labels", + "variable_formats", + "variable_name_to_label_map", + "variable_name_to_data_type_map", + "variable_name_to_size_map", + "number_of_variables", + "dataset_modification_date", + "adam_info", + "dataset_length", + "first_record", + } + assert expected_keys.issubset(result.keys()) + + def test_variable_names_and_labels(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + result = reader.read() + assert result["variable_names"] == ["id", "name", "age"] + assert result["variable_labels"] == [ + "Patient ID", + "Patient Name", + "Patient Age", + ] + + def test_number_of_variables(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + assert reader.read()["number_of_variables"] == 3 + + def test_variable_formats_are_empty_strings(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + result = reader.read() + assert all(fmt == "" for fmt in result["variable_formats"]) + + def test_dataset_length_equals_data_rows(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + # DATA_CSV has 3 data rows (+ 1 header) + assert reader.read()["dataset_length"] == 3 + + def test_first_record_matches_first_data_row(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + first = reader.read()["first_record"] + assert first == {"id": "1", "name": "Alice", "age": "30"} + + def test_modification_date_is_iso_string(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + mod_date = reader.read()["dataset_modification_date"] + # Should parse without raising + datetime.fromisoformat(mod_date) + + def test_adam_info_structure(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + adam = reader.read()["adam_info"] + assert adam == { + "categorization_scheme": {}, + "w_indexes": {}, + "period": {}, + "selection_algorithm": {}, + } + + def test_variable_name_to_label_map(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + m = reader.read()["variable_name_to_label_map"] + assert m == {"id": "Patient ID", "name": "Patient Name", "age": "Patient Age"} + + def test_variable_name_to_size_map_with_values(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + sizes = reader.read()["variable_name_to_size_map"] + assert sizes == {"id": 10, "name": 50, "age": 3} + + def 
test_variable_name_to_size_map_with_nan_length(self): + variables_with_nan = textwrap.dedent( + """\ + dataset,variable,label,type,length + patients.csv,id,Patient ID,integer, + """ + ) + _write(self._variables_path(), variables_with_nan) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + sizes = reader.read()["variable_name_to_size_map"] + assert sizes["id"] is None + + def test_dataset_name_lookup_is_case_insensitive(self): + """File name with mixed case should still match variables.csv entry.""" + variables_upper = VARIABLES_CSV.replace("patients.csv", "PATIENTS.CSV") + _write(self._variables_path(), variables_upper) + reader = DatasetCSVMetadataReader(str(self.data_path), "PATIENTS.CSV") + result = reader.read() + assert result["dataset_name"] == "PATIENTS" + + def test_returns_partial_meta_when_no_variables_file(self, caplog): + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + with caplog.at_level(logging.INFO, logger="validator"): + result = reader.read() + assert { + "dataset_name", + "dataset_modification_date", + "adam_info", + "dataset_length", + "first_record", + }.issubset(set(result.keys())) + assert result["first_record"] == {"age": "30", "id": "1", "name": "Alice"} + assert "No variables file found" in caplog.text + + def test_dataset_label_added_when_tables_csv_present(self): + _write(self._variables_path(), VARIABLES_CSV) + _write(Path(self.tmpdir) / "tables.csv", TABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + result = reader.read() + assert result.get("dataset_label") == "Patient Dataset" + + def test_no_dataset_label_when_tables_csv_absent(self): + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(self.data_path), "patients.csv") + result = reader.read() + assert "dataset_label" not in result + + def test_empty_data_file_returns_empty_first_record(self): + empty_data = Path(self.tmpdir) / "patients.csv" + _write(empty_data, "id,name,age\n") # header only + _write(self._variables_path(), VARIABLES_CSV) + reader = DatasetCSVMetadataReader(str(empty_data), "patients.csv") + result = reader.read() + assert result["dataset_length"] == 0 + assert result["first_record"] == {} + + +class TestCSVReaderFromFile: + """Tests for CSVReader.from_file()""" + + def setup_method(self): + self.tmpdir = tempfile.mkdtemp() + self.csv_path = Path(self.tmpdir) / "data.csv" + _write(self.csv_path, DATA_CSV) + + def test_returns_dataframe(self): + reader = CSVReader() + df = reader.from_file(str(self.csv_path)) + assert isinstance(df, pd.DataFrame) + + def test_correct_number_of_rows(self): + reader = CSVReader() + df = reader.from_file(str(self.csv_path)) + assert len(df) == 3 + + def test_correct_column_names(self): + reader = CSVReader() + df = reader.from_file(str(self.csv_path)) + assert list(df.columns) == ["id", "name", "age"] + + def test_correct_values(self): + reader = CSVReader() + df = reader.from_file(str(self.csv_path)) + assert df.iloc[0]["name"] == "Alice" + assert df.iloc[1]["age"] == 25 + + def test_no_index_column(self): + """index_col=False means the DataFrame index should be the default RangeIndex.""" + reader = CSVReader() + df = reader.from_file(str(self.csv_path)) + assert list(df.index) == [0, 1, 2] + + def test_empty_csv_returns_empty_dataframe(self): + empty_path = Path(self.tmpdir) / "empty.csv" + _write(empty_path, "id,name,age\n") + reader = CSVReader() + df = reader.from_file(str(empty_path)) + assert df.empty + assert list(df.columns) == 
["id", "name", "age"] + + +class TestCSVReaderToParquet: + + def setup_method(self): + self.tmpdir = tempfile.mkdtemp() + + def _write_csv(self, name: str, content: str) -> str: + p = Path(self.tmpdir) / name + _write(p, content) + return str(p) + + def test_returns_tuple_of_int_and_string(self): + path = self._write_csv("data.csv", DATA_CSV) + reader = CSVReader() + result = reader.to_parquet(path) + assert isinstance(result, tuple) + num_rows, parquet_path = result + assert isinstance(num_rows, int) + assert isinstance(parquet_path, str) + + def test_row_count_matches_csv(self): + path = self._write_csv("data.csv", DATA_CSV) + reader = CSVReader() + num_rows, _ = reader.to_parquet(path) + assert num_rows == 3 + + def test_parquet_file_is_created(self): + path = self._write_csv("data.csv", DATA_CSV) + reader = CSVReader() + _, parquet_path = reader.to_parquet(path) + assert Path(parquet_path).exists() + + def test_parquet_content_matches_source(self): + path = self._write_csv("data.csv", DATA_CSV) + reader = CSVReader() + _, parquet_path = reader.to_parquet(path) + df = pd.read_parquet(parquet_path) + assert list(df.columns) == ["id", "name", "age"] + assert len(df) == 3 + + def test_large_file_chunked_row_count(self): + """Simulate a file larger than the 20 000-row chunk boundary.""" + rows = "\n".join(f"{i},Name{i},{20 + i % 50}" for i in range(1, 25_001)) + content = "id,name,age\n" + rows + path = self._write_csv("large.csv", content) + reader = CSVReader() + num_rows, parquet_path = reader.to_parquet(path) + assert num_rows == 25_000 + df = pd.read_parquet(parquet_path) + assert len(df) == 25_000 + + def test_parquet_suffix(self): + path = self._write_csv("data.csv", DATA_CSV) + reader = CSVReader() + _, parquet_path = reader.to_parquet(path) + assert parquet_path.endswith(".parquet") + + def test_empty_csv_returns_zero_rows(self): + path = self._write_csv("empty.csv", "id,name,age\n") + reader = CSVReader() + num_rows, parquet_path = reader.to_parquet(path) + assert num_rows == 0