diff --git a/cdisc_rules_engine/services/data_readers/json_reader.py b/cdisc_rules_engine/services/data_readers/json_reader.py index fb80530f7..8d7586151 100644 --- a/cdisc_rules_engine/services/data_readers/json_reader.py +++ b/cdisc_rules_engine/services/data_readers/json_reader.py @@ -10,7 +10,10 @@ def from_file(self, file_path): try: with open(file_path, "r", encoding=self.encoding) as fp: json_data = load(fp) + self._detect_whitespace_in_dataset_keys(json_data, file_path) return json_data + except InvalidJSONFormat: + raise except (UnicodeDecodeError, UnicodeError) as e: raise InvalidJSONFormat( f"\n Error reading JSON from: {file_path}" @@ -23,5 +26,21 @@ def from_file(self, file_path): f"\n {type(e).__name__}: {e}" ) + def _detect_whitespace_in_dataset_keys(self, json_data: dict, file_path: str): + offending = [] + for dataset in json_data.get("datasets", []): + dataset_name = dataset.get("filename") + records = dataset.get("records", {}) + for key in records: + if key != key.strip(): + offending.append(f" dataset '{dataset_name}': {repr(key)}") + if offending: + offending_list = "\n".join(offending) + raise InvalidJSONFormat( + f"\n Error reading JSON from: {file_path}" + f"\n The following column keys contain leading/trailing whitespace:" + f"\n{offending_list}" + ) + def read(self, data): pass diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index 9127aeb89..3b3e2224a 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -105,6 +105,12 @@ def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: false_values=["False", "FALSE", "false", False, 0, "0"], ) dataframe = dataframe.replace({nan: None}) + offending = [col for col in dataframe.columns if col != col.strip()] + if offending: + raise ExcelTestDataError( + f"Sheet '{dataset_name}' has column headers with leading/trailing whitespace: " + f"{[repr(c) for c in offending]}." + ) dataset = PandasDataset(dataframe) return dataset diff --git a/tests/resources/Datasets_whitespace.json b/tests/resources/Datasets_whitespace.json new file mode 100644 index 000000000..f467e059a --- /dev/null +++ b/tests/resources/Datasets_whitespace.json @@ -0,0 +1,470 @@ +{ + "datasets": [ + { + "filename": "ex.xpt", + "label": "Exposure", + "domain": "EX", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 10 + }, + { + "name": "DOMAIN ", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "USUBJID", + "label": "Unique Subject Identifier ", + "type": "Char", + "length": 20 + }, + { + "name": "EXSEQ", + "label": "Sequence Number", + "type": "Num", + "length": 8 + }, + { + "name": "EXTRT", + "label": "Name of Actual Treatment ", + "type": "Char", + "length": 20 + }, + { + "name": "EXDOSE", + "label": "Dose per Administration ", + "type": "Num", + "length": 8 + }, + { + "name": "EXDOSU", + "label": "Dose Units ", + "type": "Char", + "length": 20 + }, + { + "name": "EXDOSFRM", + "label": "Dose Form ", + "type": "Char", + "length": 20 + }, + { + "name": "EXDOSFRQ", + "label": "Dosing Frequency Per Interval ", + "type": "Char", + "length": 20 + }, + { + "name": "EXROUTE", + "label": "Route of Administration ", + "type": "Char", + "length": 20 + }, + { + "name": "EXLOT", + "label": "Lot Number ", + "type": "Char", + "length": 20 + }, + { + "name": "RPHASE", + "label": "Reproductive Phase ", + "type": "Char", + "length": 20 + }, + { + "name": "RPPLDY", + "label": "Planned Repro Phase Day", + "type": "Num", + "length": 8 + }, + { + "name": "RPPLSTDY", + "label": "Planned Repro Phase Day of Obs Start ", + "type": "Num", + "length": 8 + }, + { + "name": "RPPLENDY", + "label": "Planned Repro Phase Day of Obs End ", + "type": "Num", + "length": 8 + }, + { + "name": "EXRPDY", + "label": "Actual Repro Phase Day of Observation ", + "type": "Num", + "length": 8 + }, + { + "name": "EXRPSTDY", + "label": "Actual Repro Phase Day of Obs Start", + "type": "Num", + "length": 8 + }, + { + "name": "EXRPENDY", + "label": "Actual Repro Phase Day of Obs End", + "type": "Num", + "length": 8 + }, + { + "name": "EXSTDTC", + "label": "Start Date/Time of Treatment ", + "type": "Char", + "length": 20 + } + ], + "records": { + "STUDYID": [ + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 " + ], + "DOMAIN ": ["EX", "EX", "EX", "EX", "EX", "EX"], + "USUBJID": [ + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 " + ], + "EXSEQ": [1, 2, 3, 4, 5, 6], + "EXTRT": ["DRUG A", "DRUG A", "DRUG A", "DRUG A", "DRUG A", "DRUG A"], + "EXDOSE": [1, 1, 1, 1, 1, 1], + "EXDOSU": ["mg", "mg", "mg", "mg", "mg", "mg"], + "EXDOSFRM": [ + "POWDER", + "POWDER", + "POWDER", + "POWDER", + "POWDER", + "POWDER" + ], + "EXDOSFRQ": ["", "", "", "", "", ""], + "EXROUTE": ["ORAL", "ORAL", "ORAL", "ORAL", "ORAL", "ORAL"], + "EXLOT": ["", "", "", "", "", ""], + "RPHASE": [ + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION " + ], + "RPPLDY": [1.01, 2, 3, 4, -5, 0], + "RPPLSTDY": [1, 2.6, 3, 4, 5, 6], + "RPPLENDY": [1, 2, 1.1, 4, 5, 6], + "EXRPDY": [1, 2, 3, -2.2, 5, 6], + "EXRPSTDY": [1, 2, 3, 4, 2.3, 6], + "EXRPENDY": [1, 2, 3, 4, 5, -4.1], + "EXSTDTC": [ + "2023-01-01", + "2023-01-02", + "2023-01-03", + "2023-01-04", + "2023-01-05", + "2023-01-06" + ] + } + }, + { + "filename": "lb.xpt", + "label": "Laboratory", + "domain": "LB", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 10 + }, + { + "name": "DOMAIN ", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "USUBJID", + "label": "Unique Subject Identifier ", + "type": "Char", + "length": 20 + }, + { + "name": "LBSEQ", + "label": "Sequence Number", + "type": "Num", + "length": 8 + }, + { + "name": "LBTESTCD", + "label": "Lab Test or Examination Short Name ", + "type": "Char", + "length": 8 + }, + { + "name": "LBTEST", + "label": "Lab Test or Examination Name ", + "type": "Char", + "length": 40 + }, + { + "name": "LBORRES", + "label": "Result or Findings as Collected ", + "type": "Char", + "length": 20 + }, + { + "name": "RPHASE", + "label": "Reproductive Phase ", + "type": "Char", + "length": 20 + }, + { + "name": "RPPLDY", + "label": "Planned Repro Phase Day", + "type": "Num", + "length": 8 + }, + { + "name": "RPPLSTDY", + "label": "Planned Repro Phase Day of Obs Start ", + "type": "Num", + "length": 8 + }, + { + "name": "RPPLENDY", + "label": "Planned Repro Phase Day of Obs End ", + "type": "Num", + "length": 8 + }, + { + "name": "LBRPDY", + "label": "Actual Repro Phase Day of Observation ", + "type": "Num", + "length": 8 + }, + { + "name": "LBRPSTDY", + "label": "Actual Repro Phase Day of Obs Start", + "type": "Num", + "length": 8 + }, + { + "name": "LBRPENDY", + "label": "Actual Repro Phase Day of Obs End", + "type": "Num", + "length": 8 + }, + { + "name": "LBSTDTC", + "label": "Start Date/Time of Treatment ", + "type": "Char", + "length": 20 + } + ], + "records": { + "STUDYID ": [ + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 " + ], + "DOMAIN ": ["LB", "LB", "LB", "LB", "LB", "LB"], + "USUBJID": [ + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 " + ], + "LBSEQ": [1, 2, 3, 4, 5, 6], + "LBTESTCD": ["GLUC", "GLUC", "GLUC", "GLUC", "GLUC", "GLUC"], + "LBTEST": [ + "Glucose", + "Glucose", + "Glucose", + "Glucose", + "Glucose", + "Glucose" + ], + "LBORRES": [ + "POSITIVE", + "POSITIVE", + "POSITIVE", + "POSITIVE", + "POSITIVE", + "POSITIVE" + ], + "RPHASE": [ + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION " + ], + "RPPLDY": [1.01, 2, 3, 4, -5, -5], + "RPPLSTDY": [1, 2.6, 3, 4, 5, 5], + "RPPLENDY": [1, 2, 1.1, 4, 5, 5], + "LBRPDY": [1, 2, 3, -2.2, 5, 5], + "LBRPSTDY": [1, 2, 3, 4, 2.3, 5], + "LBRPENDY": [1, 2, 3, 4, 5, 5.7], + "LBSTDTC": [ + "2023-01-01", + "2023-01-02", + "2023-01-03", + "2023-01-04", + "2023-01-05", + "2023-01-06" + ] + } + }, + { + "filename": "ds.xpt", + "label": "Disposition", + "domain": "DS", + "variables": [ + { + "name": "STUDYID", + "label": "Study Identifier", + "type": "Char", + "length": 10 + }, + { + "name": "DOMAIN ", + "label": "Domain Abbreviation", + "type": "Char", + "length": 2 + }, + { + "name": "USUBJID", + "label": "Unique Subject Identifier ", + "type": "Char", + "length": 20 + }, + { + "name": "DSSEQ", + "label": "Sequence Number", + "type": "Num", + "length": 8 + }, + { + "name": "DSTERM", + "label": "Reported Term for the Disposition Event ", + "type": "Char", + "length": 100 + }, + { + "name": "RPHASE", + "label": "Reproductive Phase ", + "type": "Char", + "length": 20 + }, + { + "name": "RPPLDY", + "label": "Planned Repro Phase Day", + "type": "Num", + "length": 8 + }, + { + "name": "RPPLSTDY", + "label": "Planned Repro Phase Day of Obs Start ", + "type": "Num", + "length": 8 + }, + { + "name": "RPPLENDY", + "label": "Planned Repro Phase Day of Obs End ", + "type": "Num", + "length": 8 + }, + { + "name": "DSRPDY", + "label": "Actual Repro Phase Day of Observation ", + "type": "Num", + "length": 8 + }, + { + "name": "DSRPSTDY", + "label": "Actual Repro Phase Day of Obs Start", + "type": "Num", + "length": 8 + }, + { + "name": "DSRPENDY", + "label": "Actual Repro Phase Day of Obs End", + "type": "Num", + "length": 8 + }, + { + "name": "DSSTDTC", + "label": "Start Date/Time of Treatment ", + "type": "Char", + "length": 20 + } + ], + "records": { + "STUDYID": [ + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 ", + "EFD111 " + ], + "DOMAIN ": ["DS", "DS", "DS", "DS", "DS", "DS"], + "USUBJID": [ + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 ", + "EFD111-0001 " + ], + "DSSEQ": [1, 2, 3, 4, 5, 6], + "DSTERM": ["aa", "bb", "cc", "dd", "ee", "ee"], + "RPHASE": [ + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION ", + "GESTATION " + ], + "RPPLDY": [1.01, 2, 3, 4, -5, -6], + "RPPLSTDY": [1, 2.6, 3, 4, 5, 6], + "RPPLENDY": [1, 2, 1.1, 4, 5, 6], + "DSRPDY": [1, 2, 3, -2.2, 5, 6], + "DSRPSTDY": [1, 2, 3, 4, 2.3, 6], + "DSRPENDY": [1, 2, 3, 4, 5, 6.1], + "DSSTDTC": [ + "2023-01-01", + "2023-01-02", + "2023-01-03", + "2023-01-04", + "2023-01-05", + "2023-01-06" + ] + } + } + ], + "standard": { + "product": "sendig", + "version": "3-1" + }, + "codelists": [] +} diff --git a/tests/resources/Datasets_whitespace.xlsx b/tests/resources/Datasets_whitespace.xlsx new file mode 100644 index 000000000..240199496 Binary files /dev/null and b/tests/resources/Datasets_whitespace.xlsx differ diff --git a/tests/unit/test_json_reader.py b/tests/unit/test_json_reader.py new file mode 100644 index 000000000..961505ae9 --- /dev/null +++ b/tests/unit/test_json_reader.py @@ -0,0 +1,32 @@ +import os +import pytest +from cdisc_rules_engine.services.data_readers.json_reader import JSONReader +from cdisc_rules_engine.exceptions.custom_exceptions import InvalidJSONFormat + + +def test_json_reader_whitespace_error(): + test_dataset_path = ( + f"{os.path.dirname(__file__)}/../resources/Datasets_whitespace.json" + ) + with pytest.raises(InvalidJSONFormat) as exc_info: + JSONReader(encoding="utf-8").from_file(test_dataset_path) + assert "leading/trailing whitespace" in str(exc_info.value.message) + + +def test_whitespace_from_record_keys(): + test_dataset_path = ( + f"{os.path.dirname(__file__)}/../resources/Datasets_whitespace.json" + ) + with pytest.raises(InvalidJSONFormat) as exc_info: + JSONReader(encoding="utf-8").from_file(test_dataset_path) + assert "leading/trailing whitespace" in str(exc_info.value.message) + + +def test_json_reader_clean_file_returns_dict(): + test_dataset_path = f"{os.path.dirname(__file__)}/../resources/CG0027-positive.json" + json_data = JSONReader(encoding="utf-8").from_file(test_dataset_path) + assert isinstance(json_data, dict) + assert "datasets" in json_data + assert len(json_data["datasets"]) > 0 + assert json_data["datasets"][0]["domain"] == "AE" + assert len(json_data["datasets"][0]["records"]["AESEQ"]) == 2 diff --git a/tests/unit/test_services/test_data_service/test_excel_data_service.py b/tests/unit/test_services/test_data_service/test_excel_data_service.py index a75ca30d5..1e6623aad 100644 --- a/tests/unit/test_services/test_data_service/test_excel_data_service.py +++ b/tests/unit/test_services/test_data_service/test_excel_data_service.py @@ -15,6 +15,13 @@ from cdisc_rules_engine.models.dataset import PandasDataset +@pytest.fixture(autouse=True) +def reset_excel_data_service(): + ExcelDataService._instance = None + yield + ExcelDataService._instance = None + + @pytest.mark.parametrize( "dataset_name", ("ecaa.xpt", "ecbb.xpt", "suppec.xpt"), @@ -33,6 +40,28 @@ def test_get_dataset(dataset_name): assert isinstance(data, PandasDataset) +@pytest.mark.parametrize( + "dataset_name", + ("ex.xpt", "lb.xpt", "ds.xpt"), +) +def test_whitespace_get_dataset_raises(dataset_name): + dataset_path = ( + f"{os.path.dirname(__file__)}/../../../resources/Datasets_whitespace.xlsx" + ) + mock_cache = MagicMock() + mock_cache.get_dataset.return_value = None + data_service = ExcelDataService.get_instance( + config=ConfigService(), + cache_service=mock_cache, + dataset_implementation=PandasDataset, + dataset_path=dataset_path, + ) + with pytest.raises(ExcelTestDataError) as exc_info: + data_service.get_dataset(dataset_name=dataset_name) + assert "leading/trailing whitespace" in str(exc_info.value.message) + assert any(col in exc_info.value.message for col in ["STUDYID", "DOMAIN", "EXSEQ"]) + + @pytest.mark.parametrize( "expected_result", (