diff --git a/cdisc_rules_engine/exceptions/__init__.py b/cdisc_rules_engine/exceptions/__init__.py
index e69de29bb..509279a2f 100644
--- a/cdisc_rules_engine/exceptions/__init__.py
+++ b/cdisc_rules_engine/exceptions/__init__.py
@@ -0,0 +1,51 @@
+from .custom_exceptions import (
+    EngineError,
+    DatasetNotFoundError,
+    ReferentialIntegrityError,
+    MissingDataError,
+    RuleExecutionError,
+    RuleFormatError,
+    InvalidMatchKeyError,
+    VariableMetadataNotFoundError,
+    DomainNotFoundError,
+    DomainNotFoundInDefineXMLError,
+    InvalidDatasetFormat,
+    InvalidJSONFormat,
+    NumberOfAttemptsExceeded,
+    InvalidDictionaryVariable,
+    UnsupportedDictionaryType,
+    FailedSchemaValidation,
+    SchemaNotFoundError,
+    InvalidSchemaProvidedError,
+    PreprocessingError,
+    OperationError,
+    DatasetBuilderError,
+    DateTimeParserError,
+    UnsupportedXptFormatError,
+)
+
+__all__ = [
+    "EngineError",
+    "DatasetNotFoundError",
+    "ReferentialIntegrityError",
+    "MissingDataError",
+    "RuleExecutionError",
+    "RuleFormatError",
+    "InvalidMatchKeyError",
+    "VariableMetadataNotFoundError",
+    "DomainNotFoundError",
+    "DomainNotFoundInDefineXMLError",
+    "InvalidDatasetFormat",
+    "InvalidJSONFormat",
+    "NumberOfAttemptsExceeded",
+    "InvalidDictionaryVariable",
+    "UnsupportedDictionaryType",
+    "FailedSchemaValidation",
+    "SchemaNotFoundError",
+    "InvalidSchemaProvidedError",
+    "PreprocessingError",
+    "OperationError",
+    "DatasetBuilderError",
+    "DateTimeParserError",
+    "UnsupportedXptFormatError",
+]
diff --git a/cdisc_rules_engine/exceptions/custom_exceptions.py b/cdisc_rules_engine/exceptions/custom_exceptions.py
index 669cb719c..94436108a 100644
--- a/cdisc_rules_engine/exceptions/custom_exceptions.py
+++ b/cdisc_rules_engine/exceptions/custom_exceptions.py
@@ -125,3 +125,10 @@ class DatasetBuilderError(EngineError):
 class DateTimeParserError(EngineError):
     code = 400
     description = "Failure to parse a datetime string"
+
+
+class UnsupportedXptFormatError(EngineError):
+    code = 400
+    description = (
+        "Unsupported XPT (SAS Transport) format. Only Transport v5 is supported."
+    )
diff --git a/cdisc_rules_engine/services/data_readers/xpt_reader.py b/cdisc_rules_engine/services/data_readers/xpt_reader.py
index 11ec4d936..d20e1e85d 100644
--- a/cdisc_rules_engine/services/data_readers/xpt_reader.py
+++ b/cdisc_rules_engine/services/data_readers/xpt_reader.py
@@ -7,22 +7,50 @@
 from cdisc_rules_engine.interfaces import (
     DataReaderInterface,
 )
+from cdisc_rules_engine.exceptions import UnsupportedXptFormatError
+
+# Substring of the ValueError message pd.read_sas produces when it does not
+# recognise the header record as XPORT; the Transport v8 fixture used by the
+# unit tests triggers it.
+_NOT_XPORT_HEADER_MESSAGE = "Header record is not an XPORT file"
 
 
 class XPTReader(DataReaderInterface):
+    def _read_sas(self, source, **kwargs):
+        """Call ``pd.read_sas``, mapping a rejected header to a domain error.
+
+        :param source: file path or binary buffer handed to ``pd.read_sas``.
+        :param kwargs: keyword arguments forwarded to ``pd.read_sas``; the
+            methods of this class pass ``format`` and ``chunksize``.
+        :return: the result of ``pd.read_sas`` (a DataFrame, or an iterable
+            of chunks when ``chunksize`` is given).
+        :raises UnsupportedXptFormatError: when pandas reports that the
+            header record is not an XPORT file.
+        """
+        try:
+            return pd.read_sas(source, encoding=self.encoding, **kwargs)
+        except ValueError as exc:
+            if _NOT_XPORT_HEADER_MESSAGE in str(exc):
+                # Reuse the class-level description so the user-facing text
+                # is defined in one place only.
+                raise UnsupportedXptFormatError(
+                    UnsupportedXptFormatError.description
+                ) from exc
+            # Any other ValueError is passed through untouched.
+            raise
 
     def read(self, data):
-        df = pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding)
+        df = self._read_sas(BytesIO(data), format="xport")
         df = self._format_floats(df)
         return df
 
     def _read_pandas(self, file_path):
-        data = pd.read_sas(file_path, format="xport", encoding=self.encoding)
+        data = self._read_sas(file_path, format="xport")
         return PandasDataset(self._format_floats(data))
 
-    def to_parquet(self, file_path: str) -> str:
+    def to_parquet(self, file_path: str) -> tuple[int, str]:
         temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")
-        dataset = pd.read_sas(file_path, chunksize=20000, encoding=self.encoding)
+        dataset = self._read_sas(file_path, chunksize=20000)
         created = False
         num_rows = 0
         for chunk in dataset:
diff --git a/tests/resources/test_dataset_sas_v8.xpt b/tests/resources/test_dataset_sas_v8.xpt
new file mode 100644
index 000000000..4015cd44c
Binary files /dev/null and b/tests/resources/test_dataset_sas_v8.xpt differ
diff --git a/tests/unit/test_xpt_reader.py b/tests/unit/test_xpt_reader.py
index 6a3af07ea..843dda4f2 100644
--- a/tests/unit/test_xpt_reader.py
+++ b/tests/unit/test_xpt_reader.py
@@ -1,6 +1,9 @@
 import os
 
+import pytest
+
 from cdisc_rules_engine.services.data_readers.xpt_reader import XPTReader
+from cdisc_rules_engine.exceptions import UnsupportedXptFormatError
 
 
 def test_read():
@@ -17,3 +20,34 @@ def test_read():
             Verify that the rounding of incredibly small values to 0 is applied.
             """
             assert value == 0 or abs(value) > 10**-16
+
+
+def test_read_xpt_v5_no_error():
+    """Verify that an XPT v5 file can be read without errors."""
+    test_dataset_path: str = os.path.join(
+        os.path.dirname(__file__), "..", "resources", "test_dataset.xpt"
+    )
+    with open(test_dataset_path, "rb") as f:
+        data = f.read()
+
+    reader = XPTReader()
+    df = reader.read(data)
+    assert not df.empty
+
+
+def test_read_xpt_v8_unsupported_error():
+    """Verify that XPT v8 format raises UnsupportedXptFormatError."""
+    test_dataset_path: str = os.path.join(
+        os.path.dirname(__file__), "..", "resources", "test_dataset_sas_v8.xpt"
+    )
+    with open(test_dataset_path, "rb") as f:
+        data = f.read()
+
+    reader = XPTReader()
+    expected_msg = (
+        "Unsupported XPT (SAS Transport) format. Only Transport v5 is supported."
+    )
+    with pytest.raises(UnsupportedXptFormatError) as exc_info:
+        reader.read(data)
+
+    assert expected_msg in str(exc_info.value)