From 5cece5e6974351cf2761945123b2a4581fede4a0 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Fri, 27 Feb 2026 11:39:44 +0100 Subject: [PATCH 1/4] Add custom exceptions for unsupported XPT format and enhance error handling in XPTReader --- cdisc_rules_engine/exceptions/__init__.py | 51 +++++++++++++++++++ .../exceptions/custom_exceptions.py | 7 +++ .../services/data_readers/xpt_reader.py | 17 +++++++ 3 files changed, 75 insertions(+) diff --git a/cdisc_rules_engine/exceptions/__init__.py b/cdisc_rules_engine/exceptions/__init__.py index e69de29bb..509279a2f 100644 --- a/cdisc_rules_engine/exceptions/__init__.py +++ b/cdisc_rules_engine/exceptions/__init__.py @@ -0,0 +1,51 @@ +from .custom_exceptions import ( + EngineError, + DatasetNotFoundError, + ReferentialIntegrityError, + MissingDataError, + RuleExecutionError, + RuleFormatError, + InvalidMatchKeyError, + VariableMetadataNotFoundError, + DomainNotFoundError, + DomainNotFoundInDefineXMLError, + InvalidDatasetFormat, + InvalidJSONFormat, + NumberOfAttemptsExceeded, + InvalidDictionaryVariable, + UnsupportedDictionaryType, + FailedSchemaValidation, + SchemaNotFoundError, + InvalidSchemaProvidedError, + PreprocessingError, + OperationError, + DatasetBuilderError, + DateTimeParserError, + UnsupportedXptFormatError, +) + +__all__ = [ + "EngineError", + "DatasetNotFoundError", + "ReferentialIntegrityError", + "MissingDataError", + "RuleExecutionError", + "RuleFormatError", + "InvalidMatchKeyError", + "VariableMetadataNotFoundError", + "DomainNotFoundError", + "DomainNotFoundInDefineXMLError", + "InvalidDatasetFormat", + "InvalidJSONFormat", + "NumberOfAttemptsExceeded", + "InvalidDictionaryVariable", + "UnsupportedDictionaryType", + "FailedSchemaValidation", + "SchemaNotFoundError", + "InvalidSchemaProvidedError", + "PreprocessingError", + "OperationError", + "DatasetBuilderError", + "DateTimeParserError", + "UnsupportedXptFormatError", +] diff --git a/cdisc_rules_engine/exceptions/custom_exceptions.py b/cdisc_rules_engine/exceptions/custom_exceptions.py index e3e5b79d5..b82531440 100644 --- a/cdisc_rules_engine/exceptions/custom_exceptions.py +++ b/cdisc_rules_engine/exceptions/custom_exceptions.py @@ -105,3 +105,10 @@ class DatasetBuilderError(EngineError): class DateTimeParserError(EngineError): code = 400 description = "Failure to parse a datetime string" + + +class UnsupportedXptFormatError(EngineError): + code = 400 + description = ( + "Unsupported XPT (SAS Transport) format. Only Transport v5 is supported." + ) diff --git a/cdisc_rules_engine/services/data_readers/xpt_reader.py b/cdisc_rules_engine/services/data_readers/xpt_reader.py index 11ec4d936..5a9cc4c78 100644 --- a/cdisc_rules_engine/services/data_readers/xpt_reader.py +++ b/cdisc_rules_engine/services/data_readers/xpt_reader.py @@ -7,20 +7,37 @@ from cdisc_rules_engine.interfaces import ( DataReaderInterface, ) +from cdisc_rules_engine.exceptions import UnsupportedXptFormatError class XPTReader(DataReaderInterface): + def _ensure_supported_transport_version(self, data): + try: + pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding) + except Exception as exc: + raise UnsupportedXptFormatError( + f"Unsupported XPT (SAS Transport) format. Only Transport v5 is supported. Original error: {exc}" + ) from exc + def read(self, data): + self._ensure_supported_transport_version(data) df = pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding) df = self._format_floats(df) return df def _read_pandas(self, file_path): + with open(file_path, "rb") as f: + raw = f.read(4096) + self._ensure_supported_transport_version(raw) data = pd.read_sas(file_path, format="xport", encoding=self.encoding) return PandasDataset(self._format_floats(data)) def to_parquet(self, file_path: str) -> str: + with open(file_path, "rb") as f: + raw = f.read(4096) + self._ensure_supported_transport_version(raw) + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") dataset = pd.read_sas(file_path, chunksize=20000, encoding=self.encoding) created = False From db145d2514e79659548e08c712afc56f4cbc63f0 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 4 Mar 2026 13:20:00 +0100 Subject: [PATCH 2/4] Refactor XPTReader to streamline error handling for unsupported XPT formats --- .../services/data_readers/xpt_reader.py | 30 +++++++++---------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/cdisc_rules_engine/services/data_readers/xpt_reader.py b/cdisc_rules_engine/services/data_readers/xpt_reader.py index 5a9cc4c78..20c5d7c28 100644 --- a/cdisc_rules_engine/services/data_readers/xpt_reader.py +++ b/cdisc_rules_engine/services/data_readers/xpt_reader.py @@ -11,35 +11,33 @@ class XPTReader(DataReaderInterface): - - def _ensure_supported_transport_version(self, data): + def read(self, data): try: - pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding) + df = pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding) except Exception as exc: raise UnsupportedXptFormatError( f"Unsupported XPT (SAS Transport) format. Only Transport v5 is supported. Original error: {exc}" ) from exc - - def read(self, data): - self._ensure_supported_transport_version(data) - df = pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding) df = self._format_floats(df) return df def _read_pandas(self, file_path): - with open(file_path, "rb") as f: - raw = f.read(4096) - self._ensure_supported_transport_version(raw) - data = pd.read_sas(file_path, format="xport", encoding=self.encoding) + try: + data = pd.read_sas(file_path, format="xport", encoding=self.encoding) + except Exception as exc: + raise UnsupportedXptFormatError( + f"Unsupported XPT (SAS Transport) format. Only Transport v5 is supported. Original error: {exc}" + ) from exc return PandasDataset(self._format_floats(data)) def to_parquet(self, file_path: str) -> str: - with open(file_path, "rb") as f: - raw = f.read(4096) - self._ensure_supported_transport_version(raw) - temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") - dataset = pd.read_sas(file_path, chunksize=20000, encoding=self.encoding) + try: + dataset = pd.read_sas(file_path, chunksize=20000, encoding=self.encoding) + except Exception as exc: + raise UnsupportedXptFormatError( + f"Unsupported XPT (SAS Transport) format. Only Transport v5 is supported. Original error: {exc}" + ) from exc created = False num_rows = 0 for chunk in dataset: From 82d605d6d5c2b0a276c5f363c9103627d1e2c962 Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Wed, 4 Mar 2026 13:32:05 +0100 Subject: [PATCH 3/4] Refactor XPTReader to improve SAS data reading and error handling --- .../services/data_readers/xpt_reader.py | 23 +++++++------------ 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/cdisc_rules_engine/services/data_readers/xpt_reader.py b/cdisc_rules_engine/services/data_readers/xpt_reader.py index 20c5d7c28..9f084ffbc 100644 --- a/cdisc_rules_engine/services/data_readers/xpt_reader.py +++ b/cdisc_rules_engine/services/data_readers/xpt_reader.py @@ -11,33 +11,26 @@ class XPTReader(DataReaderInterface): - def read(self, data): + def _read_sas(self, source, **kwargs): try: - df = pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding) + return pd.read_sas(source, encoding=self.encoding, **kwargs) except Exception as exc: raise UnsupportedXptFormatError( f"Unsupported XPT (SAS Transport) format. Only Transport v5 is supported. Original error: {exc}" ) from exc + + def read(self, data): + df = self._read_sas(BytesIO(data), format="xport") df = self._format_floats(df) return df def _read_pandas(self, file_path): - try: - data = pd.read_sas(file_path, format="xport", encoding=self.encoding) - except Exception as exc: - raise UnsupportedXptFormatError( - f"Unsupported XPT (SAS Transport) format. Only Transport v5 is supported. Original error: {exc}" - ) from exc + data = self._read_sas(file_path, format="xport") return PandasDataset(self._format_floats(data)) - def to_parquet(self, file_path: str) -> str: + def to_parquet(self, file_path: str) -> tuple[int, str]: temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") - try: - dataset = pd.read_sas(file_path, chunksize=20000, encoding=self.encoding) - except Exception as exc: - raise UnsupportedXptFormatError( - f"Unsupported XPT (SAS Transport) format. Only Transport v5 is supported. Original error: {exc}" - ) from exc + dataset = self._read_sas(file_path, chunksize=20000) created = False num_rows = 0 for chunk in dataset: From 3b425bcb0ce3b03926b43fa60ecc2139334c29af Mon Sep 17 00:00:00 2001 From: alexfurmenkov Date: Mon, 9 Mar 2026 13:04:53 +0100 Subject: [PATCH 4/4] Enhance error handling in XPTReader for unsupported XPT formats and add tests for XPT v5 and v8 --- .../services/data_readers/xpt_reader.py | 11 +++--- tests/resources/test_dataset_sas_v8.xpt | Bin 0 -> 21920 bytes tests/unit/test_xpt_reader.py | 34 ++++++++++++++++++ 3 files changed, 41 insertions(+), 4 deletions(-) create mode 100644 tests/resources/test_dataset_sas_v8.xpt diff --git a/cdisc_rules_engine/services/data_readers/xpt_reader.py b/cdisc_rules_engine/services/data_readers/xpt_reader.py index 9f084ffbc..d20e1e85d 100644 --- a/cdisc_rules_engine/services/data_readers/xpt_reader.py +++ b/cdisc_rules_engine/services/data_readers/xpt_reader.py @@ -14,10 +14,13 @@ class XPTReader(DataReaderInterface): def _read_sas(self, source, **kwargs): try: return pd.read_sas(source, encoding=self.encoding, **kwargs) - except Exception as exc: - raise UnsupportedXptFormatError( - f"Unsupported XPT (SAS Transport) format. Only Transport v5 is supported. Original error: {exc}" - ) from exc + except ValueError as exc: + message = str(exc) + if "Header record is not an XPORT file" in message: + raise UnsupportedXptFormatError( + "Unsupported XPT (SAS Transport) format. Only Transport v5 is supported." + ) from exc + raise def read(self, data): df = self._read_sas(BytesIO(data), format="xport") diff --git a/tests/resources/test_dataset_sas_v8.xpt b/tests/resources/test_dataset_sas_v8.xpt new file mode 100644 index 0000000000000000000000000000000000000000..4015cd44c26153ebcb0bbf1b00620a6e93e7635f GIT binary patch literal 21920 zcmd5^Td&+m5%vjek_}&pq5ud+amg#4{dj&&f`n0wfRcutb1X z;sGgt04d_S@DvGN7x994WZ(7;@e}qZ>`$P&t9$I}arrVA={Sk1+v6_R*VR>BUER*a z=bj(4*dIl)_by&x;GPW_W0U-EH}PtfUW_F?ftG6&*fG|-)#{1+KmO&z&MiljiZ7>g z>bp+wc58UM)xF(nnf2mU_3y4>Qh2BSR03Z6jlyg9bPw2_meEIwpO=j0DfD11ewv%4 zS-)|=924}j$5X!1uV#Olb(8Cl9g7yfZ#cs_?e^Yd}?n zG3$b5Sy!#2BwcvRz+;*gy}Rfhw3e6tP1mdVSeNvZIVxniU|sN{DGz3hX{vcwQO!v| zr|MU$_*fTp%N!N5T(&;9NEYrL=od{j?<%S}>E~4ag(^OU{c^^oypNcNNvF^mD3yql!;qzX(}wSQojU`tels z3sTLyiW0I^@*&5|k#z}hgekmO(<&MC?}RMhui``M=*YS(+m57;!W3xTi;(39njgi7 z)X|akJoi1A;)ERpReVZhxdv@P6wxn8 zHSa1)$P!lZDUszxn9HIOyjasJ8T9XjEYm7JC3f_ZY%A>uWZK=1W}2Ut*`%Z$UAA6^ zw*DD3(9~>8)ro$Is`!xoa>05)RCmryUqKAEPv7b%;$BTwCU%qH~e`t zg5kn6)orRMAi((v4n0?FCDFDl??iKLYBX2e)>6+k~+F#-Q>PMLs^iTZK*mT z%SW0Y*{&%6f_a8FC61*WFR2H#?j>c}%rpF3^P~6_`dyUeEr~C8{q5Z+pYt2;?d@2z^)G$G8%(={TGtt}dJMkOvT%*SuE# zkehMuuJJtJ%Euc@z^1q!Ag&&pE_pZ(>bm)od3jgAu^(KI5m%qhCp=!lqAirfhcQ_i z*dKHp?m=Hq5Z8dsnY+If`7k%AENoB#@r=LaAjW`>q^tU zO4lhcEt~VQ7>777L#fS`eenk<=!W?d^Vc(c+-^0+b(nY_=4C*W%LC|jS5v?7Pld}C zt1lV%ZuCgu4SF0l)0eTXL|k2|FS`Z72Y-SBy5B^)Aq=6fkMVKa5&18OH|jci8rSag zm)*^zcV9vQo9gR-3Rl~o2e3SOkB$v(r`_qUGAbGu#Jxt|#?)8yc{v{+x;MAO^{K+u zX_`MI?Q5s}Vfz?H-Sm*wlD8Ap}jk`af2tAA`Of7loH^>?MOkC5g>z+$1k`rG?? za^Kwa8%^VR$X@66Q%d)VK()pD=6%kv8@llQTNs={&W6c^b|S)#vNYns;tjEHz6V^& zjrJh>6pLdt;5}+3+h|<7jzgG@2djB`;J$?&uW7w;ILO8!B<1HJe76H%o-Kk=;8iS~ z_qLz1p0@!f-#TuK{bf~h|DkXxS`XJS!v%Q&*lG|*O;`}e{&)e2#0@6d4O(BOeKqwP zVjQBrM9ICca4DKeph|^)W5^jca~kK_4i;~~I8^hO7?JMJ6t4G?*2Sglt1Z73J;?mk z)E~ko$tsJ&nEL_ZDU3Hd$IgJz*9kW32wdl5972kthvQI`+~1*qn%2^nojdn+u)Js^ z`vPUbU*Qi`iGPf^G_8w^VKZlc@qB~z7bb3ucmuqJ%43U?`wj}IX^I=>xXJ@`Va4M16|V0z!!^gpyR5#dB;teOQsWTj zFWXi7MDBN+;d+3N_rz}3oyANXbO4tDU!qT=IG*q^wY<#XGW_AL`<0-j+<7(~XEgO2 zj}aH-5B+!nA04n@Ixqjy0S72;U%UEwp{2|Sw4R#b%H^X@d04gQmA$PjFKX(mq2GW! zKnwkbEV*jKN7K9vvRvp7SNSN1xp3}F95ILUhwvXf$a@aW5>C9bsP`P?pF?Jw(Umeu%Ulu3Y}m<)gdh zNAuQL()u#}VI%w6%mXmQ!xdc*dNb!w>&0hi+`a@ZAUqL*YZb(De*8^;acre*waQ z`>w*(DBc+Gr`Qb6Ew2tn@%bFKDl0w!JDh#x_fwp%>nL1fq;<#PGkn}`og07H5SIz` E|J8!Ck^lez literal 0 HcmV?d00001 diff --git a/tests/unit/test_xpt_reader.py b/tests/unit/test_xpt_reader.py index 6a3af07ea..843dda4f2 100644 --- a/tests/unit/test_xpt_reader.py +++ b/tests/unit/test_xpt_reader.py @@ -1,6 +1,9 @@ import os +import pytest + from cdisc_rules_engine.services.data_readers.xpt_reader import XPTReader +from cdisc_rules_engine.exceptions import UnsupportedXptFormatError def test_read(): @@ -17,3 +20,34 @@ def test_read(): Verify that the rounding of incredibly small values to 0 is applied. """ assert value == 0 or abs(value) > 10**-16 + + +def test_read_xpt_v5_no_error(): + """Verify that an XPT v5 file can be read without errors.""" + test_dataset_path: str = os.path.join( + os.path.dirname(__file__), "..", "resources", "test_dataset.xpt" + ) + with open(test_dataset_path, "rb") as f: + data = f.read() + + reader = XPTReader() + df = reader.read(data) + assert not df.empty + + +def test_read_xpt_v8_unsupported_error(): + """Verify that XPT v8 format raises UnsupportedXptFormatError.""" + test_dataset_path: str = os.path.join( + os.path.dirname(__file__), "..", "resources", "test_dataset_sas_v8.xpt" + ) + with open(test_dataset_path, "rb") as f: + data = f.read() + + reader = XPTReader() + expected_msg = ( + "Unsupported XPT (SAS Transport) format. Only Transport v5 is supported." + ) + with pytest.raises(UnsupportedXptFormatError) as exc_info: + reader.read(data) + + assert expected_msg in str(exc_info.value)