diff --git a/README.md b/README.md index 94032ee0d..845489536 100644 --- a/README.md +++ b/README.md @@ -206,6 +206,7 @@ This will show the list of validation options. "[████████████████████████████--------] 78%"is printed. -jcf, --jsonata-custom-functions Pair containing a variable name and a Path to directory containing a set of custom JSONata functions. Can be specified multiple times + -e, --encoding TEXT File encoding for reading datasets. If not specified, defaults to utf-8. Supported encodings: utf-8, utf-16, utf-32, cp1252, latin-1, etc. --help Show this message and exit. ``` @@ -241,6 +242,22 @@ CORE supports the following dataset file formats for validation: - Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset directory (`-d` or `-dp`). - If you point to a folder containing unsupported file formats, CORE will display an error message indicating which formats are supported. +#### File Encoding + +CORE defaults to utf-8 encoding when reading datasets. If your files use a different encoding, you must specify it using the `-e` or `--encoding` flag: + +```bash +python core.py validate -s sdtmig -v 3-4 -dp path/to/dataset.xpt -e cp1252 +``` + +The encoding name must be a valid Python codec name. Common encodings include: + +- `utf-8`, `utf-16`, `utf-32` - Unicode encodings +- `cp1252` - Windows-1252 (commonly used for files exported from Excel or SAS) +- `latin-1` - ISO-8859-1 + +If an invalid encoding is specified, CORE will display an error message with the supported encoding names. + #### Validate single rule ```bash diff --git a/cdisc_rules_engine/constants/__init__.py b/cdisc_rules_engine/constants/__init__.py index dd9b42e14..77d0a6aea 100644 --- a/cdisc_rules_engine/constants/__init__.py +++ b/cdisc_rules_engine/constants/__init__.py @@ -21,3 +21,5 @@ VALIDATION_FORMATS_MESSAGE = ( "SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)" ) + +DEFAULT_ENCODING: str = "utf-8" diff --git a/cdisc_rules_engine/interfaces/data_reader_interface.py b/cdisc_rules_engine/interfaces/data_reader_interface.py index bc2df4d9a..a21c43416 100644 --- a/cdisc_rules_engine/interfaces/data_reader_interface.py +++ b/cdisc_rules_engine/interfaces/data_reader_interface.py @@ -1,4 +1,5 @@ from cdisc_rules_engine.models.dataset import PandasDataset +from cdisc_rules_engine.constants import DEFAULT_ENCODING class DataReaderInterface: @@ -6,11 +7,15 @@ class DataReaderInterface: Interface for reading binary data from different file typs into pandas dataframes """ - def __init__(self, dataset_implementation=PandasDataset): + def __init__( + self, dataset_implementation=PandasDataset, encoding: str = DEFAULT_ENCODING + ): """ :param dataset_implementation DatasetInterface: The dataset type to return. + :param encoding str: The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8). """ self.dataset_implementation = dataset_implementation + self.encoding = encoding def read(self, data): """ diff --git a/cdisc_rules_engine/models/validation_args.py b/cdisc_rules_engine/models/validation_args.py index 03954bdba..bf1d60083 100644 --- a/cdisc_rules_engine/models/validation_args.py +++ b/cdisc_rules_engine/models/validation_args.py @@ -28,5 +28,6 @@ "jsonata_custom_functions", "max_report_rows", "max_errors_per_rule", + "encoding", ], ) diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 9df700be1..05666741d 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -87,6 +87,7 @@ def __init__( standard_substandard=self.standard_substandard, library_metadata=self.library_metadata, max_dataset_size=self.max_dataset_size, + encoding=kwargs.get("encoding"), ) self.dataset_implementation = data_service_factory.get_dataset_implementation() kwargs["dataset_implementation"] = self.dataset_implementation diff --git a/cdisc_rules_engine/services/data_readers/data_reader_factory.py b/cdisc_rules_engine/services/data_readers/data_reader_factory.py index 5fb718975..2df492a86 100644 --- a/cdisc_rules_engine/services/data_readers/data_reader_factory.py +++ b/cdisc_rules_engine/services/data_readers/data_reader_factory.py @@ -15,6 +15,7 @@ from cdisc_rules_engine.services.data_readers.json_reader import JSONReader from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.dataset import PandasDataset +from cdisc_rules_engine.constants import DEFAULT_ENCODING class DataReaderFactory(FactoryInterface): @@ -26,9 +27,15 @@ class DataReaderFactory(FactoryInterface): DataFormatTypes.USDM.value: JSONReader, } - def __init__(self, service_name: str = None, dataset_implementation=PandasDataset): + def __init__( + self, + service_name: str = None, + dataset_implementation=PandasDataset, + encoding: str = None, + ): self._default_service_name = service_name self.dataset_implementation = dataset_implementation + self.encoding = encoding @classmethod def register_service(cls, name: str, service: Type[DataReaderInterface]): @@ -47,7 +54,9 @@ def get_service(self, name: str = None, **kwargs) -> DataReaderInterface: """ service_name = name or self._default_service_name if service_name in self._reader_map: - return self._reader_map[service_name](self.dataset_implementation) + reader_class = self._reader_map[service_name] + encoding = self.encoding or DEFAULT_ENCODING + return reader_class(self.dataset_implementation, encoding=encoding) raise ValueError( f"Service name must be in {list(self._reader_map.keys())}, " f"given service name is {service_name}" diff --git a/cdisc_rules_engine/services/data_readers/dataset_json_reader.py b/cdisc_rules_engine/services/data_readers/dataset_json_reader.py index 937b7bf51..71e312528 100644 --- a/cdisc_rules_engine/services/data_readers/dataset_json_reader.py +++ b/cdisc_rules_engine/services/data_readers/dataset_json_reader.py @@ -15,14 +15,15 @@ class DatasetJSONReader(DataReaderInterface): + def get_schema(self) -> dict: - schema = JSONReader().from_file( + schema = JSONReader(encoding="utf-8").from_file( os.path.join("resources", "schema", "dataset.schema.json") ) return schema def read_json_file(self, file_path: str) -> dict: - return JSONReader().from_file(file_path) + return JSONReader(encoding=self.encoding).from_file(file_path) def _raw_dataset_from_file(self, file_path) -> pd.DataFrame: # Load Dataset-JSON Schema diff --git a/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py b/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py index 89f0c663b..48b998e40 100644 --- a/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py +++ b/cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py @@ -16,16 +16,23 @@ class DatasetNDJSONReader(DataReaderInterface): + def get_schema(self) -> dict: - schema = JSONReader().from_file( + schema = JSONReader(encoding="utf-8").from_file( os.path.join("resources", "schema", "dataset-ndjson-schema.json") ) return schema def read_json_file(self, file_path: str) -> dict: - with open(file_path, "r") as file: - lines = file.readlines() - return json.loads(lines[0]), [json.loads(line) for line in lines[1:]] + try: + with open(file_path, "r", encoding=self.encoding) as file: + lines = file.readlines() + return json.loads(lines[0]), [json.loads(line) for line in lines[1:]] + except (UnicodeDecodeError, UnicodeError) as e: + raise ValueError( + f"Could not decode NDJSON file {file_path} with {self.encoding} encoding: {e}. " + f"Please specify the correct encoding using the -e flag." + ) def _raw_dataset_from_file(self, file_path) -> pd.DataFrame: # Load Dataset-JSON Schema diff --git a/cdisc_rules_engine/services/data_readers/json_reader.py b/cdisc_rules_engine/services/data_readers/json_reader.py index f7928ae07..fb80530f7 100644 --- a/cdisc_rules_engine/services/data_readers/json_reader.py +++ b/cdisc_rules_engine/services/data_readers/json_reader.py @@ -8,9 +8,15 @@ class JSONReader(DataReaderInterface): def from_file(self, file_path): try: - with open(file_path, "rb") as fp: - json = load(fp) - return json + with open(file_path, "r", encoding=self.encoding) as fp: + json_data = load(fp) + return json_data + except (UnicodeDecodeError, UnicodeError) as e: + raise InvalidJSONFormat( + f"\n Error reading JSON from: {file_path}" + f"\n Failed to decode with {self.encoding} encoding: {e}" + f"\n Please specify the correct encoding using the -e flag." + ) except Exception as e: raise InvalidJSONFormat( f"\n Error reading JSON from: {file_path}" diff --git a/cdisc_rules_engine/services/data_readers/xpt_reader.py b/cdisc_rules_engine/services/data_readers/xpt_reader.py index b7176620a..11ec4d936 100644 --- a/cdisc_rules_engine/services/data_readers/xpt_reader.py +++ b/cdisc_rules_engine/services/data_readers/xpt_reader.py @@ -10,18 +10,19 @@ class XPTReader(DataReaderInterface): + def read(self, data): - df = pd.read_sas(BytesIO(data), format="xport", encoding="utf-8") + df = pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding) df = self._format_floats(df) return df def _read_pandas(self, file_path): - data = pd.read_sas(file_path, format="xport", encoding="utf-8") + data = pd.read_sas(file_path, format="xport", encoding=self.encoding) return PandasDataset(self._format_floats(data)) def to_parquet(self, file_path: str) -> str: temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet") - dataset = pd.read_sas(file_path, chunksize=20000, encoding="utf-8") + dataset = pd.read_sas(file_path, chunksize=20000, encoding=self.encoding) created = False num_rows = 0 for chunk in dataset: diff --git a/cdisc_rules_engine/services/data_services/data_service_factory.py b/cdisc_rules_engine/services/data_services/data_service_factory.py index b580d9bd0..60c3ab739 100644 --- a/cdisc_rules_engine/services/data_services/data_service_factory.py +++ b/cdisc_rules_engine/services/data_services/data_service_factory.py @@ -37,6 +37,7 @@ def __init__( standard_substandard: str = None, library_metadata: LibraryMetadataContainer = None, max_dataset_size: int = 0, + encoding: str = None, ): if config.getValue("DATA_SERVICE_TYPE"): self.data_service_name = config.getValue("DATA_SERVICE_TYPE") @@ -51,12 +52,13 @@ def __init__( self.standard_substandard = standard_substandard self.library_metadata = library_metadata self.max_dataset_size = max_dataset_size + self.encoding = encoding self.dataset_size_threshold = self.config.get_dataset_size_threshold() def get_data_service( self, dataset_paths: Iterable[str] = [] ) -> DataServiceInterface: - if USDMDataService.is_valid_data(dataset_paths): + if USDMDataService.is_valid_data(dataset_paths, encoding=self.encoding): """Get json file tree to dataset data service""" return self.get_service( "usdm", @@ -66,11 +68,12 @@ def get_data_service( library_metadata=self.library_metadata, dataset_path=dataset_paths[0], dataset_implementation=self.get_dataset_implementation(), + encoding=self.encoding, ) - elif DummyDataService.is_valid_data(dataset_paths): + elif DummyDataService.is_valid_data(dataset_paths, encoding=self.encoding): """Get dummy data service""" return self.get_dummy_data_service( - data=DummyDataService.get_data(dataset_paths) + data=DummyDataService.get_data(dataset_paths, encoding=self.encoding) ) elif ExcelDataService.is_valid_data(dataset_paths): """Get Excel file to dataset data service""" @@ -82,6 +85,7 @@ def get_data_service( library_metadata=self.library_metadata, dataset_path=dataset_paths[0], dataset_implementation=self.get_dataset_implementation(), + encoding=self.encoding, ) else: """Get local Directory data service""" @@ -93,6 +97,7 @@ def get_data_service( library_metadata=self.library_metadata, dataset_paths=dataset_paths, dataset_implementation=self.get_dataset_implementation(), + encoding=self.encoding, ) def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterface: @@ -104,6 +109,7 @@ def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterfa standard_substandard=self.standard_substandard, library_metadata=self.library_metadata, dataset_implementation=self.get_dataset_implementation(), + encoding=self.encoding, ) def get_dataset_implementation(self): diff --git a/cdisc_rules_engine/services/data_services/dummy_data_service.py b/cdisc_rules_engine/services/data_services/dummy_data_service.py index f163b0a14..9275e2262 100644 --- a/cdisc_rules_engine/services/data_services/dummy_data_service.py +++ b/cdisc_rules_engine/services/data_services/dummy_data_service.py @@ -15,6 +15,7 @@ from cdisc_rules_engine.services.data_readers import DataReaderFactory from cdisc_rules_engine.services.data_readers.json_reader import JSONReader from cdisc_rules_engine.services.data_services import BaseDataService +from cdisc_rules_engine.constants import DEFAULT_ENCODING from cdisc_rules_engine.models.dataset import PandasDataset @@ -42,7 +43,12 @@ def get_instance( ): return cls( cache_service=cache_service, - reader_factory=DataReaderFactory(), + reader_factory=DataReaderFactory( + dataset_implementation=kwargs.get( + "dataset_implementation", PandasDataset + ), + encoding=kwargs.get("encoding"), + ), config=config, **kwargs, ) @@ -177,17 +183,21 @@ def get_datasets(self) -> Iterable[SDTMDatasetMetadata]: return self.data @staticmethod - def get_data(dataset_paths: Sequence[str]): - json = JSONReader().from_file(dataset_paths[0]) + def get_data(dataset_paths: Sequence[str], encoding: str = DEFAULT_ENCODING): + json = JSONReader(encoding=encoding or DEFAULT_ENCODING).from_file( + dataset_paths[0] + ) return [DummyDataset(data) for data in json.get("datasets", [])] @staticmethod - def is_valid_data(dataset_paths: Sequence[str]): + def is_valid_data(dataset_paths: Sequence[str], encoding: str = DEFAULT_ENCODING): if ( dataset_paths and len(dataset_paths) == 1 and dataset_paths[0].lower().endswith(".json") ): - json = JSONReader().from_file(dataset_paths[0]) + json = JSONReader(encoding=encoding or DEFAULT_ENCODING).from_file( + dataset_paths[0] + ) return "datasets" in json return False diff --git a/cdisc_rules_engine/services/data_services/excel_data_service.py b/cdisc_rules_engine/services/data_services/excel_data_service.py index de6822da2..dbfa60567 100644 --- a/cdisc_rules_engine/services/data_services/excel_data_service.py +++ b/cdisc_rules_engine/services/data_services/excel_data_service.py @@ -54,7 +54,8 @@ def get_instance( reader_factory=DataReaderFactory( dataset_implementation=kwargs.get( "dataset_implementation", PandasDataset - ) + ), + encoding=kwargs.get("encoding"), ), config=config, **kwargs, diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index ace053e2d..cffb61bd3 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -28,6 +28,7 @@ from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from cdisc_rules_engine.models.dataset import PandasDataset +from cdisc_rules_engine.services import logger import re @@ -45,6 +46,7 @@ def __init__( cache_service, reader_factory, config, **kwargs ) self.dataset_paths: Iterable[str] = kwargs.get("dataset_paths", []) + self.encoding: str = kwargs.get("encoding") @classmethod def get_instance( @@ -53,13 +55,22 @@ def get_instance( config: ConfigInterface = None, **kwargs, ): - if cls._instance is None: + """ + Return the singleton instance. Reset the instance when encoding is + explicitly requested and differs from the cached one (e.g., validation + runs multiple times with different encodings in the same process). + """ + encoding = kwargs.get("encoding") + if cls._instance is None or ( + encoding is not None and cls._instance.encoding != encoding + ): service = cls( cache_service=cache_service, reader_factory=DataReaderFactory( dataset_implementation=kwargs.get( "dataset_implementation", PandasDataset - ) + ), + encoding=encoding, ), config=config, **kwargs, @@ -195,7 +206,7 @@ def read_metadata( ) contents_metadata = _metadata_reader_map[file_extension]( - file_metadata["path"], file_name + file_metadata["path"], file_name, encoding=self.encoding ).read() return { "file_metadata": file_metadata, @@ -226,10 +237,35 @@ def to_parquet(self, file_path: str) -> str: return reader.to_parquet(file_path) def get_datasets(self) -> List[dict]: - datasets = [ - self.get_raw_dataset_metadata(dataset_name=dataset_path) - for dataset_path in self.dataset_paths - ] + datasets = [] + for dataset_path in self.dataset_paths: + try: + dataset_metadata = self.get_raw_dataset_metadata( + dataset_name=dataset_path + ) + datasets.append(dataset_metadata) + except Exception as e: + logger.error( + f"Failed to read metadata for dataset {dataset_path}. " + f"Error: {type(e).__name__}: {e}. Skipping this dataset." + ) + file_name = extract_file_name_from_path_string(dataset_path) + datasets.append( + SDTMDatasetMetadata( + name=( + file_name.split(".")[0].upper() + if "." in file_name + else file_name.upper() + ), + first_record={}, + label="", + modification_date="", + filename=file_name, + full_path=dataset_path, + file_size=0, + record_count=0, + ) + ) return datasets @staticmethod diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 3d603a1ed..275e1c676 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -77,6 +77,7 @@ def __init__( cache_service, reader_factory, config, **kwargs ) self.dataset_path: str = kwargs.get("dataset_path", "") + self.encoding: str = kwargs.get("encoding") with open(os.path.join("resources", "schema", "USDM.yaml")) as entity_dict: self.entity_dict: dict = safe_load(entity_dict) @@ -107,7 +108,8 @@ def get_instance( reader_factory=DataReaderFactory( dataset_implementation=kwargs.get( "dataset_implementation", PandasDataset - ) + ), + encoding=kwargs.get("encoding"), ), config=config, **kwargs, @@ -477,12 +479,12 @@ def __get_domain_from_dataset_name(self, dataset_name: str) -> str: return extract_file_name_from_path_string(dataset_name).split(".")[0] @staticmethod - def is_valid_data(dataset_paths: Sequence[str]): + def is_valid_data(dataset_paths: Sequence[str], encoding: str = None): if ( dataset_paths and len(dataset_paths) == 1 and dataset_paths[0].lower().endswith(".json") ): - json = JSONReader().from_file(dataset_paths[0]) + json = JSONReader(encoding=encoding).from_file(dataset_paths[0]) return "study" in json and "datasetJSONVersion" not in json return False diff --git a/cdisc_rules_engine/services/datasetjson_metadata_reader.py b/cdisc_rules_engine/services/datasetjson_metadata_reader.py index f77856977..5aa1b6d93 100644 --- a/cdisc_rules_engine/services/datasetjson_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetjson_metadata_reader.py @@ -6,6 +6,7 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.services.adam_variable_reader import AdamVariableReader from cdisc_rules_engine.services.data_readers.json_reader import JSONReader +from cdisc_rules_engine.constants import DEFAULT_ENCODING class DatasetJSONMetadataReader: @@ -14,22 +15,27 @@ class DatasetJSONMetadataReader: from .json file. """ - def __init__(self, file_path: str, file_name: str): + def __init__( + self, file_path: str, file_name: str, encoding: str = DEFAULT_ENCODING + ): self._metadata_container = {} self._file_path = file_path self._first_record = None self._dataset_name = file_name.split(".")[0].upper() + self.encoding = encoding def read(self) -> dict: """ Extracts metadata from .json file. """ # Load Dataset-JSON Schema - schema = JSONReader().from_file( + schema = JSONReader(encoding="utf-8").from_file( os.path.join("resources", "schema", "dataset.schema.json") ) - datasetjson = JSONReader().from_file(self._file_path) + datasetjson = JSONReader(encoding=self.encoding or DEFAULT_ENCODING).from_file( + self._file_path + ) try: jsonschema.validate(datasetjson, schema) diff --git a/cdisc_rules_engine/services/datasetndjson_metadata_reader.py b/cdisc_rules_engine/services/datasetndjson_metadata_reader.py index ded014fa5..d4f0987a2 100644 --- a/cdisc_rules_engine/services/datasetndjson_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetndjson_metadata_reader.py @@ -7,6 +7,7 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.services.adam_variable_reader import AdamVariableReader from cdisc_rules_engine.services.data_readers.json_reader import JSONReader +from cdisc_rules_engine.constants import DEFAULT_ENCODING class DatasetNDJSONMetadataReader: @@ -15,23 +16,33 @@ class DatasetNDJSONMetadataReader: from .ndjson file. """ - def __init__(self, file_path: str, file_name: str): + def __init__( + self, file_path: str, file_name: str, encoding: str = DEFAULT_ENCODING + ): self._metadata_container = {} self._file_path = file_path self._first_record = None self._dataset_name = file_name.split(".")[0].upper() + self.encoding = encoding def read(self) -> dict: """ Extracts metadata from .ndjson file. """ # Load Dataset-NDJSON Schema - schema = JSONReader().from_file( + schema = JSONReader(encoding="utf-8").from_file( os.path.join("resources", "schema", "dataset-ndjson-schema.json") ) - with open(self._file_path, "r") as file: - lines = file.readlines() + encoding = self.encoding or DEFAULT_ENCODING + try: + with open(self._file_path, "r", encoding=encoding) as file: + lines = file.readlines() + except (UnicodeDecodeError, UnicodeError) as e: + raise ValueError( + f"Could not decode NDJSON file {self._file_path} with {encoding} encoding: {e}. " + f"Please specify the correct encoding using the -e flag." + ) metadatandjson = json.loads(lines[0]) diff --git a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py index 90d4c214c..914747c17 100644 --- a/cdisc_rules_engine/services/datasetxpt_metadata_reader.py +++ b/cdisc_rules_engine/services/datasetxpt_metadata_reader.py @@ -3,6 +3,7 @@ from cdisc_rules_engine.services import logger from cdisc_rules_engine.config import config from cdisc_rules_engine.services.adam_variable_reader import AdamVariableReader +from cdisc_rules_engine.constants import DEFAULT_ENCODING import os @@ -14,7 +15,9 @@ class DatasetXPTMetadataReader: # TODO. Maybe in future it is worth having multiple constructors # like from_bytes, from_file etc. But now there is no immediate need for that. - def __init__(self, file_path: str, file_name: str): + def __init__( + self, file_path: str, file_name: str, encoding: str = DEFAULT_ENCODING + ): file_size = os.path.getsize(file_path) if file_size > config.get_dataset_size_threshold(): self._estimate_dataset_length = True @@ -26,16 +29,18 @@ def __init__(self, file_path: str, file_name: str): self._first_record = None self._dataset_name = file_name.split(".")[0].upper() self._file_path = file_path + self.encoding = encoding def read(self) -> dict: """ Extracts metadata from binary contents of .xpt file. """ + encoding = self.encoding or DEFAULT_ENCODING try: dataset, metadata = pyreadstat.read_xport( - self._file_path, row_limit=self.row_limit + self._file_path, encoding=encoding, row_limit=self.row_limit ) - except pyreadstat.ReadstatError: + except (pyreadstat.ReadstatError, UnicodeDecodeError): return { "variable_labels": [], "variable_names": [], @@ -94,7 +99,10 @@ def _extract_first_record(self, df): return None def _calculate_dataset_length(self): - df, meta = pyreadstat.read_xport(self._file_path, metadataonly=True) + encoding = self.encoding or DEFAULT_ENCODING + _, meta = pyreadstat.read_xport( + self._file_path, encoding=encoding, metadataonly=True + ) row_size = sum(meta.variable_storage_width.values()) total_size = os.path.getsize(self._file_path) start = self._read_header(self._file_path) diff --git a/core.py b/core.py index ef104df6b..f28186b74 100644 --- a/core.py +++ b/core.py @@ -4,6 +4,7 @@ """ import asyncio +import codecs import json import logging import os @@ -35,7 +36,7 @@ get_rules_cache_key, validate_dataset_files_exist, ) -from cdisc_rules_engine.constants import VALIDATION_FORMATS_MESSAGE +from cdisc_rules_engine.constants import VALIDATION_FORMATS_MESSAGE, DEFAULT_ENCODING from scripts.list_dataset_metadata_handler import list_dataset_metadata_handler from scripts.run_validation import run_validation from version import __version__ @@ -45,6 +46,19 @@ ) +def validate_encoding(ctx, param, value): + if value is None: + return DEFAULT_ENCODING + try: + codecs.lookup(value) + return value + except LookupError: + raise click.BadParameter( + f"Invalid encoding '{value}'. Please provide a valid encoding name " + f"(e.g., utf-8, utf-16, utf-32, cp1252, latin-1)." + ) + + def valid_data_file(data_path: list) -> tuple[list, set]: allowed_formats = [ DataFormatTypes.XPT.value, @@ -379,6 +393,18 @@ def _validate_no_arguments(logger) -> None: "If true, limits reported issues per dataset per rule." ), ) +@click.option( + "-e", + "--encoding", + default=DEFAULT_ENCODING, + required=False, + callback=validate_encoding, + help=( + f"File encoding for reading datasets. " + f"Defaults to {DEFAULT_ENCODING}. " + f"Supported encodings: utf-8, utf-16, utf-32, cp1252, latin-1, etc." + ), +) @click.pass_context def validate( # noqa ctx, @@ -416,6 +442,7 @@ def validate( # noqa jsonata_custom_functions: tuple[()] | tuple[tuple[str, str], ...], max_report_rows: int, max_errors_per_rule: tuple[int, bool], + encoding: str, ): """ Validate data using CDISC Rules Engine @@ -510,6 +537,7 @@ def validate( # noqa jsonata_custom_functions, max_report_rows, max_errors_per_rule, + encoding, ) ) @@ -877,6 +905,7 @@ def test_validate(filetype): (), None, max_report_errors, + None, ) ) print(f"{filetype.upper()} validation completed successfully!") diff --git a/scripts/run_validation.py b/scripts/run_validation.py index 3f6df90bb..fe3573b22 100644 --- a/scripts/run_validation.py +++ b/scripts/run_validation.py @@ -95,6 +95,7 @@ def validate_single_rule( jsonata_custom_functions=args.jsonata_custom_functions, max_errors_per_rule=max_errors_per_rule, errors_per_dataset_flag=per_dataset_flag, + encoding=args.encoding, ) results = engine.validate_single_rule(rule, datasets) results = list(itertools.chain(*results.values())) @@ -150,6 +151,7 @@ def run_validation(args: Validation_args): standard_version=standard_version, standard_substandard=standard_substandard, library_metadata=library_metadata, + encoding=args.encoding, ).get_data_service(args.dataset_paths) # install dictionaries if needed dictionary_versions = fill_cache_with_dictionaries( diff --git a/tests/unit/test_dataset_json_reader.py b/tests/unit/test_dataset_json_reader.py index 649cbb5cb..8d4dc6186 100644 --- a/tests/unit/test_dataset_json_reader.py +++ b/tests/unit/test_dataset_json_reader.py @@ -1,8 +1,14 @@ import os +import tempfile +import json +import pytest + +from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset from cdisc_rules_engine.services.data_readers.dataset_json_reader import ( DatasetJSONReader, ) +from cdisc_rules_engine.exceptions.custom_exceptions import InvalidJSONFormat def test_from_file(): @@ -10,10 +16,99 @@ def test_from_file(): f"{os.path.dirname(__file__)}/../resources/test_dataset.json" ) - reader = DatasetJSONReader() + reader = DatasetJSONReader(PandasDataset) dataframe = reader.from_file(test_dataset_path) for value in dataframe["EXDOSE"]: """ Verify that the rounding of incredibly small values to 0 is applied. """ assert value == 0 or abs(value) > 10**-16 + + +def test_read_json_file_fails_with_wrong_encoding(): + test_data = { + "datasetJSONVersion": "1.1", + "datasetJSONCreationDateTime": "2024-01-01T00:00:00", + "sourceSystem": {"name": "Test", "version": "1.0"}, + "studyOID": "TEST.1", + "metaDataVersionOID": "MDV.1", + "itemGroupOID": "IG.TEST", + "records": 1, + "name": "TEST", + "label": "Test Dataset", + "columns": [ + { + "itemOID": "IT.TEST.STUDYID", + "name": "STUDYID", + "label": "Study Identifier", + "dataType": "string", + "length": 10, + } + ], + "rows": [["STUDY001"]], + } + with tempfile.NamedTemporaryFile(mode="wb", suffix=".json", delete=False) as f: + json_str = json.dumps(test_data, ensure_ascii=False) + json_bytes = json_str.encode("cp1252").replace( + b'"Test Dataset"', b'"Test\x92s Dataset"' + ) + f.write(json_bytes) + temp_path = f.name + + try: + reader = DatasetJSONReader(PandasDataset, encoding="utf-8") + with pytest.raises(InvalidJSONFormat): + reader.read_json_file(temp_path) + finally: + os.unlink(temp_path) + + +def _minimal_dataset_json(): + """Minimal valid Dataset-JSON with non-ASCII character for encoding tests.""" + return { + "datasetJSONVersion": "1.1", + "datasetJSONCreationDateTime": "2024-01-01T00:00:00", + "sourceSystem": {"name": "Test", "version": "1.0"}, + "studyOID": "TEST.1", + "metaDataVersionOID": "MDV.1", + "itemGroupOID": "IG.TEST", + "records": 1, + "name": "TEST", + "label": "Test Dataset", + "columns": [ + { + "itemOID": "IT.TEST.STUDYID", + "name": "STUDYID", + "label": "Study Identifier", + "dataType": "string", + "length": 10, + } + ], + "rows": [["STUDY001"]], + } + + +@pytest.mark.parametrize( + "encoding,label", + [ + ("utf-8", "Test Dataset — utf-8"), + ("utf-16", "Test Dataset — utf-16"), + ("utf-32", "Test Dataset — utf-32"), + ("cp1252", "Test Dataset"), + ("latin-1", "Test Dataset latin-1 \xe9"), + ], +) +def test_read_json_file_succeeds_with_encoding(encoding, label): + """Test each encoding mentioned in README (utf-8, utf-16, utf-32, cp1252, latin-1).""" + test_data = _minimal_dataset_json() + test_data["label"] = label + with tempfile.NamedTemporaryFile(mode="wb", suffix=".json", delete=False) as f: + f.write(json.dumps(test_data, ensure_ascii=False).encode(encoding)) + temp_path = f.name + try: + reader = DatasetJSONReader(PandasDataset, encoding=encoding) + result = reader.read_json_file(temp_path) + assert result["name"] == "TEST" + assert result["label"] == label + finally: + os.unlink(temp_path) diff --git a/tests/unit/test_services/test_data_service/test_data_service.py b/tests/unit/test_services/test_data_service/test_data_service.py index 31ae87a91..c3316d4d2 100644 --- a/tests/unit/test_services/test_data_service/test_data_service.py +++ b/tests/unit/test_services/test_data_service/test_data_service.py @@ -207,6 +207,7 @@ def test_get_dataset_class(dataset_metadata, data, expected_class): None, None, None, + None, ) ) data_service = LocalDataService( @@ -292,6 +293,7 @@ def test_get_dataset_class_associated_domains(): None, None, None, + None, ) ) data_service = LocalDataService(