Merged

45 commits
c6a8c77
UTF Encoding Enhancement Implementation
RakeshBobba03 Nov 25, 2025
c8f5c32
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Nov 26, 2025
9bbbe48
add dataset_implementation to DatasetJSONReader and encoding paramete…
RakeshBobba03 Nov 26, 2025
e92be69
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Dec 2, 2025
2d4f6ac
move imports to top and add encoding parameter to test_validate
RakeshBobba03 Dec 2, 2025
2316d95
Merge branch 'main' into 1022-Enhance-UTF-Handling
RamilCDISC Dec 3, 2025
a970b73
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Dec 4, 2025
ee0d5ac
Add short form flag (-e) for encoding option with validation and upda…
RakeshBobba03 Dec 4, 2025
273ee05
Merge branch '1022-Enhance-UTF-Handling' of https://github.com/cdisc-…
RakeshBobba03 Dec 4, 2025
0d1c9c6
Fix encoding error handling fallback and add missing dataset_implemen…
RakeshBobba03 Dec 4, 2025
277aca7
Fix XPT encoding detection order and add graceful error handling for …
RakeshBobba03 Dec 4, 2025
da87cef
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Dec 5, 2025
b237486
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Dec 5, 2025
8ff517c
Default to UTF-8 encoding with explicit -e flag support, remove autom…
RakeshBobba03 Dec 8, 2025
11accb9
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Dec 8, 2025
fdce8c3
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Dec 8, 2025
c7599e7
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Jan 16, 2026
68e25fc
Refactor encoding handling: centralize utf-8 default in DataReaderInt…
RakeshBobba03 Jan 16, 2026
2dba53c
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Jan 16, 2026
77e1129
Remove encoding parameter from from_file() call
RakeshBobba03 Jan 16, 2026
52bee07
Auto-updated branch with latest changes from main
SFJohnson24 Jan 19, 2026
b66e1e1
Auto-updated branch with latest changes from main
SFJohnson24 Jan 20, 2026
642ffc9
Auto-updated branch with latest changes from main
SFJohnson24 Jan 22, 2026
0a15ebc
Auto-updated branch with latest changes from main
SFJohnson24 Jan 26, 2026
fbe28d5
Auto-updated branch with latest changes from main
SFJohnson24 Jan 26, 2026
d483725
Auto-updated branch with latest changes from main
SFJohnson24 Jan 27, 2026
fe8971a
Auto-updated branch with latest changes from main
SFJohnson24 Jan 27, 2026
c6ed0d4
Auto-updated branch with latest changes from main
SFJohnson24 Jan 28, 2026
9eadd82
Auto-updated branch with latest changes from main
SFJohnson24 Jan 28, 2026
59756ac
Auto-updated branch with latest changes from main
SFJohnson24 Jan 30, 2026
8a57788
Auto-updated branch with latest changes from main
SFJohnson24 Jan 30, 2026
3f0caf8
Auto-updated branch with latest changes from main
SFJohnson24 Jan 30, 2026
4265595
Auto-updated branch with latest changes from main
SFJohnson24 Jan 30, 2026
819f7f5
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Jan 30, 2026
697d867
Merge branch '1022-Enhance-UTF-Handling' of https://github.com/cdisc-…
RakeshBobba03 Jan 30, 2026
e988c74
Fix schema loading to always use UTF-8 instead of user encoding
RakeshBobba03 Jan 31, 2026
07f546e
Auto-updated branch with latest changes from main
SFJohnson24 Feb 1, 2026
ccd5c0e
Auto-updated branch with latest changes from main
SFJohnson24 Feb 2, 2026
8991b94
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Feb 4, 2026
2416516
Merge branch '1022-Enhance-UTF-Handling' of https://github.com/cdisc-…
RakeshBobba03 Feb 4, 2026
5f410b9
Use DEFAULT_ENCODING everywhere and make encoding handling consistent
RakeshBobba03 Feb 4, 2026
73e1aba
Add parametrized tests for each README encoding
RakeshBobba03 Feb 4, 2026
bcc6b44
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Feb 11, 2026
d8272ed
Use hardcoded utf-8 for schema files, inline pyreadstat calls in XPT …
RakeshBobba03 Feb 11, 2026
bfdac45
Merge branch 'main' into 1022-Enhance-UTF-Handling
RakeshBobba03 Feb 11, 2026
17 changes: 17 additions & 0 deletions README.md
@@ -206,6 +206,7 @@ This will show the list of validation options.
"[████████████████████████████--------]
78%" is printed.
-jcf, --jsonata-custom-functions Pair containing a variable name and a Path to directory containing a set of custom JSONata functions. Can be specified multiple times
-e, --encoding TEXT File encoding for reading datasets. If not specified, defaults to utf-8. Supported encodings: utf-8, utf-16, utf-32, cp1252, latin-1, etc.
--help Show this message and exit.
```

@@ -241,6 +242,22 @@ CORE supports the following dataset file formats for validation:
- Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset directory (`-d` or `-dp`).
- If you point to a folder containing unsupported file formats, CORE will display an error message indicating which formats are supported.

#### File Encoding

CORE defaults to utf-8 encoding when reading datasets. If your files use a different encoding, you must specify it using the `-e` or `--encoding` flag:

```bash
python core.py validate -s sdtmig -v 3-4 -dp path/to/dataset.xpt -e cp1252
```

The encoding name must be a valid Python codec name. Common encodings include:

- `utf-8`, `utf-16`, `utf-32` - Unicode encodings
- `cp1252` - Windows-1252 (commonly used for files exported from Excel or SAS)
- `latin-1` - ISO-8859-1

If an invalid encoding is specified, CORE will display an error message with the supported encoding names.
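The phrase "valid Python codec name" can be made concrete with a short, standalone sketch. This is not CORE's implementation — just the standard-library mechanism (`codecs.lookup`) that any encoding validator can rely on:

```python
import codecs


def is_valid_encoding(name: str) -> bool:
    """Return True if `name` is a codec name Python recognizes."""
    try:
        codecs.lookup(name)
        return True
    except LookupError:
        return False


print(is_valid_encoding("cp1252"))       # True
print(is_valid_encoding("not-a-codec"))  # False
```

Codec aliases such as `latin-1`/`iso-8859-1` are resolved by the same lookup, so users can pass whichever spelling they prefer.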

#### Validate single rule

```bash
2 changes: 2 additions & 0 deletions cdisc_rules_engine/constants/__init__.py
@@ -21,3 +21,5 @@
VALIDATION_FORMATS_MESSAGE = (
"SAS V5 XPT, Dataset-JSON (JSON or NDJSON), or Excel (XLSX)"
)

DEFAULT_ENCODING: str = "utf-8"
7 changes: 6 additions & 1 deletion cdisc_rules_engine/interfaces/data_reader_interface.py
@@ -1,16 +1,21 @@
from cdisc_rules_engine.models.dataset import PandasDataset
from cdisc_rules_engine.constants import DEFAULT_ENCODING


class DataReaderInterface:
"""
Interface for reading binary data from different file types into pandas dataframes
"""

def __init__(self, dataset_implementation=PandasDataset):
def __init__(
self, dataset_implementation=PandasDataset, encoding: str = DEFAULT_ENCODING
):
"""
:param dataset_implementation DatasetInterface: The dataset type to return.
:param encoding str: The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8).
"""
self.dataset_implementation = dataset_implementation
self.encoding = encoding

def read(self, data):
"""
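The shape of the interface change can be illustrated with a minimal, self-contained mirror. `ReaderBase` and `TextReader` below are hypothetical stand-ins, not classes from the PR: the base class stores the encoding once, and each implementing reader uses `self.encoding` when it opens a file.

```python
DEFAULT_ENCODING = "utf-8"


class ReaderBase:
    """Minimal stand-in for DataReaderInterface: stores the encoding."""

    def __init__(self, dataset_implementation=None, encoding: str = DEFAULT_ENCODING):
        self.dataset_implementation = dataset_implementation
        self.encoding = encoding


class TextReader(ReaderBase):
    """Hypothetical subclass that honors the stored encoding on read."""

    def from_file(self, file_path: str) -> str:
        with open(file_path, "r", encoding=self.encoding) as fp:
            return fp.read()
```

Implementing classes are free to ignore the attribute (as the reviewer suggests below for the factory question), but every reader that does text IO gets a consistent default for free.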
1 change: 1 addition & 0 deletions cdisc_rules_engine/models/validation_args.py
@@ -28,5 +28,6 @@
"jsonata_custom_functions",
"max_report_rows",
"max_errors_per_rule",
"encoding",
],
)
1 change: 1 addition & 0 deletions cdisc_rules_engine/rules_engine.py
@@ -87,6 +87,7 @@ def __init__(
standard_substandard=self.standard_substandard,
library_metadata=self.library_metadata,
max_dataset_size=self.max_dataset_size,
encoding=kwargs.get("encoding"),
)
self.dataset_implementation = data_service_factory.get_dataset_implementation()
kwargs["dataset_implementation"] = self.dataset_implementation
13 changes: 11 additions & 2 deletions cdisc_rules_engine/services/data_readers/data_reader_factory.py
@@ -15,6 +15,7 @@
from cdisc_rules_engine.services.data_readers.json_reader import JSONReader
from cdisc_rules_engine.enums.dataformat_types import DataFormatTypes
from cdisc_rules_engine.models.dataset import PandasDataset
from cdisc_rules_engine.constants import DEFAULT_ENCODING


class DataReaderFactory(FactoryInterface):
@@ -26,9 +27,15 @@ class DataReaderFactory(FactoryInterface):
DataFormatTypes.USDM.value: JSONReader,
}

def __init__(self, service_name: str = None, dataset_implementation=PandasDataset):
def __init__(
self,
service_name: str = None,
dataset_implementation=PandasDataset,
encoding: str = None,
):
self._default_service_name = service_name
self.dataset_implementation = dataset_implementation
self.encoding = encoding

@classmethod
def register_service(cls, name: str, service: Type[DataReaderInterface]):
@@ -47,7 +54,9 @@ def get_service(self, name: str = None, **kwargs) -> DataReaderInterface:
"""
service_name = name or self._default_service_name
if service_name in self._reader_map:
return self._reader_map[service_name](self.dataset_implementation)
reader_class = self._reader_map[service_name]
Collaborator comment: To answer the question, I think the simplest solution is to just add this to the DataReaderInterface init params. The implementing classes can decide whether or not to use it. No need for the different conditions in the factory.

encoding = self.encoding or DEFAULT_ENCODING
return reader_class(self.dataset_implementation, encoding=encoding)
raise ValueError(
f"Service name must be in {list(self._reader_map.keys())}, "
f"given service name is {service_name}"
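The fallback in the hunk above (`self.encoding or DEFAULT_ENCODING`) can be exercised with a stripped-down stand-in for the factory. Class names here are illustrative, not CORE's:

```python
DEFAULT_ENCODING = "utf-8"


class StubReader:
    """Stand-in for a concrete reader accepting an encoding."""

    def __init__(self, dataset_implementation=None, encoding: str = DEFAULT_ENCODING):
        self.encoding = encoding


class StubReaderFactory:
    """Mirrors the factory pattern above: None falls back to the default."""

    _reader_map = {"json": StubReader}

    def __init__(self, encoding: str = None):
        self.encoding = encoding

    def get_service(self, name: str) -> StubReader:
        if name in self._reader_map:
            # A factory built without an explicit encoding hands the
            # default to the reader, so readers never see None.
            encoding = self.encoding or DEFAULT_ENCODING
            return self._reader_map[name](encoding=encoding)
        raise ValueError(f"Service name must be in {list(self._reader_map)}")
```

Resolving the fallback at construction time (rather than inside each reader) keeps the readers' signatures honest: they always receive a real codec name.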
@@ -15,14 +15,15 @@


class DatasetJSONReader(DataReaderInterface):

def get_schema(self) -> dict:
schema = JSONReader().from_file(
schema = JSONReader(encoding="utf-8").from_file(
os.path.join("resources", "schema", "dataset.schema.json")
)
return schema

def read_json_file(self, file_path: str) -> dict:
return JSONReader().from_file(file_path)
return JSONReader(encoding=self.encoding).from_file(file_path)

def _raw_dataset_from_file(self, file_path) -> pd.DataFrame:
# Load Dataset-JSON Schema
15 changes: 11 additions & 4 deletions cdisc_rules_engine/services/data_readers/dataset_ndjson_reader.py
@@ -16,16 +16,23 @@


class DatasetNDJSONReader(DataReaderInterface):

def get_schema(self) -> dict:
schema = JSONReader().from_file(
schema = JSONReader(encoding="utf-8").from_file(
os.path.join("resources", "schema", "dataset-ndjson-schema.json")
)
return schema

def read_json_file(self, file_path: str) -> dict:
with open(file_path, "r") as file:
lines = file.readlines()
return json.loads(lines[0]), [json.loads(line) for line in lines[1:]]
try:
with open(file_path, "r", encoding=self.encoding) as file:
lines = file.readlines()
return json.loads(lines[0]), [json.loads(line) for line in lines[1:]]
except (UnicodeDecodeError, UnicodeError) as e:
raise ValueError(
f"Could not decode NDJSON file {file_path} with {self.encoding} encoding: {e}. "
f"Please specify the correct encoding using the -e flag."
)

def _raw_dataset_from_file(self, file_path) -> pd.DataFrame:
# Load Dataset-JSON Schema
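The first-line/remaining-lines split in `read_json_file` reflects the NDJSON dataset layout: line 1 carries the dataset metadata, every following line is one row. A sketch of just that parsing step (not CORE's code):

```python
import json


def parse_ndjson_lines(lines):
    """Split NDJSON lines into (metadata, rows): line 1 is the dataset
    header object, each remaining line is one data row."""
    metadata = json.loads(lines[0])
    rows = [json.loads(line) for line in lines[1:]]
    return metadata, rows
```

Because each line is decoded independently, a wrong encoding typically fails at the `open(...)`/`readlines()` stage, which is why the reader wraps exactly that block in the `try`.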
12 changes: 9 additions & 3 deletions cdisc_rules_engine/services/data_readers/json_reader.py
@@ -8,9 +8,15 @@
class JSONReader(DataReaderInterface):
def from_file(self, file_path):
try:
with open(file_path, "rb") as fp:
json = load(fp)
return json
with open(file_path, "r", encoding=self.encoding) as fp:
json_data = load(fp)
return json_data
except (UnicodeDecodeError, UnicodeError) as e:
raise InvalidJSONFormat(
f"\n Error reading JSON from: {file_path}"
f"\n Failed to decode with {self.encoding} encoding: {e}"
f"\n Please specify the correct encoding using the -e flag."
)
except Exception as e:
raise InvalidJSONFormat(
f"\n Error reading JSON from: {file_path}"
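The decode-error wrapping above follows a common pattern: open in text mode with an explicit encoding, and turn low-level Unicode failures into an actionable message. A self-contained sketch, using `ValueError` in place of CORE's `InvalidJSONFormat`:

```python
import json


def load_json(path: str, encoding: str = "utf-8") -> dict:
    """Load JSON with an explicit encoding; surface decode failures
    with a hint instead of a raw traceback."""
    try:
        with open(path, "r", encoding=encoding) as fp:
            return json.load(fp)
    except (UnicodeDecodeError, UnicodeError) as e:
        raise ValueError(
            f"Could not decode {path} with {encoding} encoding: {e}. "
            "Specify the correct encoding with -e."
        )
```

Switching the original `open(file_path, "rb")` to text mode is what makes the encoding parameter meaningful here — `json.load` on a binary handle would otherwise sniff the encoding itself.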
7 changes: 4 additions & 3 deletions cdisc_rules_engine/services/data_readers/xpt_reader.py
@@ -10,18 +10,19 @@


class XPTReader(DataReaderInterface):

def read(self, data):
df = pd.read_sas(BytesIO(data), format="xport", encoding="utf-8")
df = pd.read_sas(BytesIO(data), format="xport", encoding=self.encoding)
df = self._format_floats(df)
return df

def _read_pandas(self, file_path):
data = pd.read_sas(file_path, format="xport", encoding="utf-8")
data = pd.read_sas(file_path, format="xport", encoding=self.encoding)
return PandasDataset(self._format_floats(data))

def to_parquet(self, file_path: str) -> str:
temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")
dataset = pd.read_sas(file_path, chunksize=20000, encoding="utf-8")
dataset = pd.read_sas(file_path, chunksize=20000, encoding=self.encoding)
created = False
num_rows = 0
for chunk in dataset:
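`to_parquet` reads the XPT file in chunks (`pd.read_sas(..., chunksize=20000)`) while accumulating a row count. The loop structure can be sketched generically, with a plain iterator standing in for the pandas chunked reader:

```python
def iter_chunks(rows, size):
    """Yield fixed-size slices, mimicking a chunked file reader."""
    for start in range(0, len(rows), size):
        yield rows[start:start + size]


def count_rows(rows, size=3):
    """Accumulate the total row count across chunks, as to_parquet does."""
    num_rows = 0
    for chunk in iter_chunks(rows, size):
        num_rows += len(chunk)
    return num_rows
```

The same pattern is why the encoding must be passed on every `read_sas` call site in the diff: each call decodes its own chunk of character data.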
@@ -37,6 +37,7 @@ def __init__(
standard_substandard: str = None,
library_metadata: LibraryMetadataContainer = None,
max_dataset_size: int = 0,
encoding: str = None,
):
if config.getValue("DATA_SERVICE_TYPE"):
self.data_service_name = config.getValue("DATA_SERVICE_TYPE")
@@ -51,12 +52,13 @@
self.standard_substandard = standard_substandard
self.library_metadata = library_metadata
self.max_dataset_size = max_dataset_size
self.encoding = encoding
self.dataset_size_threshold = self.config.get_dataset_size_threshold()

def get_data_service(
self, dataset_paths: Iterable[str] = []
) -> DataServiceInterface:
if USDMDataService.is_valid_data(dataset_paths):
if USDMDataService.is_valid_data(dataset_paths, encoding=self.encoding):
"""Get json file tree to dataset data service"""
return self.get_service(
"usdm",
@@ -66,11 +68,12 @@
library_metadata=self.library_metadata,
dataset_path=dataset_paths[0],
dataset_implementation=self.get_dataset_implementation(),
encoding=self.encoding,
)
elif DummyDataService.is_valid_data(dataset_paths):
elif DummyDataService.is_valid_data(dataset_paths, encoding=self.encoding):
"""Get dummy data service"""
return self.get_dummy_data_service(
data=DummyDataService.get_data(dataset_paths)
data=DummyDataService.get_data(dataset_paths, encoding=self.encoding)
)
elif ExcelDataService.is_valid_data(dataset_paths):
"""Get Excel file to dataset data service"""
@@ -82,6 +85,7 @@
library_metadata=self.library_metadata,
dataset_path=dataset_paths[0],
dataset_implementation=self.get_dataset_implementation(),
encoding=self.encoding,
)
else:
"""Get local Directory data service"""
@@ -93,6 +97,7 @@
library_metadata=self.library_metadata,
dataset_paths=dataset_paths,
dataset_implementation=self.get_dataset_implementation(),
encoding=self.encoding,
)

def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterface:
@@ -104,6 +109,7 @@ def get_dummy_data_service(self, data: List[DummyDataset]) -> DataServiceInterfa
standard_substandard=self.standard_substandard,
library_metadata=self.library_metadata,
dataset_implementation=self.get_dataset_implementation(),
encoding=self.encoding,
)

def get_dataset_implementation(self):
20 changes: 15 additions & 5 deletions cdisc_rules_engine/services/data_services/dummy_data_service.py
@@ -15,6 +15,7 @@
from cdisc_rules_engine.services.data_readers import DataReaderFactory
from cdisc_rules_engine.services.data_readers.json_reader import JSONReader
from cdisc_rules_engine.services.data_services import BaseDataService
from cdisc_rules_engine.constants import DEFAULT_ENCODING
from cdisc_rules_engine.models.dataset import PandasDataset


@@ -42,7 +43,12 @@ def get_instance(
):
return cls(
cache_service=cache_service,
reader_factory=DataReaderFactory(),
reader_factory=DataReaderFactory(
dataset_implementation=kwargs.get(
"dataset_implementation", PandasDataset
),
encoding=kwargs.get("encoding"),
),
config=config,
**kwargs,
)
@@ -177,17 +183,21 @@ def get_datasets(self) -> Iterable[SDTMDatasetMetadata]:
return self.data

@staticmethod
def get_data(dataset_paths: Sequence[str]):
json = JSONReader().from_file(dataset_paths[0])
def get_data(dataset_paths: Sequence[str], encoding: str = DEFAULT_ENCODING):
json = JSONReader(encoding=encoding or DEFAULT_ENCODING).from_file(
dataset_paths[0]
)
return [DummyDataset(data) for data in json.get("datasets", [])]

@staticmethod
def is_valid_data(dataset_paths: Sequence[str]):
def is_valid_data(dataset_paths: Sequence[str], encoding: str = DEFAULT_ENCODING):
if (
dataset_paths
and len(dataset_paths) == 1
and dataset_paths[0].lower().endswith(".json")
):
json = JSONReader().from_file(dataset_paths[0])
json = JSONReader(encoding=encoding or DEFAULT_ENCODING).from_file(
dataset_paths[0]
)
return "datasets" in json
return False
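The validity check above has three conditions: exactly one path, a `.json` extension, and a top-level `"datasets"` key. It can be mirrored without file IO by injecting the reader — `read_json` here is a hypothetical parameter added for testability, not part of CORE's signature:

```python
def is_dummy_data(dataset_paths, read_json):
    """Single .json file whose top-level object contains "datasets"."""
    if (
        dataset_paths
        and len(dataset_paths) == 1
        and dataset_paths[0].lower().endswith(".json")
    ):
        return "datasets" in read_json(dataset_paths[0])
    return False
```

Note the real method only opens the file after the cheap path checks pass, so non-JSON inputs never pay the cost of a decode attempt.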
@@ -54,7 +54,8 @@ def get_instance(
reader_factory=DataReaderFactory(
dataset_implementation=kwargs.get(
"dataset_implementation", PandasDataset
)
),
encoding=kwargs.get("encoding"),
),
config=config,
**kwargs,