Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -138,19 +138,21 @@ This will show the list of validation options.
-ca, --cache TEXT Relative path to cache files containing pre
loaded metadata and rules
-ps, --pool-size INTEGER Number of parallel processes for validation
-d, --data TEXT Path to directory containing data files
-d, --data TEXT Path to directory containing data files.
DATA_DIR environment variable can be used to pass value.
-dp, --dataset-path TEXT Absolute path to dataset file. Can be specified multiple times.
-dxp, --define-xml-path TEXT Path to Define-XML
DATASET_PATH environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-dxp, --define-xml-path TEXT Path to Define-XML. DEFINE environment variable can be used to pass value.
-l, --log-level [info|debug|error|critical|disabled|warn]
Sets log level for engine logs, logs are
disabled by default
-rt, --report-template TEXT File path of report template to use for
excel output
-s, --standard TEXT CDISC standard to validate against
-s, --standard TEXT CDISC standard to validate against. STANDARD environment variable can be used to pass value.
[required]
-v, --version TEXT Standard version to validate against
-v, --version TEXT Standard version to validate against. VERSION environment variable can be used to pass value.
[required]
-ss, --substandard TEXT Substandard to validate against
-ss, --substandard TEXT Substandard to validate against. SUBSTANDARD environment variable can be used to pass value.
"SDTM", "SEND", "ADaM", or "CDASH"
[required for TIG]
-uc, --use-case TEXT Use Case for TIG Validation
Expand All @@ -161,7 +163,8 @@ This will show the list of validation options.
against, can provide more than one
NOTE: if a Define-XML is provided and it is version 2.1,
the engine will use the CT laid out in the define. If it is
version 2.0, -ct is expected to specify the CT package
version 2.0, -ct is expected to specify the CT package.
CONTROLLED_TERMINOLOGY_PACKAGE environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-o, --output TEXT Report output file destination and name. Path will be
relative to the validation execution directory
and should end in the desired output filename
Expand Down Expand Up @@ -204,7 +207,7 @@ This will show the list of validation options.
if both .env and -me <limit> are specified, the larger value will be used. If either sets the per_dataset_flag to true, it will be true
If limit is set to 0, no maximum will be enforced.
No maximum is the default behavior.
-dv, --define-version TEXT Define-XML version used for validation
-dv, --define-version TEXT Define-XML version used for validation. DEFINE_VERSION environment variable can be used to pass value.
-dxp, --define-xml-path Path to define-xml file.
-vx, --validate-xml Enable XML validation (default 'y' to enable, otherwise disable).
--whodrug TEXT Path to directory with WHODrug dictionary
Expand All @@ -221,9 +224,12 @@ This will show the list of validation options.
--snomed-url TEXT Base url of snomed api to use. (ex. https://snowstorm.snomedtools.org/snowstorm/snomed-ct)
--snomed-edition TEXT Edition of snomed to use. (ex. SNOMEDCT-US)
-r, --rules TEXT Specify rule core ID ex. CORE-000001. Can be specified multiple times.
RULES environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-er, --exclude-rules TEXT Specify rule core ID to exclude, ex. CORE-000001. Can be specified multiple times.
EXCLUDE_RULES environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-lr, --local-rules TEXT Specify relative path to directory or file containing
local rule yml and/or json rule files.
LOCAL_RULES environment variable can be used to pass values separated by ':' on Unix and ';' for Windows.
-cs, --custom-standard Adding this flag tells engine to use a custom standard specified with -s and -v
that has been uploaded to the cache using update-cache
-cse, --custom-standard-encoding TEXT
Expand Down
1 change: 1 addition & 0 deletions cdisc_rules_engine/enums/dataformat_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ class DataFormatTypes(BaseEnum):
USDM = "USDM"
XLSX = "XLSX"
XPT = "XPT"
CSV = "CSV"
5 changes: 5 additions & 0 deletions cdisc_rules_engine/exceptions/custom_exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,11 @@ class CTPackageNotFoundError(EngineError):
description = "Controlled terminology package(s) not found"


class InvalidCSVFile(EngineError):
    """Raised when a CSV data file cannot be decoded or parsed."""

    code = 400
    description = "CSV data is malformed."


class NumberOfAttemptsExceeded(EngineError):
    """Raised when an operation exceeds its allowed number of attempts."""

    pass

Expand Down
8 changes: 6 additions & 2 deletions cdisc_rules_engine/interfaces/data_reader_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,8 +11,8 @@ def __init__(
self, dataset_implementation=PandasDataset, encoding: str = DEFAULT_ENCODING
):
"""
:param dataset_implementation DatasetInterface: The dataset type to return.
:param encoding str: The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8).
:param DatasetInterface dataset_implementation : The dataset type to return.
:param str encoding : The encoding to use when reading files. Defaults to DEFAULT_ENCODING (e.g. utf-8).
"""
self.dataset_implementation = dataset_implementation
self.encoding = encoding
Expand All @@ -26,3 +26,7 @@ def read(self, data):

    def from_file(self, file_path):
        """Read the dataset file at ``file_path`` and return its contents.

        Abstract: concrete reader implementations must override this.
        """
        raise NotImplementedError

    def to_parquet(self, file_path) -> tuple[int, str]:
        """Convert the dataset file at ``file_path`` to a parquet file.

        Returns a tuple of (number of rows, path to the created parquet file).
        Abstract: concrete reader implementations must override this.
        """
        raise NotImplementedError
168 changes: 168 additions & 0 deletions cdisc_rules_engine/services/csv_metadata_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import logging
from datetime import datetime
from pathlib import Path

import pandas as pd

from cdisc_rules_engine.constants import DEFAULT_ENCODING


class DatasetCSVMetadataReader:
    """
    Extracts metadata for a CSV dataset file.

    Variable-level metadata is read from a sibling ``variables.csv`` file and
    the dataset label from a sibling ``tables.csv`` file, both expected in the
    same directory as the data file.  Record count and first record are read
    from the data file itself.
    """

    # Columns that must be present in variables.csv for variable metadata.
    _REQUIRED_VARIABLE_COLUMNS = ("dataset", "variable", "label", "type", "length")

    def __init__(
        self, file_path: str, file_name: str, encoding: str = DEFAULT_ENCODING
    ):
        """
        :param str file_path: Full path to the CSV data file.
        :param str file_name: Name of the data file; its stem is the dataset name.
        :param str encoding: Encoding used for all CSV reads.
        """
        self.file_path = file_path
        self.file_name = file_name
        self.encoding = encoding
        # Fetch the logger once instead of in every method.
        self._logger = logging.getLogger("validator")

    def read(self) -> dict:
        """
        Build and return the metadata dictionary for the dataset.

        Always contains dataset name, modification date and (empty) ADaM info;
        variable metadata, record counts and the dataset label are merged in
        when their source files are present and readable.
        """
        dataset_name = Path(self.file_name).stem.lower()
        variables_file_path = Path(self.file_path).parent / "variables.csv"

        if not variables_file_path.exists():
            self._logger.info("No variables file found for %s", dataset_name)
            variables_meta = {}
        else:
            variables_meta = self.__get_variable_metadata(
                dataset_name, variables_file_path
            )

        metadata = {
            "dataset_name": dataset_name.upper(),
            "dataset_modification_date": datetime.fromtimestamp(
                Path(self.file_path).stat().st_mtime
            ).isoformat(),
            # ADaM-specific metadata cannot be derived from CSV; kept empty.
            "adam_info": {
                "categorization_scheme": {},
                "w_indexes": {},
                "period": {},
                "selection_algorithm": {},
            },
        }
        metadata.update(variables_meta)
        metadata.update(self.__data_meta())
        metadata.update(self.__dataset_label())
        return metadata

    def __read_csv(self, file_path, **kwargs):
        """Read a CSV with pandas; log and return ``None`` on any failure.

        Centralizes error handling so every caller reports the file that was
        actually being read (the original code sometimes logged the data file
        path while reading a sibling metadata file).
        """
        try:
            return pd.read_csv(file_path, encoding=self.encoding, **kwargs)
        except (UnicodeDecodeError, UnicodeError) as e:
            self._logger.error(
                f"\n Error reading CSV from: {file_path}"
                f"\n Failed to decode with {self.encoding} encoding: {e}"
                f"\n Please specify the correct encoding using the -e flag."
            )
        except Exception as e:
            self._logger.error("Error reading CSV file %s. %s", file_path, e)
        return None

    def __get_variable_metadata(
        self, dataset_name: str, variables_file_path: Path
    ) -> dict:
        """Return variable-level metadata for ``dataset_name`` from variables.csv.

        Returns ``{}`` when the file is unreadable, lacks the required columns,
        or contains no rows for this dataset.
        """
        meta_df = self.__read_csv(variables_file_path)
        if meta_df is None:
            return {}

        # Guard against malformed variables.csv instead of raising KeyError,
        # mirroring the column check done for tables.csv.
        missing = [
            col
            for col in self._REQUIRED_VARIABLE_COLUMNS
            if col not in meta_df.columns
        ]
        if missing:
            self._logger.error(
                "variables.csv %s is missing required columns: %s",
                variables_file_path,
                ", ".join(missing),
            )
            return {}

        # Normalize e.g. "ae.csv" / "AE.xpt" to the dataset stem "ae".
        meta_df["dataset"] = meta_df["dataset"].apply(
            lambda value: Path(str(value)).stem.lower()
        )
        dataset_meta_df = meta_df[meta_df["dataset"] == dataset_name]
        if dataset_meta_df.empty:
            self._logger.info("No dataset metadata found for %s", dataset_name)
            return {}

        variable_names = dataset_meta_df["variable"].tolist()
        variable_labels = dataset_meta_df["label"].tolist()
        return {
            "variable_names": variable_names,
            "variable_labels": variable_labels,
            # variables.csv carries no format information; use empty formats.
            "variable_formats": [""] * len(variable_names),
            "variable_name_to_label_map": dict(zip(variable_names, variable_labels)),
            "variable_name_to_data_type_map": dict(
                zip(variable_names, dataset_meta_df["type"])
            ),
            "variable_name_to_size_map": {
                var: (int(length) if pd.notna(length) else None)
                for var, length in zip(variable_names, dataset_meta_df["length"])
            },
            "number_of_variables": len(variable_names),
        }

    def __dataset_label(self) -> dict:
        """Return ``{"dataset_label": ...}`` from tables.csv, or ``{}``."""
        tables_file_path = Path(self.file_path).parent / "tables.csv"
        if not tables_file_path.exists():
            return {}

        tables_df = self.__read_csv(tables_file_path)
        if tables_df is None:
            return {}

        if "Filename" not in tables_df.columns or "Label" not in tables_df.columns:
            return {}

        tables_df["dataset"] = tables_df["Filename"].apply(
            lambda value: Path(str(value)).stem.lower()
        )
        current_dataset = Path(self.file_name).stem.lower()
        match = tables_df[tables_df["dataset"] == current_dataset]
        if match.empty:
            return {}
        return {"dataset_label": str(match.iloc[0]["Label"])}

    def __data_meta(self) -> dict:
        """Return the record count and first data record of the dataset file."""
        import csv  # local import: only needed for record counting

        result = {
            "dataset_length": 0,
            "first_record": {},
        }

        first_row_df = self.__read_csv(self.file_path, nrows=1)
        if first_row_df is None:
            return result
        if not first_row_df.empty:
            result["first_record"] = (
                first_row_df.iloc[0].fillna("").astype(str).to_dict()
            )

        try:
            with open(self.file_path, encoding=self.encoding, newline="") as f:
                # csv.reader respects quoted fields containing newlines, which
                # a raw line count would miscount. Subtract the header row.
                result["dataset_length"] = max(sum(1 for _ in csv.reader(f)) - 1, 0)
        except (UnicodeDecodeError, UnicodeError) as e:
            self._logger.error(
                f"\n Error reading CSV from: {self.file_path}"
                f"\n Failed to decode with {self.encoding} encoding: {e}"
                f"\n Please specify the correct encoding using the -e flag."
            )
        except Exception as e:
            self._logger.error("Error reading CSV file %s. %s", self.file_path, e)

        return result
50 changes: 50 additions & 0 deletions cdisc_rules_engine/services/data_readers/csv_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
import tempfile

from cdisc_rules_engine.exceptions.custom_exceptions import InvalidCSVFile
from cdisc_rules_engine.interfaces import DataReaderInterface
import pandas as pd


class CSVReader(DataReaderInterface):
    """Reads CSV data files into pandas dataframes or parquet files."""

    def read(self, data):
        """
        Function for reading data from a specific file type and returning a
        pandas dataframe of the data.

        Not supported for CSV; use ``from_file`` instead.
        """
        raise NotImplementedError

    def from_file(self, file_path):
        """
        Read the CSV file at ``file_path`` into a pandas dataframe.

        :raises InvalidCSVFile: if the file cannot be decoded or parsed.
        """
        try:
            with open(file_path, "r", encoding=self.encoding) as fp:
                return pd.read_csv(fp, sep=",", header=0, index_col=False)
        except (UnicodeDecodeError, UnicodeError) as e:
            raise InvalidCSVFile(
                f"\n Error reading CSV from: {file_path}"
                f"\n Failed to decode with {self.encoding} encoding: {e}"
                f"\n Please specify the correct encoding using the -e flag."
            ) from e
        except Exception as e:
            raise InvalidCSVFile(
                f"\n Error reading CSV from: {file_path}"
                f"\n {type(e).__name__}: {e}"
            ) from e

    def to_parquet(self, file_path: str) -> tuple[int, str]:
        """
        Convert the CSV at ``file_path`` to a parquet file, streaming in
        chunks so arbitrarily large files never have to fit in memory.

        Returns a tuple of (number of rows, path to the parquet file).

        :raises InvalidCSVFile: if the file cannot be decoded.
        """
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".parquet")
        # Close the handle immediately: only the reserved path is needed, and
        # keeping it open leaks the descriptor (and blocks rewriting the file
        # on Windows).
        temp_file.close()

        num_rows = 0
        created = False
        try:
            for chunk in pd.read_csv(
                file_path, chunksize=20000, encoding=self.encoding
            ):
                num_rows += len(chunk)
                if created:
                    chunk.to_parquet(temp_file.name, engine="fastparquet", append=True)
                else:
                    chunk.to_parquet(temp_file.name, engine="fastparquet")
                    created = True
            if not created:
                # Header-only CSV yields no chunks: write an empty parquet with
                # the correct columns instead of leaving a zero-byte temp file
                # that no parquet reader can open.
                pd.read_csv(file_path, nrows=0, encoding=self.encoding).to_parquet(
                    temp_file.name, engine="fastparquet"
                )
        except (UnicodeDecodeError, UnicodeError) as e:
            # Mirror from_file's decode handling so callers see one error type.
            raise InvalidCSVFile(
                f"\n Error reading CSV from: {file_path}"
                f"\n Failed to decode with {self.encoding} encoding: {e}"
                f"\n Please specify the correct encoding using the -e flag."
            ) from e

        return num_rows, temp_file.name
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
DataReaderInterface,
FactoryInterface,
)
from cdisc_rules_engine.services.data_readers.csv_reader import CSVReader
from cdisc_rules_engine.services.data_readers.xpt_reader import XPTReader
from cdisc_rules_engine.services.data_readers.dataset_json_reader import (
DatasetJSONReader,
Expand All @@ -19,12 +20,13 @@


class DataReaderFactory(FactoryInterface):
_reader_map = {
_reader_map: dict[str, Type[DataReaderInterface]] = {
DataFormatTypes.XPT.value: XPTReader,
DataFormatTypes.PARQUET.value: ParquetReader,
DataFormatTypes.JSON.value: DatasetJSONReader,
DataFormatTypes.NDJSON.value: DatasetNDJSONReader,
DataFormatTypes.USDM.value: JSONReader,
DataFormatTypes.CSV.value: CSVReader,
}

def __init__(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from cdisc_rules_engine.services.datasetndjson_metadata_reader import (
DatasetNDJSONMetadataReader,
)
from cdisc_rules_engine.services.csv_metadata_reader import DatasetCSVMetadataReader
from cdisc_rules_engine.utilities.utils import (
convert_file_size,
extract_file_name_from_path_string,
Expand Down Expand Up @@ -194,6 +195,7 @@ def read_metadata(
DataFormatTypes.XPT.value: DatasetXPTMetadataReader,
DataFormatTypes.JSON.value: DatasetJSONMetadataReader,
DataFormatTypes.NDJSON.value: DatasetNDJSONMetadataReader,
DataFormatTypes.CSV.value: DatasetCSVMetadataReader,
}

file_extension = file_name.split(".")[1].upper()
Expand Down
Loading
Loading