Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion cdisc_rules_engine/check_operators/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,7 +423,7 @@ def flatten_list(data, items):


vectorized_apply_regex = np.vectorize(apply_regex)
vectorized_is_complete_date = np.vectorize(is_complete_date)
vectorized_is_complete_date = np.vectorize(is_complete_date, otypes=[bool])
vectorized_compare_dates = np.vectorize(compare_dates)
vectorized_is_valid = np.vectorize(is_valid_date)
vectorized_is_valid_duration = np.vectorize(is_valid_duration)
Expand Down
25 changes: 10 additions & 15 deletions cdisc_rules_engine/dataset_builders/base_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import (
DefineXMLReaderFactory,
)
from cdisc_rules_engine.utilities.utils import (
get_corresponding_datasets,
from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets
from cdisc_rules_engine.utilities.sdtm_utilities import (
tag_source,
)
from typing import List, Iterable, Optional
Expand Down Expand Up @@ -160,17 +160,14 @@ def get_define_xml_variables_metadata(self) -> List[dict]:
define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader(
self.dataset_path, self.define_xml_path, self.data_service, self.cache
)
# If domain is not set and this is a SUPP domain, use rdomain
domain = self.dataset_metadata.domain
if not domain and getattr(self.dataset_metadata, "is_supp", False):
domain = getattr(self.dataset_metadata, "rdomain", None)
name = getattr(self.dataset_metadata, "name", None)
return define_xml_reader.extract_variables_metadata(
domain_name=domain, name=name
)
if not domain:
return []
return define_xml_reader.extract_variables_metadata(domain_name=domain)
domain = (
self.dataset_metadata.domain
or self.dataset_metadata.rdomain
or self.dataset_metadata.unsplit_name
)
return define_xml_reader.extract_variables_metadata(
domain_name=domain, name=self.dataset_metadata.name
)

def get_define_xml_value_level_metadata(self) -> List[dict]:
"""
Expand Down Expand Up @@ -204,10 +201,8 @@ def get_library_variables_metadata(self) -> DatasetInterface:
else:
domain = self.dataset_metadata.domain
variables: List[dict] = sdtm_utilities.get_variables_metadata_from_standard(
domain=self.dataset_metadata.unsplit_name,
library_metadata=self.library_metadata,
data_service=self.data_service,
dataset=self.get_dataset_contents(),
datasets=self.datasets,
dataset_metadata=self.dataset_metadata,
dataset_path=self.dataset_path,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
from cdisc_rules_engine.utilities.utils import (
from cdisc_rules_engine.utilities.sdtm_utilities import (
get_corresponding_datasets,
)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from jsonschema import validators, exceptions
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
from cdisc_rules_engine.models.dataset import DatasetInterface
from cdisc_rules_engine.utilities.utils import tag_source
from cdisc_rules_engine.utilities.sdtm_utilities import tag_source


class JsonSchemaCheckDatasetBuilder(BaseDatasetBuilder):
Expand Down
3 changes: 2 additions & 1 deletion cdisc_rules_engine/enums/execution_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ class ExecutionStatus(BaseEnum):
class SkippedReason(BaseEnum):
COLUMN_NOT_FOUND_IN_DATA = "Column not found in data"
DOMAIN_NOT_FOUND = "Domain not found"
SCHEMA_VALIDATION_IS_OFF = "Schema validation is off"
EMPTY_DATASET = "Empty dataset"
OUTSIDE_SCOPE = "Outside scope"
SCHEMA_VALIDATION_IS_OFF = "Schema validation is off"


class ExecutionError(BaseEnum):
Expand Down
4 changes: 2 additions & 2 deletions cdisc_rules_engine/models/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def generate_dataset_error_objects(self, message: str, results: pd.Series):
# get targets in the order they appear in rule.output_variables
target_names: List[str] = RuleProcessor.extract_target_names_from_rule(
self.rule,
self.dataset_metadata.domain_cleaned,
self.dataset_metadata.wildcard_replacement,
self.variable.dataset.columns.tolist(),
)
target_names = self._get_target_names_from_list_values(
Expand Down Expand Up @@ -242,7 +242,7 @@ def generate_targeted_error_object( # noqa: C901
),
targets=targets_list,
errors=errors_list,
message=message.replace("--", self.dataset_metadata.domain_cleaned or ""),
message=message.replace("--", self.dataset_metadata.wildcard_replacement),
)

def _generate_errors_by_target_presence(
Expand Down
4 changes: 4 additions & 0 deletions cdisc_rules_engine/models/dataset/dask_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,10 @@ def __len__(self):

return self.length

@property
def empty(self):
return len(self) == 0

def __deepcopy__(self, memo):
pandas_df = self._data.compute()
fresh_dask_df = dd.from_pandas(pandas_df, npartitions=DEFAULT_NUM_PARTITIONS)
Expand Down
5 changes: 5 additions & 0 deletions cdisc_rules_engine/models/dataset_metadata.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from dataclasses import dataclass
from typing import Union
from os.path import basename


@dataclass
Expand All @@ -17,3 +18,7 @@ class DatasetMetadata:
full_path: Union[str, None] = None
first_record: Union[dict, None] = None
original_path: Union[str, None] = None

@property
def data_service_identifier(self) -> str:
return basename(self.full_path) if self.full_path else self.filename
44 changes: 22 additions & 22 deletions cdisc_rules_engine/models/sdtm_dataset_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,35 +12,35 @@ class SDTMDatasetMetadata(DatasetMetadata):

"""
Examples
| name | unsplit_name | is_supp | domain | rdomain | is_ap | ap_suffix | domain_is_custom | related_domain | related_domain_is_custom |
| -------- | ------------ | ------- | ------ | ------- | ----- | --------- | ----------------- | -------------- | ------------------------ |
| QS | QS | False | QS | None | False | | False | | |
| QSX | QS | False | QS | None | False | | False | | |
| QSXX | QS | False | QS | None | False | | False | | |
| SUPPQS | SUPPQS | True | None | QS | False | | False | QS | |
| SUPPQSX | SUPPQS | True | None | QS | False | | False | QS | |
| SUPPQSXX | SUPPQS | True | None | QS | False | | False | QS | |
| APQS | APQS | False | APQS | None | True | QS | False | QS | |
| APQSX | APQS | False | APQS | None | True | QS | False | QS | |
| APQSXX | APQS | False | APQS | None | True | QS | False | QS | |
| SQAPQS | SQAPQS | True | None | APQS | True | | False | QS | |
| SQAPQSX | SQAPQS | True | None | APQS | True | | False | QS | |
| SQAPQSXX | SQAPQS | True | None | APQS | True | | False | | |
| RELREC | RELREC | False | None | None | False | | False | | |
| XX | XX | False | XX | None | False | | True | | |
| SUPPXX | SUPPXX | True | None | XX | False | | False | XX | True |
| APXX | APXX | False | APXX | None | True | XX | False | XX | True |
| SQAPXX | SQAPXX | True | None | APXX | True | | False | XX | True |
| FA | FA | False | FA | None | False | | False | | |
| name | unsplit_name | is_supp | domain | wildcard_replacement | rdomain | is_ap | ap_suffix | domain_is_custom | related_domain | related_domain_is_custom |
| -------- | ------------ | ------- | ------ | -------------------- | ------- | ----- | --------- | ---------------- | -------------- | ------------------------ |
| QS | QS | False | QS | QS | None | False | | False | | |
| QSX | QS | False | QS | QS | None | False | | False | | |
| QSXX | QS | False | QS | QS | None | False | | False | | |
| SUPPQS | SUPPQS | True | None | | QS | False | | False | QS | |
| SUPPQSX | SUPPQS | True | None | | QS | False | | False | QS | |
| SUPPQSXX | SUPPQS | True | None | | QS | False | | False | QS | |
| APQS | APQS | False | APQS | QS | None | True | QS | False | QS | |
| APQSX | APQS | False | APQS | QS | None | True | QS | False | QS | |
| APQSXX | APQS | False | APQS | QS | None | True | QS | False | QS | |
| SQAPQS | SQAPQS | True | None | | APQS | True | | False | QS | |
| SQAPQSX | SQAPQS | True | None | | APQS | True | | False | QS | |
| SQAPQSXX | SQAPQS | True | None | | APQS | True | | False | | |
| RELREC | RELREC | False | None | | None | False | | False | | |
| XX | XX | False | XX | XX | None | False | | True | | |
| SUPPXX | SUPPXX | True | None | | XX | False | | False | XX | True |
| APXX | APXX | False | APXX | XX | None | True | XX | False | XX | True |
| SQAPXX | SQAPXX | True | None | | APXX | True | | False | XX | True |
| FA | FA | False | FA | FA | None | False | | False | | |
""" # noqa: E501 W291

@property
def domain(self) -> Union[str, None]:
return (self.first_record or {}).get("DOMAIN", None)

@property
def domain_cleaned(self) -> Union[str, None]:
return self.domain.replace("AP", "") if self.domain else None
def wildcard_replacement(self) -> Union[str, None]:
return self.ap_suffix or self.domain or ""

@property
def rdomain(self) -> Union[str, None]:
Expand Down
24 changes: 11 additions & 13 deletions cdisc_rules_engine/operations/base_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,8 +140,10 @@ def _handle_grouped_result(self, result):
result = self._rename_grouping_columns(result)
grouping_columns = self._get_grouping_columns()
target_columns = grouping_columns + [self.params.operation_id]
target_columns = self._resolve_variable_name(target_columns, self.params.domain)
grouping_columns = self._resolve_variable_name(
target_columns = self._replace_variable_wildcard(
target_columns, self.params.domain
)
grouping_columns = self._replace_variable_wildcard(
grouping_columns, self.params.domain
)
result = result.reset_index()
Expand Down Expand Up @@ -225,13 +227,9 @@ def _expand_operation_results_in_grouping(self, grouping_list):
def _get_variables_metadata_from_standard(self) -> List[dict]:
# TODO: Update to handle other standard types: adam, cdash, etc.

# self.params.domain is unsplit_name
domain_for_library = self.params.domain
return sdtm_utilities.get_variables_metadata_from_standard(
domain=domain_for_library,
library_metadata=self.library_metadata,
data_service=self.data_service,
dataset=self.evaluation_dataset,
dataset_metadata=self.data_service.get_raw_dataset_metadata(
dataset_name=self.params.dataset_path, datasets=self.params.datasets
),
Expand All @@ -250,17 +248,15 @@ def get_allowed_variable_permissibility(self, variable_metadata: dict):
def _get_variable_names_list(self, domain, dataframe):
# get variables metadata from the standard model
variables_metadata: List[dict] = (
self._get_variables_metadata_from_standard_model(domain, dataframe)
self._get_variables_metadata_from_standard_model(dataframe)
)
# create a list of variable names in accordance to the "ordinal" key
variable_names_list = self._replace_variable_wildcards(
variables_metadata, domain
)
return list(OrderedDict.fromkeys(variable_names_list))

def _get_variables_metadata_from_standard_model(
self, domain, dataframe
) -> List[dict]:
def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]:
"""
Gets variables metadata for the given class and domain from cache.
The cache stores CDISC Library metadata.
Expand All @@ -287,7 +283,6 @@ def _get_variables_metadata_from_standard_model(
# TODO: Update to handle multiple standard types.

return sdtm_utilities.get_variables_metadata_from_standard_model(
domain=domain,
dataframe=dataframe,
datasets=self.params.datasets,
dataset_path=self.params.dataset_path,
Expand All @@ -300,10 +295,13 @@ def _get_variables_metadata_from_standard_model(

@staticmethod
def _replace_variable_wildcards(variables_metadata, domain):
return [var["name"].replace("--", domain) for var in variables_metadata]
return [
BaseOperation._replace_variable_wildcard(var["name"], domain)
for var in variables_metadata
]

@staticmethod
def _resolve_variable_name(variable_name, domain: str):
def _replace_variable_wildcard(variable_name, domain: str):
if isinstance(variable_name, list):
return [
var.replace("--", domain) if "--" in var else var
Expand Down
12 changes: 5 additions & 7 deletions cdisc_rules_engine/operations/day_data_validator.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
from cdisc_rules_engine.exceptions.custom_exceptions import DomainNotFoundError
from cdisc_rules_engine.operations.base_operation import BaseOperation
from datetime import datetime
import numpy as np
from cdisc_rules_engine.services import logger
from cdisc_rules_engine.utilities.utils import tag_source
from cdisc_rules_engine.utilities.sdtm_utilities import tag_source


class DayDataValidator(BaseOperation):
def _execute_operation(self):
logger.info(
f"trying to find '{self.params.target}' in the {self.evaluation_dataset['DOMAIN'].iloc[0]}."
)
dtc_value = self.evaluation_dataset[self.params.target].map(
self.parse_timestamp
)
Expand All @@ -18,8 +15,9 @@ def _execute_operation(self):
dataset for dataset in self.params.datasets if dataset.domain == "DM"
]
if not dm_datasets:
# Return none for all values if dm is not provided.
return [0] * len(self.evaluation_dataset[self.params.target])
raise DomainNotFoundError(
"Operation dy requires DM domain but Domain not found in datasets"
)
if len(dm_datasets) > 1:
dm_data = self.data_service.concat_split_datasets(
self.data_service.get_dataset, dm_datasets
Expand Down
6 changes: 3 additions & 3 deletions cdisc_rules_engine/operations/distinct.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,9 +70,9 @@ def get_existing_column_names(group):

def _get_referenced_datasets(self):
referenced_datasets = {}
for dataset_meta in self.data_service.data:
dataset = self.data_service.get_dataset(dataset_meta.filename)
referenced_datasets[dataset_meta.name] = dataset
for dataset_metadata in self.data_service.get_datasets():
dataset = self.data_service.get_dataset(dataset_metadata.filename)
referenced_datasets[dataset_metadata.name] = dataset
return referenced_datasets

def _unique_values_for_column(self, column):
Expand Down
4 changes: 2 additions & 2 deletions cdisc_rules_engine/operations/domain_is_custom.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from cdisc_rules_engine.operations.base_operation import BaseOperation
from cdisc_rules_engine.utilities.sdtm_utilities import is_custom_domain


class DomainIsCustom(BaseOperation):
Expand All @@ -8,5 +9,4 @@ def _execute_operation(self):
given domain is in standard domains.
If no -> the domain is custom.
"""
standard_data: dict = self.library_metadata.standard_metadata
return self.params.domain not in standard_data.get("domains", {})
return is_custom_domain(self.library_metadata, self.params.domain)
4 changes: 3 additions & 1 deletion cdisc_rules_engine/operations/expected_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ def _execute_operation(self):

return list(
{
var["name"].replace("--", self.params.domain): None
BaseOperation._replace_variable_wildcard(
var["name"], self.params.domain
): None
for var in variables_metadata
if self.get_allowed_variable_permissibility(var) == EXPECTED
}.keys()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def _get_model_filtered_variables(self):
key = self.params.key_name
val = self.params.key_value
model_variables: List[dict] = self._get_variables_metadata_from_standard_model(
self.params.domain, self.params.dataframe
self.params.dataframe
)
filtered_model = [var for var in model_variables if var.get(key) == val]
variable_names_list = self._replace_variable_wildcards(
Expand Down
9 changes: 6 additions & 3 deletions cdisc_rules_engine/operations/library_column_order.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ def _execute_operation(self):
variables_metadata: List[dict] = self._get_variables_metadata_from_standard()

# create a list of variable names in accordance to the "ordinal" key
variable_names_list = [
var["name"].replace("--", self.params.domain) for var in variables_metadata
]
variable_names_list = BaseOperation._replace_variable_wildcards(
variables_metadata,
self.data_service.get_raw_dataset_metadata(
dataset_name=self.params.dataset_path, datasets=self.params.datasets
).wildcard_replacement,
)
return list(OrderedDict.fromkeys(variable_names_list))
4 changes: 3 additions & 1 deletion cdisc_rules_engine/operations/permissible_variables.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ def _execute_operation(self):

return list(
{
var["name"].replace("--", self.params.domain): None
BaseOperation._replace_variable_wildcard(
var["name"], self.params.domain
): None
for var in variables_metadata
if self.get_allowed_variable_permissibility(var) == PERMISSIBLE
}.keys()
Expand Down
Loading
Loading