diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 085cb48d9..4a9348b8e 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -423,7 +423,7 @@ def flatten_list(data, items): vectorized_apply_regex = np.vectorize(apply_regex) -vectorized_is_complete_date = np.vectorize(is_complete_date) +vectorized_is_complete_date = np.vectorize(is_complete_date, otypes=[bool]) vectorized_compare_dates = np.vectorize(compare_dates) vectorized_is_valid = np.vectorize(is_valid_date) vectorized_is_valid_duration = np.vectorize(is_valid_duration) diff --git a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py index 517f00739..7ad298fa6 100644 --- a/cdisc_rules_engine/dataset_builders/base_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/base_dataset_builder.py @@ -5,8 +5,8 @@ from cdisc_rules_engine.services.define_xml.define_xml_reader_factory import ( DefineXMLReaderFactory, ) -from cdisc_rules_engine.utilities.utils import ( - get_corresponding_datasets, +from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets +from cdisc_rules_engine.utilities.sdtm_utilities import ( tag_source, ) from typing import List, Iterable, Optional @@ -160,17 +160,14 @@ def get_define_xml_variables_metadata(self) -> List[dict]: define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader( self.dataset_path, self.define_xml_path, self.data_service, self.cache ) - # If domain is not set and this is a SUPP domain, use rdomain - domain = self.dataset_metadata.domain - if not domain and getattr(self.dataset_metadata, "is_supp", False): - domain = getattr(self.dataset_metadata, "rdomain", None) - name = getattr(self.dataset_metadata, "name", None) - return define_xml_reader.extract_variables_metadata( - domain_name=domain, name=name - ) - if not domain: - return [] - return define_xml_reader.extract_variables_metadata(domain_name=domain) + domain = ( + self.dataset_metadata.domain + or self.dataset_metadata.rdomain + or self.dataset_metadata.unsplit_name + ) + return define_xml_reader.extract_variables_metadata( + domain_name=domain, name=self.dataset_metadata.name + ) def get_define_xml_value_level_metadata(self) -> List[dict]: """ @@ -204,10 +201,8 @@ def get_library_variables_metadata(self) -> DatasetInterface: else: domain = self.dataset_metadata.domain variables: List[dict] = sdtm_utilities.get_variables_metadata_from_standard( - domain=self.dataset_metadata.unsplit_name, library_metadata=self.library_metadata, data_service=self.data_service, - dataset=self.get_dataset_contents(), datasets=self.datasets, dataset_metadata=self.dataset_metadata, dataset_path=self.dataset_path, diff --git a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py index 8abec8ed0..426cbdacf 100644 --- a/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/contents_dataset_builder.py @@ -1,5 +1,5 @@ from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder -from cdisc_rules_engine.utilities.utils import ( +from cdisc_rules_engine.utilities.sdtm_utilities import ( get_corresponding_datasets, ) diff --git a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py index 865d63407..c139347b0 100644 --- a/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/json_schema_check_dataset_builder.py @@ -6,7 +6,7 @@ from jsonschema import validators, exceptions from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder from cdisc_rules_engine.models.dataset import DatasetInterface -from cdisc_rules_engine.utilities.utils import tag_source +from cdisc_rules_engine.utilities.sdtm_utilities import tag_source class JsonSchemaCheckDatasetBuilder(BaseDatasetBuilder): diff --git a/cdisc_rules_engine/enums/execution_status.py b/cdisc_rules_engine/enums/execution_status.py index 60f9d3a0d..819446c5f 100644 --- a/cdisc_rules_engine/enums/execution_status.py +++ b/cdisc_rules_engine/enums/execution_status.py @@ -12,8 +12,9 @@ class ExecutionStatus(BaseEnum): class SkippedReason(BaseEnum): COLUMN_NOT_FOUND_IN_DATA = "Column not found in data" DOMAIN_NOT_FOUND = "Domain not found" - SCHEMA_VALIDATION_IS_OFF = "Schema validation is off" + EMPTY_DATASET = "Empty dataset" OUTSIDE_SCOPE = "Outside scope" + SCHEMA_VALIDATION_IS_OFF = "Schema validation is off" class ExecutionError(BaseEnum): diff --git a/cdisc_rules_engine/models/actions.py b/cdisc_rules_engine/models/actions.py index 2150d94c6..bb81c274e 100644 --- a/cdisc_rules_engine/models/actions.py +++ b/cdisc_rules_engine/models/actions.py @@ -52,7 +52,7 @@ def generate_dataset_error_objects(self, message: str, results: pd.Series): # get targets in the order they appear in rule.output_variables target_names: List[str] = RuleProcessor.extract_target_names_from_rule( self.rule, - self.dataset_metadata.domain_cleaned, + self.dataset_metadata.wildcard_replacement, self.variable.dataset.columns.tolist(), ) target_names = self._get_target_names_from_list_values( @@ -242,7 +242,7 @@ def generate_targeted_error_object( # noqa: C901 ), targets=targets_list, errors=errors_list, - message=message.replace("--", self.dataset_metadata.domain_cleaned or ""), + message=message.replace("--", self.dataset_metadata.wildcard_replacement), ) def _generate_errors_by_target_presence( diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index 36736d00a..7a6449d31 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -99,6 +99,10 @@ def __len__(self): return self.length + @property + def empty(self): + return len(self) == 0 + def __deepcopy__(self, memo): pandas_df = self._data.compute() fresh_dask_df = dd.from_pandas(pandas_df, npartitions=DEFAULT_NUM_PARTITIONS) diff --git a/cdisc_rules_engine/models/dataset_metadata.py b/cdisc_rules_engine/models/dataset_metadata.py index eae183ba4..b4b8a2dfd 100644 --- a/cdisc_rules_engine/models/dataset_metadata.py +++ b/cdisc_rules_engine/models/dataset_metadata.py @@ -1,5 +1,6 @@ from dataclasses import dataclass from typing import Union +from os.path import basename @dataclass @@ -17,3 +18,7 @@ class DatasetMetadata: full_path: Union[str, None] = None first_record: Union[dict, None] = None original_path: Union[str, None] = None + + @property + def data_service_identifier(self) -> str: + return basename(self.full_path) if self.full_path else self.filename diff --git a/cdisc_rules_engine/models/sdtm_dataset_metadata.py b/cdisc_rules_engine/models/sdtm_dataset_metadata.py index bd48ca8ba..4424e4d07 100644 --- a/cdisc_rules_engine/models/sdtm_dataset_metadata.py +++ b/cdisc_rules_engine/models/sdtm_dataset_metadata.py @@ -12,26 +12,26 @@ class SDTMDatasetMetadata(DatasetMetadata): """ Examples - | name | unsplit_name | is_supp | domain | rdomain | is_ap | ap_suffix | domain_is_custom | related_domain | related_domain_is_custom | - | -------- | ------------ | ------- | ------ | ------- | ----- | --------- | ----------------- | -------------- | ------------------------ | - | QS | QS | False | QS | None | False | | False | | | - | QSX | QS | False | QS | None | False | | False | | | - | QSXX | QS | False | QS | None | False | | False | | | - | SUPPQS | SUPPQS | True | None | QS | False | | False | QS | | - | SUPPQSX | SUPPQS | True | None | QS | False | | False | QS | | - | SUPPQSXX | SUPPQS | True | None | QS | False | | False | QS | | - | APQS | APQS | False | APQS | None | True | QS | False | QS | | - | APQSX | APQS | False | APQS | None | True | QS | False | QS | | - | APQSXX | APQS | False | APQS | None | True | QS | False | QS | | - | SQAPQS | SQAPQS | True | None | APQS | True | | False | QS | | - | SQAPQSX | SQAPQS | True | None | APQS | True | | False | QS | | - | SQAPQSXX | SQAPQS | True | None | APQS | True | | False | | | - | RELREC | RELREC | False | None | None | False | | False | | | - | XX | XX | False | XX | None | False | | True | | | - | SUPPXX | SUPPXX | True | None | XX | False | | False | XX | True | - | APXX | APXX | False | APXX | None | True | XX | False | XX | True | - | SQAPXX | SQAPXX | True | None | APXX | True | | False | XX | True | - | FA | FA | False | FA | None | False | | False | | | + | name | unsplit_name | is_supp | domain | wildcard_replacement | rdomain | is_ap | ap_suffix | domain_is_custom | related_domain | related_domain_is_custom | + | -------- | ------------ | ------- | ------ | -------------------- | ------- | ----- | --------- | ---------------- | -------------- | ------------------------ | + | QS | QS | False | QS | QS | None | False | | False | | | + | QSX | QS | False | QS | QS | None | False | | False | | | + | QSXX | QS | False | QS | QS | None | False | | False | | | + | SUPPQS | SUPPQS | True | None | | QS | False | | False | QS | | + | SUPPQSX | SUPPQS | True | None | | QS | False | | False | QS | | + | SUPPQSXX | SUPPQS | True | None | | QS | False | | False | QS | | + | APQS | APQS | False | APQS | QS | None | True | QS | False | QS | | + | APQSX | APQS | False | APQS | QS | None | True | QS | False | QS | | + | APQSXX | APQS | False | APQS | QS | None | True | QS | False | QS | | + | SQAPQS | SQAPQS | True | None | | APQS | True | | False | QS | | + | SQAPQSX | SQAPQS | True | None | | APQS | True | | False | QS | | + | SQAPQSXX | SQAPQS | True | None | | APQS | True | | False | | | + | RELREC | RELREC | False | None | | None | False | | False | | | + | XX | XX | False | XX | XX | None | False | | True | | | + | SUPPXX | SUPPXX | True | None | | XX | False | | False | XX | True | + | APXX | APXX | False | APXX | XX | None | True | XX | False | XX | True | + | SQAPXX | SQAPXX | True | None | | APXX | True | | False | XX | True | + | FA | FA | False | FA | FA | None | False | | False | | | """ # noqa: E501 W291 @property @@ -39,8 +39,8 @@ def domain(self) -> Union[str, None]: return (self.first_record or {}).get("DOMAIN", None) @property - def domain_cleaned(self) -> Union[str, None]: - return self.domain.replace("AP", "") if self.domain else None + def wildcard_replacement(self) -> Union[str, None]: + return self.ap_suffix or self.domain or "" @property def rdomain(self) -> Union[str, None]: diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index a045a323b..22f1fd57e 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -140,8 +140,10 @@ def _handle_grouped_result(self, result): result = self._rename_grouping_columns(result) grouping_columns = self._get_grouping_columns() target_columns = grouping_columns + [self.params.operation_id] - target_columns = self._resolve_variable_name(target_columns, self.params.domain) - grouping_columns = self._resolve_variable_name( + target_columns = self._replace_variable_wildcard( + target_columns, self.params.domain + ) + grouping_columns = self._replace_variable_wildcard( grouping_columns, self.params.domain ) result = result.reset_index() @@ -225,13 +227,9 @@ def _expand_operation_results_in_grouping(self, grouping_list): def _get_variables_metadata_from_standard(self) -> List[dict]: # TODO: Update to handle other standard types: adam, cdash, etc. - # self.params.domain is unsplit_name - domain_for_library = self.params.domain return sdtm_utilities.get_variables_metadata_from_standard( - domain=domain_for_library, library_metadata=self.library_metadata, data_service=self.data_service, - dataset=self.evaluation_dataset, dataset_metadata=self.data_service.get_raw_dataset_metadata( dataset_name=self.params.dataset_path, datasets=self.params.datasets ), @@ -250,7 +248,7 @@ def get_allowed_variable_permissibility(self, variable_metadata: dict): def _get_variable_names_list(self, domain, dataframe): # get variables metadata from the standard model variables_metadata: List[dict] = ( - self._get_variables_metadata_from_standard_model(domain, dataframe) + self._get_variables_metadata_from_standard_model(dataframe) ) # create a list of variable names in accordance to the "ordinal" key variable_names_list = self._replace_variable_wildcards( @@ -258,9 +256,7 @@ def _get_variable_names_list(self, domain, dataframe): ) return list(OrderedDict.fromkeys(variable_names_list)) - def _get_variables_metadata_from_standard_model( - self, domain, dataframe - ) -> List[dict]: + def _get_variables_metadata_from_standard_model(self, dataframe) -> List[dict]: """ Gets variables metadata for the given class and domain from cache. The cache stores CDISC Library metadata. @@ -287,7 +283,6 @@ def _get_variables_metadata_from_standard_model( # TODO: Update to handle multiple standard types. return sdtm_utilities.get_variables_metadata_from_standard_model( - domain=domain, dataframe=dataframe, datasets=self.params.datasets, dataset_path=self.params.dataset_path, @@ -300,10 +295,13 @@ def _get_variables_metadata_from_standard_model( @staticmethod def _replace_variable_wildcards(variables_metadata, domain): - return [var["name"].replace("--", domain) for var in variables_metadata] + return [ + BaseOperation._replace_variable_wildcard(var["name"], domain) + for var in variables_metadata + ] @staticmethod - def _resolve_variable_name(variable_name, domain: str): + def _replace_variable_wildcard(variable_name, domain: str): if isinstance(variable_name, list): return [ var.replace("--", domain) if "--" in var else var diff --git a/cdisc_rules_engine/operations/day_data_validator.py b/cdisc_rules_engine/operations/day_data_validator.py index 3b4e92ca5..6a9b6269c 100644 --- a/cdisc_rules_engine/operations/day_data_validator.py +++ b/cdisc_rules_engine/operations/day_data_validator.py @@ -1,15 +1,12 @@ +from cdisc_rules_engine.exceptions.custom_exceptions import DomainNotFoundError from cdisc_rules_engine.operations.base_operation import BaseOperation from datetime import datetime import numpy as np -from cdisc_rules_engine.services import logger -from cdisc_rules_engine.utilities.utils import tag_source +from cdisc_rules_engine.utilities.sdtm_utilities import tag_source class DayDataValidator(BaseOperation): def _execute_operation(self): - logger.info( - f"trying to find '{self.params.target}' in the {self.evaluation_dataset['DOMAIN'].iloc[0]}." - ) dtc_value = self.evaluation_dataset[self.params.target].map( self.parse_timestamp ) @@ -18,8 +15,9 @@ def _execute_operation(self): dataset for dataset in self.params.datasets if dataset.domain == "DM" ] if not dm_datasets: - # Return none for all values if dm is not provided. - return [0] * len(self.evaluation_dataset[self.params.target]) + raise DomainNotFoundError( + "Operation dy requires DM domain but Domain not found in datasets" + ) if len(dm_datasets) > 1: dm_data = self.data_service.concat_split_datasets( self.data_service.get_dataset, dm_datasets diff --git a/cdisc_rules_engine/operations/distinct.py b/cdisc_rules_engine/operations/distinct.py index ac936d305..cecf00560 100644 --- a/cdisc_rules_engine/operations/distinct.py +++ b/cdisc_rules_engine/operations/distinct.py @@ -70,9 +70,9 @@ def get_existing_column_names(group): def _get_referenced_datasets(self): referenced_datasets = {} - for dataset_meta in self.data_service.data: - dataset = self.data_service.get_dataset(dataset_meta.filename) - referenced_datasets[dataset_meta.name] = dataset + for dataset_metadata in self.data_service.get_datasets(): + dataset = self.data_service.get_dataset(dataset_metadata.filename) + referenced_datasets[dataset_metadata.name] = dataset return referenced_datasets def _unique_values_for_column(self, column): diff --git a/cdisc_rules_engine/operations/domain_is_custom.py b/cdisc_rules_engine/operations/domain_is_custom.py index 3b9f86986..641b3476d 100644 --- a/cdisc_rules_engine/operations/domain_is_custom.py +++ b/cdisc_rules_engine/operations/domain_is_custom.py @@ -1,4 +1,5 @@ from cdisc_rules_engine.operations.base_operation import BaseOperation +from cdisc_rules_engine.utilities.sdtm_utilities import is_custom_domain class DomainIsCustom(BaseOperation): @@ -8,5 +9,4 @@ def _execute_operation(self): given domain is in standard domains. If no -> the domain is custom. """ - standard_data: dict = self.library_metadata.standard_metadata - return self.params.domain not in standard_data.get("domains", {}) + return is_custom_domain(self.library_metadata, self.params.domain) diff --git a/cdisc_rules_engine/operations/expected_variables.py b/cdisc_rules_engine/operations/expected_variables.py index e441236c2..4a6dba786 100644 --- a/cdisc_rules_engine/operations/expected_variables.py +++ b/cdisc_rules_engine/operations/expected_variables.py @@ -23,7 +23,9 @@ def _execute_operation(self): return list( { - var["name"].replace("--", self.params.domain): None + BaseOperation._replace_variable_wildcard( + var["name"], self.params.domain + ): None for var in variables_metadata if self.get_allowed_variable_permissibility(var) == EXPECTED }.keys() diff --git a/cdisc_rules_engine/operations/get_model_filtered_variables.py b/cdisc_rules_engine/operations/get_model_filtered_variables.py index 237eebaee..faff0daca 100644 --- a/cdisc_rules_engine/operations/get_model_filtered_variables.py +++ b/cdisc_rules_engine/operations/get_model_filtered_variables.py @@ -23,7 +23,7 @@ def _get_model_filtered_variables(self): key = self.params.key_name val = self.params.key_value model_variables: List[dict] = self._get_variables_metadata_from_standard_model( - self.params.domain, self.params.dataframe + self.params.dataframe ) filtered_model = [var for var in model_variables if var.get(key) == val] variable_names_list = self._replace_variable_wildcards( diff --git a/cdisc_rules_engine/operations/library_column_order.py b/cdisc_rules_engine/operations/library_column_order.py index 2aec11ede..7360de800 100644 --- a/cdisc_rules_engine/operations/library_column_order.py +++ b/cdisc_rules_engine/operations/library_column_order.py @@ -22,7 +22,10 @@ def _execute_operation(self): variables_metadata: List[dict] = self._get_variables_metadata_from_standard() # create a list of variable names in accordance to the "ordinal" key - variable_names_list = [ - var["name"].replace("--", self.params.domain) for var in variables_metadata - ] + variable_names_list = BaseOperation._replace_variable_wildcards( + variables_metadata, + self.data_service.get_raw_dataset_metadata( + dataset_name=self.params.dataset_path, datasets=self.params.datasets + ).wildcard_replacement, + ) return list(OrderedDict.fromkeys(variable_names_list)) diff --git a/cdisc_rules_engine/operations/permissible_variables.py b/cdisc_rules_engine/operations/permissible_variables.py index 0dcbdf89b..96cffd36c 100644 --- a/cdisc_rules_engine/operations/permissible_variables.py +++ b/cdisc_rules_engine/operations/permissible_variables.py @@ -23,7 +23,9 @@ def _execute_operation(self): return list( { - var["name"].replace("--", self.params.domain): None + BaseOperation._replace_variable_wildcard( + var["name"], self.params.domain + ): None for var in variables_metadata if self.get_allowed_variable_permissibility(var) == PERMISSIBLE }.keys() diff --git a/cdisc_rules_engine/operations/record_count.py b/cdisc_rules_engine/operations/record_count.py index 9cbedc3b6..017b260f1 100644 --- a/cdisc_rules_engine/operations/record_count.py +++ b/cdisc_rules_engine/operations/record_count.py @@ -79,7 +79,7 @@ def _get_grouping_for_operations(self) -> list: ) effective_grouping = [] for col in grouping_cols: - col = self._resolve_variable_name(col, self.params.domain) + col = self._replace_variable_wildcard(col, self.params.domain) if col in self.evaluation_dataset.data.columns: sample_val = self.evaluation_dataset[col].iloc[0] if isinstance(sample_val, (list, tuple)): @@ -150,7 +150,7 @@ def _build_effective_grouping(self) -> tuple[list, dict]: ) effective_grouping = [] for col in grouping_cols: - col = self._resolve_variable_name(col, self.params.domain) + col = self._replace_variable_wildcard(col, self.params.domain) if col in self.evaluation_dataset.data.columns: if len(self.evaluation_dataset) == 0: effective_grouping.append(col) diff --git a/cdisc_rules_engine/operations/related_domain_is_custom.py b/cdisc_rules_engine/operations/related_domain_is_custom.py index 215a8c8ca..5b681d6f4 100644 --- a/cdisc_rules_engine/operations/related_domain_is_custom.py +++ b/cdisc_rules_engine/operations/related_domain_is_custom.py @@ -1,4 +1,5 @@ from cdisc_rules_engine.operations.base_operation import BaseOperation +from cdisc_rules_engine.utilities.sdtm_utilities import is_custom_domain class RelatedDomainIsCustom(BaseOperation): @@ -8,9 +9,8 @@ def _execute_operation(self): given domain is in standard domains. If no -> the domain is custom. """ - standard_data: dict = self.library_metadata.standard_metadata for ds in self.params.datasets: if ds.is_supp and self.params.domain.endswith(ds.rdomain): - return ds.rdomain not in standard_data.get("domains", {}) + return is_custom_domain(self.library_metadata, ds.rdomain) return False diff --git a/cdisc_rules_engine/operations/required_variables.py b/cdisc_rules_engine/operations/required_variables.py index 167d748df..8cc41d3be 100644 --- a/cdisc_rules_engine/operations/required_variables.py +++ b/cdisc_rules_engine/operations/required_variables.py @@ -22,7 +22,9 @@ def _execute_operation(self): variables_metadata: List[dict] = self._get_variables_metadata_from_standard() return list( { - var["name"].replace("--", self.params.domain): None + BaseOperation._replace_variable_wildcard( + var["name"], self.params.domain + ): None for var in variables_metadata if self.get_allowed_variable_permissibility(var) == REQUIRED }.keys() diff --git a/cdisc_rules_engine/operations/standard_domains.py b/cdisc_rules_engine/operations/standard_domains.py index 43b6d453b..8326777d1 100644 --- a/cdisc_rules_engine/operations/standard_domains.py +++ b/cdisc_rules_engine/operations/standard_domains.py @@ -3,13 +3,7 @@ class StandardDomains(BaseOperation): def _execute_operation(self): - standard_data: dict = self.library_metadata.standard_metadata - domains = standard_data.get("domains", set()) - if isinstance(domains, (set, list, tuple)): - return sorted(list(domains)) - elif domains is None: - return [] - raise TypeError( - f"Invalid type for 'domains' in standard_metadata: " - f"expected set, list, or tuple, got {type(domains).__name__}" - ) + dataset_names = ( + self.library_metadata.standard_metadata.get("dataset_names") or set() + ) | (self.library_metadata.model_metadata.get("dataset_names") or set()) + return sorted(list(dataset_names)) diff --git a/cdisc_rules_engine/operations/variable_count.py b/cdisc_rules_engine/operations/variable_count.py index 2512d5d4e..c9962e7bb 100644 --- a/cdisc_rules_engine/operations/variable_count.py +++ b/cdisc_rules_engine/operations/variable_count.py @@ -38,9 +38,7 @@ async def _get_dataset_variable_count( data: pd.DataFrame = self.data_service.get_dataset( dataset_name=dataset.full_path ) - target_variable = ( - self.params.original_target.replace("--", dataset.domain, 1) - if dataset.domain - else self.params.original_target + target_variable = BaseOperation._replace_variable_wildcard( + self.params.original_target, dataset.wildcard_replacement ) return 1 if target_variable in data else 0 diff --git a/cdisc_rules_engine/operations/variable_value_count.py b/cdisc_rules_engine/operations/variable_value_count.py index 6901ed995..f283a0e60 100644 --- a/cdisc_rules_engine/operations/variable_value_count.py +++ b/cdisc_rules_engine/operations/variable_value_count.py @@ -5,8 +5,8 @@ import os from collections import Counter from typing import List -from cdisc_rules_engine.utilities.utils import ( - get_corresponding_datasets, +from cdisc_rules_engine.utilities.sdtm_utilities import get_corresponding_datasets +from cdisc_rules_engine.utilities.sdtm_utilities import ( tag_source, ) @@ -49,8 +49,8 @@ async def _get_dataset_variable_value_count( ) ) data = tag_source(data, dataset_metadata) - target_variable = self.params.original_target.replace( - "--", dataset_metadata.domain, 1 + target_variable = BaseOperation._replace_variable_wildcard( + self.params.original_target, dataset_metadata.wildcard_replacement ) if target_variable in data: return Counter(data[target_variable].unique()) diff --git a/cdisc_rules_engine/rules_engine.py b/cdisc_rules_engine/rules_engine.py index 6404ee841..9a5b81047 100644 --- a/cdisc_rules_engine/rules_engine.py +++ b/cdisc_rules_engine/rules_engine.py @@ -391,18 +391,38 @@ def execute_rule( dataset = self.rule_processor.perform_rule_operations( rule_copy, dataset, - dataset_metadata.unsplit_name, + dataset_metadata, datasets, - dataset_metadata.full_path, standard=self.standard, standard_version=self.standard_version, standard_substandard=self.standard_substandard, external_dictionaries=self.external_dictionaries, ct_packages=ct_packages, ) + if dataset.empty: + rule_id = rule.get("core_id", "unknown") + reason = ( + f"Dataset skipped - Dataset is empty after preprocessing and operations. " + f"rule id={rule_id}, dataset={dataset_metadata.name}" + ) + logger.info(f"Skipped dataset {dataset_metadata.name}. Reason: {reason}") + error_obj = FailedValidationEntity( + dataset=dataset_metadata.filename, + error=SkippedReason.EMPTY_DATASET.value, + message=reason, + ) + return [ + ValidationErrorContainer( + status=ExecutionStatus.SKIPPED.value, + message=reason, + dataset=dataset_metadata.filename, + domain=dataset_metadata.domain or dataset_metadata.rdomain or "", + errors=[error_obj], + ).to_representation() + ] dataset_variable = DatasetVariable( dataset, - column_prefix_map={"--": dataset_metadata.domain_cleaned}, + column_prefix_map={"--": dataset_metadata.wildcard_replacement}, value_level_metadata=value_level_metadata, column_codelist_map=variable_codelist_map, codelist_term_maps=codelist_term_maps, diff --git a/cdisc_rules_engine/services/cdisc_library_service.py b/cdisc_rules_engine/services/cdisc_library_service.py index 4b1637b92..4c8b3c800 100644 --- a/cdisc_rules_engine/services/cdisc_library_service.py +++ b/cdisc_rules_engine/services/cdisc_library_service.py @@ -310,7 +310,7 @@ def get_standard_details( {...}, ... ], - "domains": { + "dataset_names": { "CO", "DM", "SE", @@ -319,11 +319,11 @@ def get_standard_details( } """ standard_data: dict = self._get_standard(standard_type, version, substandard) - domains: Set[str] = self._extract_domain_names_from_tabulation_standard( + dataset_names: Set[str] = self._extract_dataset_names_from_tabulation_standard( standard_data ) - if domains: - standard_data["domains"] = domains + if dataset_names: + standard_data["dataset_names"] = dataset_names return standard_data def get_model_details(self, standard_details: dict) -> Optional[dict]: @@ -352,6 +352,11 @@ def get_model_details(self, standard_details: dict) -> Optional[dict]: model_version: str = standard_href[-1] model_data: dict = self._get_model(standard_type, model_version) model_data["standard_type"] = standard_type + dataset_names: Set[str] = self._extract_dataset_names_from_tabulation_standard( + model_data + ) + if dataset_names: + model_data["dataset_names"] = dataset_names return model_data def _get_standard( @@ -405,7 +410,7 @@ def _extract_variables_details_from_standard( }, "cdashig": { "classes_key": "classes", - "datasets_key": "domains", + "datasets_key": "dataset_names", "variables_key": "fields", }, "adam": { @@ -425,7 +430,7 @@ def _extract_variables_details_from_standard( }, "tig/cdash": { "classes_key": "classes", - "datasets_key": "domains", + "datasets_key": "dataset_names", "variables_key": "fields", }, "tig/adam": { @@ -668,11 +673,11 @@ def _merge_codelist_maps(self, initial: dict, new_map: dict) -> dict: result[key] = new_map[key] return result - def _extract_domain_names_from_tabulation_standard( + def _extract_dataset_names_from_tabulation_standard( self, standard_data: dict ) -> Set[str]: """ - Accepts tabulation standard data and extracts domain names. + Accepts tabulation standard data and extracts dataset names. Input example: { "registrationStatus": "Final", @@ -708,8 +713,8 @@ def _extract_domain_names_from_tabulation_standard( Output example: {"CO", "DM", ...} """ - domain_names: Set[str] = set() + dataset_names: Set[str] = set() for cls in standard_data.get("classes", []): for dataset in cls.get("datasets", []): - domain_names.add(dataset.get("name")) - return domain_names + dataset_names.add(dataset.get("name")) + return dataset_names diff --git a/cdisc_rules_engine/services/data_services/base_data_service.py b/cdisc_rules_engine/services/data_services/base_data_service.py index b8a0a824c..8ac16f92e 100644 --- a/cdisc_rules_engine/services/data_services/base_data_service.py +++ b/cdisc_rules_engine/services/data_services/base_data_service.py @@ -33,14 +33,16 @@ from cdisc_rules_engine.services.cdisc_library_service import CDISCLibraryService from cdisc_rules_engine.services.data_readers import DataReaderFactory from cdisc_rules_engine.utilities.utils import ( - convert_library_class_name_to_ct_class, get_dataset_cache_key_from_path, get_directory_path, search_in_list_of_dicts, - tag_source, replace_nan_values_in_df, ) -from cdisc_rules_engine.utilities.sdtm_utilities import get_class_and_domain_metadata +from cdisc_rules_engine.utilities.sdtm_utilities import ( + convert_library_class_name_to_ct_class, + get_class_and_dataset_metadata, + tag_source, +) from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from cdisc_rules_engine.models.dataset import PandasDataset from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata @@ -176,8 +178,8 @@ def get_dataset_class( dataset_metadata: SDTMDatasetMetadata, ) -> Optional[str]: if self.library_metadata.standard_metadata: - class_data, _ = get_class_and_domain_metadata( - self.library_metadata.standard_metadata, + class_data, _ = get_class_and_dataset_metadata( + self.library_metadata, dataset_metadata.unsplit_name, ) name = class_data.get("name") diff --git a/cdisc_rules_engine/services/data_services/local_data_service.py b/cdisc_rules_engine/services/data_services/local_data_service.py index 6f2408a51..201fbe1f5 100644 --- a/cdisc_rules_engine/services/data_services/local_data_service.py +++ b/cdisc_rules_engine/services/data_services/local_data_service.py @@ -1,4 +1,5 @@ import os +from os.path import basename from io import IOBase from typing import Iterable, List, Optional, Tuple @@ -22,7 +23,6 @@ ) from cdisc_rules_engine.utilities.utils import ( convert_file_size, - extract_file_name_from_path_string, ) from cdisc_rules_engine.exceptions.custom_exceptions import InvalidDatasetFormat from .base_data_service import BaseDataService, cached_dataset @@ -99,7 +99,7 @@ def get_file_matching_pattern(self, prefix: str, pattern: str) -> str: @cached_dataset(DatasetTypes.CONTENTS.value) def get_dataset(self, dataset_name: str, **params) -> DatasetInterface: reader = self._reader_factory.get_service( - extract_file_name_from_path_string(dataset_name).split(".")[1].upper() + basename(dataset_name).split(".")[1].upper() ) df = reader.from_file(dataset_name) return df @@ -169,7 +169,7 @@ def read_metadata( self, file_path: str, datasets: Optional[Iterable[SDTMDatasetMetadata]] = None ) -> dict: file_size = os.path.getsize(file_path) - file_name = extract_file_name_from_path_string(file_path) + file_name = basename(file_path) file_metadata = { "path": file_path, "name": file_name, @@ -180,7 +180,7 @@ def read_metadata( if obj.full_path == file_path: file_metadata = { "path": obj.original_path, - "name": extract_file_name_from_path_string(obj.original_path), + "name": basename(obj.original_path), "file_size": os.path.getsize(obj.original_path), } file_name = obj.filename @@ -232,7 +232,7 @@ def __get_dataset_metadata(self, dataset_name: str, **kwargs) -> Tuple[dict, dic def to_parquet(self, file_path: str) -> str: reader = self._reader_factory.get_service( - extract_file_name_from_path_string(file_path).split(".")[1].upper() + basename(file_path).split(".")[1].upper() ) return reader.to_parquet(file_path) diff --git a/cdisc_rules_engine/services/data_services/usdm_data_service.py b/cdisc_rules_engine/services/data_services/usdm_data_service.py index 275e1c676..30928a102 100644 --- a/cdisc_rules_engine/services/data_services/usdm_data_service.py +++ b/cdisc_rules_engine/services/data_services/usdm_data_service.py @@ -1,4 +1,5 @@ import os +from os.path import basename from io import IOBase from typing import List, Sequence, Any from dataclasses import dataclass @@ -22,9 +23,6 @@ DataReaderFactory, ) from cdisc_rules_engine.services.data_readers.json_reader import JSONReader -from cdisc_rules_engine.utilities.utils import ( - extract_file_name_from_path_string, -) from .base_data_service import BaseDataService, cached_dataset @@ -150,7 +148,7 @@ def get_raw_dataset_metadata( modification_date=datetime.fromtimestamp( os.path.getmtime(self.dataset_path) ).isoformat(), - filename=extract_file_name_from_path_string(dataset_name), + filename=basename(dataset_name), full_path=dataset_name, file_size=0, record_count=len(dataset), @@ -182,7 +180,7 @@ def get_define_xml_contents(self, dataset_name: str) -> bytes: def read_metadata(self, dataset_name: str) -> dict: np_json_type_map: dict = {"O": "string", "float64": "float"} file_size = os.path.getsize(self.dataset_path) - file_name = extract_file_name_from_path_string(self.dataset_path) + file_name = basename(self.dataset_path) file_metadata = { "path": self.dataset_path, "name": file_name, @@ -476,7 +474,7 @@ def __get_dataset_name_from_domain(self, domain_name: str) -> str: return os.path.join(self.dataset_path, "{}.json".format(domain_name)) def __get_domain_from_dataset_name(self, dataset_name: str) -> str: - return extract_file_name_from_path_string(dataset_name).split(".")[0] + return basename(dataset_name).split(".")[0] @staticmethod def is_valid_data(dataset_paths: Sequence[str], encoding: str = None): diff --git a/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py b/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py index 5d08f235c..6a74974b1 100644 --- a/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py +++ b/cdisc_rules_engine/services/define_xml/base_define_xml_reader.py @@ -19,7 +19,7 @@ from cdisc_rules_engine.models.define import ValueLevelMetadata from cdisc_rules_engine.services import logger from cdisc_rules_engine.utilities.decorators import cached -from cdisc_rules_engine.utilities.utils import is_supp_domain +from cdisc_rules_engine.utilities.sdtm_utilities import is_supp_domain @dataclass diff --git a/cdisc_rules_engine/utilities/data_processor.py b/cdisc_rules_engine/utilities/data_processor.py index 04d6dbe46..7ea587499 100644 --- a/cdisc_rules_engine/utilities/data_processor.py +++ b/cdisc_rules_engine/utilities/data_processor.py @@ -142,18 +142,24 @@ def merge_on_relrec_record( variables_with_wildcards["USUBJID"], ] else: - left_on = ["STUDYID", "USUBJID", relrec_row["IDVAR_LEFT"]] + left_on = ["STUDYID", "USUBJID", "RELREC.IDVAR"] right_on = [ variables_with_wildcards["STUDYID"], variables_with_wildcards["USUBJID"], - variables_with_wildcards[relrec_row["IDVAR_RIGHT"]], + "RELREC.IDVAR", ] + left_subset["RELREC.IDVAR"] = left_subset[relrec_row["IDVAR_LEFT"]].astype( + str + ) + right_subset["RELREC.IDVAR"] = right_subset[ + relrec_row["IDVAR_RIGHT"] + ].astype(str) right_subset = right_subset.rename(columns=variables_with_wildcards) result = left_subset.merge( other=right_subset.data, left_on=left_on, right_on=right_on, - ) + ).drop(["RELREC.IDVAR"], axis=1, errors="ignore") return result @staticmethod @@ -181,15 +187,17 @@ def merge_relrec_datasets( relrec_for_domain = DataProcessor.filter_relrec_for_domain( left_dataset_domain_name, relrec_dataset ) - - # TODO: FIX objs = [ DataProcessor.merge_on_relrec_record( relrec_row, left_dataset, datasets, dataset_preprocessor, wildcard ) for _, relrec_row in relrec_for_domain.iterrows() ] - result = objs[0].concat(objs[1:], ignore_index=True) + result = ( + objs[0].concat(objs[1:], ignore_index=True) + if objs + else left_dataset.__class__() + ) return result @staticmethod diff --git a/cdisc_rules_engine/utilities/dataset_preprocessor.py b/cdisc_rules_engine/utilities/dataset_preprocessor.py index 7c55a191b..0b34890a1 100644 --- a/cdisc_rules_engine/utilities/dataset_preprocessor.py +++ b/cdisc_rules_engine/utilities/dataset_preprocessor.py @@ -14,7 +14,6 @@ from cdisc_rules_engine.utilities.utils import ( replace_pattern_in_list_of_strings, get_sided_match_keys, - get_dataset_name_from_details, ) from cdisc_rules_engine.exceptions.custom_exceptions import PreprocessingError import os @@ -126,14 +125,14 @@ def preprocess( # noqa if file_info.domain in merged_domains: continue - filename = get_dataset_name_from_details(file_info) - # Try to download the dataset try: - other_dataset: DatasetInterface = self._download_dataset(filename) + other_dataset: DatasetInterface = self._download_dataset( + file_info.data_service_identifier + ) except Exception as e: raise PreprocessingError( - f"Failed to download dataset '{filename}' for preprocessing: {str(e)}" + f"Failed to download dataset '{file_info.data_service_identifier}' for preprocessing: {str(e)}" ) referenced_targets = set( @@ -552,7 +551,6 @@ def _merge_datasets( # noqa raise PreprocessingError( f"Failed to merge RELREC dataset in preprocessing. " f"Left dataset: {left_dataset_domain_name}, " - f"RELREC dataset: {right_dataset_domain_name}, " f"Wildcard: {right_dataset_domain_details.get('wildcard')}, " f"Match keys: {match_keys}, " f"Error: {str(e)}" diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index 470c5928b..6c686be16 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -10,7 +10,9 @@ from cdisc_rules_engine.models.dataset.dataset_interface import ( DatasetInterface, ) -from cdisc_rules_engine.models.dataset_metadata import DatasetMetadata +from cdisc_rules_engine.models.dataset_metadata import ( + DatasetMetadata, +) from cdisc_rules_engine.models.library_metadata_container import ( LibraryMetadataContainer, ) @@ -27,15 +29,18 @@ from cdisc_rules_engine.interfaces import ConditionInterface from cdisc_rules_engine.models.operation_params import OperationParams from cdisc_rules_engine.models.rule_conditions import AllowedConditionsKeys -from cdisc_rules_engine.exceptions.custom_exceptions import OperationError +from cdisc_rules_engine.exceptions.custom_exceptions import ( + DomainNotFoundError, + OperationError, +) from cdisc_rules_engine.operations import operations_factory +from cdisc_rules_engine.operations.base_operation import BaseOperation from cdisc_rules_engine.services import logger from cdisc_rules_engine.utilities.data_processor import DataProcessor from cdisc_rules_engine.utilities.utils import ( get_directory_path, get_operations_cache_key, search_in_list_of_dicts, - get_dataset_name_from_details, ) from cdisc_rules_engine.models.external_dictionaries_container import ( ExternalDictionariesContainer, @@ -320,9 +325,8 @@ def perform_rule_operations( self, rule: dict, dataset: DatasetInterface, - domain: str, + dataset_metadata: SDTMDatasetMetadata, datasets: Iterable[SDTMDatasetMetadata], - dataset_path: str, standard: str, standard_version: str, standard_substandard: str, @@ -344,11 +348,15 @@ def perform_rule_operations( # change -- pattern to domain name original_target: str = operation.get("name") target: str = original_target - domain: str = operation.get("domain", domain) + domain: str = operation.get("domain", dataset_metadata.unsplit_name) + wildcard_replacement: str = operation.get( + "domain", dataset_metadata.wildcard_replacement + ) if target and target.startswith("--") and domain: # Not a study wide operation - target = target.replace("--", domain) - domain = domain.replace("--", domain) + target = BaseOperation._replace_variable_wildcard( + target, wildcard_replacement + ) # get necessary operation operation_params = OperationParams( @@ -368,11 +376,11 @@ def perform_rule_operations( ], ct_version=operation.get("version"), dataframe=dataset_copy, - dataset_path=dataset_path, + dataset_path=dataset_metadata.full_path, datasets=datasets, delimiter=operation.get("delimiter"), dictionary_term_type=operation.get("dictionary_term_type"), - directory_path=get_directory_path(dataset_path), + directory_path=get_directory_path(dataset_metadata.full_path), domain=domain, domain_class=operation.get("domain_class"), external_dictionaries=external_dictionaries, @@ -409,6 +417,8 @@ def perform_rule_operations( dataset_copy = self._execute_operation( operation_params, dataset_copy, previous_operations ) + except (DomainNotFoundError, KeyError): + raise except Exception as e: error_detail = getattr(e, "message", None) or str(e) raise OperationError( @@ -468,17 +478,16 @@ def _execute_operation( ), ) if domain_details is None: - raise OperationError( + raise DomainNotFoundError( f"Failed to execute rule operation. " f"Domain {operation_params.domain} does not exist. " f"Operation: {operation_params.operation_name}, " f"Target: {operation_params.target}, " f"Core ID: {operation_params.core_id}" ) - filename = get_dataset_name_from_details(domain_details) file_path: str = os.path.join( get_directory_path(operation_params.dataset_path), - filename, + domain_details.data_service_identifier, ) operation_params.dataframe = self.data_service.get_dataset( dataset_name=file_path @@ -502,7 +511,11 @@ def is_current_domain(self, dataset, target_domain): if not target_domain: return True elif not self.is_relationship_dataset(target_domain): - return "DOMAIN" in dataset and dataset["DOMAIN"].iloc[0] == target_domain + return ( + "DOMAIN" in dataset + and not dataset.empty + and dataset["DOMAIN"].iloc[0] == target_domain + ) else: # Always lookup relationship datasets when performing operations on them. return False diff --git a/cdisc_rules_engine/utilities/sdtm_utilities.py b/cdisc_rules_engine/utilities/sdtm_utilities.py index 3d7304d8d..c4e19972e 100644 --- a/cdisc_rules_engine/utilities/sdtm_utilities.py +++ b/cdisc_rules_engine/utilities/sdtm_utilities.py @@ -1,7 +1,20 @@ +import re + +from cdisc_rules_engine.constants.domains import ( + AP_DOMAIN, + APFA_DOMAIN, + APRELSUB_DOMAIN, + SUPPLEMENTARY_DOMAINS, +) +from cdisc_rules_engine.constants.metadata_columns import ( + SOURCE_FILENAME, + SOURCE_ROW_NUMBER, +) from cdisc_rules_engine.interfaces.data_service_interface import DataServiceInterface +from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface +from cdisc_rules_engine.models.dataset_metadata import DatasetMetadata from cdisc_rules_engine.utilities.utils import ( search_in_list_of_dicts, - convert_library_class_name_to_ct_class, ) from cdisc_rules_engine.constants.classes import ( DETECTABLE_CLASSES, @@ -9,6 +22,8 @@ FINDINGS, FINDINGS_ABOUT, FINDINGS_TEST_VARIABLE, + SPECIAL_PURPOSE, + SPECIAL_PURPOSE_MODEL, ) from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.library_metadata_container import ( @@ -16,37 +31,67 @@ ) import copy from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata -from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface from typing import Iterable, Tuple, List, Optional -def get_class_and_domain_metadata( - standard_details: dict, domain: str +def is_custom_domain( + library_metadata: LibraryMetadataContainer, dataset_name: str +) -> bool: + """ + Checks if the given dataset is a custom domain based on the standard metadata. + + Args: + dataset_name: The dataset name to check. + library_metadata: The library metadata container containing standard metadata. + """ + standard_details = library_metadata.standard_metadata + model_details = library_metadata.model_metadata + is_custom = dataset_name not in standard_details.get( + "dataset_names", {} + ) and dataset_name not in model_details.get("dataset_names", {}) + return is_custom + + +def get_class_and_dataset_metadata( + library_metadata: LibraryMetadataContainer, dataset_name: str ) -> Tuple[dict, dict]: """ - Extracts metadata of a certain class and domain - from given standards details. + Extracts metadata of a certain class and dataset + from given standards details. Checks IG first, then model. If not found, returns empty dicts. Args: - standard_details: Library implementation guide metadata. - domain: Name of the target domain + library_metadata: Library metadata container containing standard metadata. + dataset_name: Name of the target dataset Returns: - The class metadata and domain metadata from the standard. + The class metadata and dataset metadata from the standard. Ex: - {class_details}, {domain_details} + {class_details}, {dataset_details} """ - # Get domain and class details for domain. - for c in standard_details.get("classes"): - domain_details = search_in_list_of_dicts( - c.get("datasets", []), lambda item: item["name"] == domain + for c in library_metadata.standard_metadata.get("classes", []): + dataset_details = search_in_list_of_dicts( + c.get("datasets", []), lambda item: item["name"] == dataset_name ) - if domain_details: - return c, domain_details + if dataset_details: + return c, dataset_details + for c in library_metadata.model_metadata.get("classes", []): + dataset_details = search_in_list_of_dicts( + c.get("datasets", []), lambda item: item["name"] == dataset_name + ) + if dataset_details: + return c, dataset_details return {}, {} +def convert_library_class_name_to_ct_class(class_name: str): + conversions = { + "special-purpose": SPECIAL_PURPOSE, + "special-purpose datasets": SPECIAL_PURPOSE_MODEL, + } + return conversions.get(class_name.lower(), class_name.upper()) + + def get_tabulation_model_type_and_version(model_link: dict) -> Tuple: link = model_link.get("href") model_type = "sdtm" @@ -55,15 +100,14 @@ def get_tabulation_model_type_and_version(model_link: dict) -> Tuple: def get_variables_metadata_from_standard( # noqa - domain, library_metadata, data_service, - dataset: DatasetInterface, dataset_metadata: SDTMDatasetMetadata, dataset_path: str, datasets: Iterable[SDTMDatasetMetadata], ): add_AP = False + domain = dataset_metadata.unsplit_name original_domain = domain if ( domain @@ -79,13 +123,12 @@ def get_variables_metadata_from_standard( # noqa domain = domain[2:] original_domain = domain add_AP = True - standard_details = library_metadata.standard_metadata model_details = library_metadata.model_metadata - is_custom = domain not in standard_details.get("domains", {}) + is_custom = is_custom_domain(library_metadata, domain) variables_metadata = [] if not is_custom: - IG_class_details, IG_domain_details = get_class_and_domain_metadata( - standard_details, domain + IG_class_details, IG_domain_details = get_class_and_dataset_metadata( + library_metadata, domain ) class_name = convert_library_class_name_to_ct_class( IG_class_details.get("name") @@ -294,7 +337,6 @@ def group_class_variables_by_role( def get_variables_metadata_from_standard_model( # noqa - domain: str, dataframe, datasets: Iterable[SDTMDatasetMetadata], dataset_path: str, @@ -309,6 +351,7 @@ def get_variables_metadata_from_standard_model( # noqa if custom, IDs class and uses class variables. """ add_AP = False + domain = dataset_metadata.unsplit_name original_domain = domain if ( domain @@ -324,12 +367,11 @@ def get_variables_metadata_from_standard_model( # noqa domain = domain[2:] original_domain = domain add_AP = True - standard_details = library_metadata.standard_metadata model_details = library_metadata.model_metadata - is_custom = domain not in standard_details.get("domains", {}) + is_custom = is_custom_domain(library_metadata, domain) if not is_custom: - IG_class_details, IG_domain_details = get_class_and_domain_metadata( - standard_details, domain + IG_class_details, IG_domain_details = get_class_and_dataset_metadata( + library_metadata, domain ) class_name = convert_library_class_name_to_ct_class( IG_class_details.get("name") @@ -467,3 +509,46 @@ def add_variable_wildcards( ) for variable in variables } + + +def is_supp_domain(dataset_domain: str) -> bool: + """ + Returns true if domain name starts with SUPP or SQ + """ + return dataset_domain.startswith(SUPPLEMENTARY_DOMAINS) + + +def is_ap_domain(dataset_domain: str) -> bool: + """ + Returns true if domain name is like AP-- / APFA APRELSUB. + """ + if dataset_domain == APRELSUB_DOMAIN: + return True + if len(dataset_domain) == 6: + domain_to_check: str = APFA_DOMAIN + else: + domain_to_check: str = AP_DOMAIN + regex = r"^" + re.escape(domain_to_check) + "[a-zA-Z]{2,4}$" + return bool(re.match(regex, dataset_domain)) + + +def get_corresponding_datasets( + datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata +) -> List[SDTMDatasetMetadata]: + return [ + other + for other in datasets + if dataset_metadata.unsplit_name == other.unsplit_name + ] + + +def tag_source( + dataset: DatasetInterface, dataset_metadata: DatasetMetadata +) -> DatasetInterface: + """ + For sdtm split datasets, + Adds source filename and row number to dataset + """ + dataset[SOURCE_FILENAME] = dataset_metadata.filename + dataset[SOURCE_ROW_NUMBER] = list(range(1, dataset.len() + 1)) + return dataset diff --git a/cdisc_rules_engine/utilities/utils.py b/cdisc_rules_engine/utilities/utils.py index 41d79a317..5e41736a2 100644 --- a/cdisc_rules_engine/utilities/utils.py +++ b/cdisc_rules_engine/utilities/utils.py @@ -11,27 +11,13 @@ import ast import pandas as pd from datetime import datetime -from typing import Callable, Iterable, List, Optional, Union +from typing import Callable, List, Optional, Union from uuid import UUID -from cdisc_rules_engine.constants.metadata_columns import ( - SOURCE_FILENAME, - SOURCE_ROW_NUMBER, -) -from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface -from cdisc_rules_engine.models.dataset_metadata import DatasetMetadata - -from cdisc_rules_engine.constants.domains import ( - AP_DOMAIN, - APFA_DOMAIN, - APRELSUB_DOMAIN, - SUPPLEMENTARY_DOMAINS, -) -from cdisc_rules_engine.constants.classes import SPECIAL_PURPOSE, SPECIAL_PURPOSE_MODEL + from cdisc_rules_engine.enums.execution_status import ExecutionStatus from cdisc_rules_engine.interfaces import ConditionInterface from cdisc_rules_engine.models.base_validation_entity import BaseValidationEntity from cdisc_rules_engine.check_operators.helpers import is_valid_date -from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.constants.adam_products import ADAM_PRODUCTS @@ -173,27 +159,6 @@ def get_dataset_cache_key_from_path(dataset_path: str, dataset_type: str) -> str ) -def is_supp_domain(dataset_domain: str) -> bool: - """ - Returns true if domain name starts with SUPP or SQ - """ - return dataset_domain.startswith(SUPPLEMENTARY_DOMAINS) - - -def is_ap_domain(dataset_domain: str) -> bool: - """ - Returns true if domain name is like AP-- / APFA APRELSUB. - """ - if dataset_domain == APRELSUB_DOMAIN: - return True - if len(dataset_domain) == 6: - domain_to_check: str = APFA_DOMAIN - else: - domain_to_check: str = AP_DOMAIN - regex = r"^" + re.escape(domain_to_check) + "[a-zA-Z]{2,4}$" - return bool(re.match(regex, dataset_domain)) - - def get_library_variables_metadata_cache_key( standard_type: str, standard_version: str, standard_substandard: str ) -> str: @@ -287,36 +252,6 @@ def get_directory_path(dataset_path): return os.path.dirname(dataset_path) -def tag_source( - dataset: DatasetInterface, dataset_metadata: DatasetMetadata -) -> DatasetInterface: - """ - For sdtm split datasets, - Adds source filename and row number to dataset - """ - dataset[SOURCE_FILENAME] = dataset_metadata.filename - dataset[SOURCE_ROW_NUMBER] = list(range(1, dataset.len() + 1)) - return dataset - - -def get_corresponding_datasets( - datasets: Iterable[SDTMDatasetMetadata], dataset_metadata: SDTMDatasetMetadata -) -> List[SDTMDatasetMetadata]: - return [ - other - for other in datasets - if dataset_metadata.unsplit_name == other.unsplit_name - ] - - -def get_dataset_name_from_details(dataset_metadata: SDTMDatasetMetadata) -> str: - return ( - os.path.split(dataset_metadata.full_path)[-1] - if dataset_metadata.full_path - else dataset_metadata.filename - ) - - def serialize_rule(rule: dict) -> dict: """ Converts rule "conditions" to dict. @@ -351,16 +286,6 @@ def list_contains_duplicates(list_to_check: list) -> bool: return bool(len(list_to_check) > len(set(list_to_check))) -def extract_file_name_from_path_string(path: str) -> str: - """ - Extracts file name from given path string. - Example: - input: "CDISC01/test/ae.xpt" - output: ae.xpt - """ - return os.path.split(path)[-1] - - def generate_report_filename(generation_time: str) -> str: timestamp = ( datetime.fromisoformat(generation_time) @@ -436,14 +361,6 @@ def get_dictionary_path(directory_path: str, file_name: str) -> str: return os.path.join(directory_path, file_name) -def convert_library_class_name_to_ct_class(class_name: str): - conversions = { - "special-purpose": SPECIAL_PURPOSE, - "special-purpose datasets": SPECIAL_PURPOSE_MODEL, - } - return conversions.get(class_name.lower(), class_name.upper()) - - def decode_line(line: bytes) -> str: return line.decode("utf-8").replace("\n", "").replace("\r", "") diff --git a/core.py b/core.py index 2ef8d21ef..ffd451e7e 100644 --- a/core.py +++ b/core.py @@ -81,7 +81,7 @@ def valid_data_file(data_path: list) -> tuple[list, set]: if ignored_files: logger = logging.getLogger("validator") - logger.warning( + logger.info( f"Ignoring {len(ignored_files)} file(s) with unsupported formats: {', '.join(ignored_files[:5])}" + ("..." if len(ignored_files) > 5 else "") ) diff --git a/resources/cache/standards_details.pkl b/resources/cache/standards_details.pkl index e6e409cf5..b20874bc8 100644 Binary files a/resources/cache/standards_details.pkl and b/resources/cache/standards_details.pkl differ diff --git a/resources/cache/standards_models.pkl b/resources/cache/standards_models.pkl index d8e562c0e..586420e7c 100644 Binary files a/resources/cache/standards_models.pkl and b/resources/cache/standards_models.pkl differ diff --git a/tests/unit/test_actions.py b/tests/unit/test_actions.py index 0dbda6d6b..81c81ce28 100644 --- a/tests/unit/test_actions.py +++ b/tests/unit/test_actions.py @@ -9,7 +9,7 @@ import json import pytest -from cdisc_rules_engine.utilities.utils import tag_source +from cdisc_rules_engine.utilities.sdtm_utilities import tag_source from cdisc_rules_engine.constants.metadata_columns import ( SOURCE_FILENAME, SOURCE_ROW_NUMBER, diff --git a/tests/unit/test_cdisc_library_service.py b/tests/unit/test_cdisc_library_service.py index 4728c119a..f7868e10b 100644 --- a/tests/unit/test_cdisc_library_service.py +++ b/tests/unit/test_cdisc_library_service.py @@ -26,9 +26,9 @@ def test_get_standard_details(mock_get_sdtmig: MagicMock): library_service = CDISCLibraryService(config, MagicMock()) standard_details: dict = library_service.get_standard_details("sdtmig", "3-1-2") - # expected is that mocked sdtmig details is extended with "domains" key + # expected is that mocked sdtmig details is extended with "dataset_names" key assert standard_details == { - "domains": { + "dataset_names": { "CO", "DM", "SE", diff --git a/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py b/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py index 3c36c1478..04a1a047f 100644 --- a/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py +++ b/tests/unit/test_dataset_builders/test_define_variables_with_library_metadata.py @@ -66,7 +66,7 @@ def test_define_variables_metadata_with_library_metadata_dataset_builder( standard_substandard = None standard_data = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py index eb889e460..ee6c8d947 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py @@ -76,7 +76,7 @@ def test_variable_metadata_with_library_metadata_dataset_builder( standard_substandard = None standard_data = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": ["AE", "DM", "VS"], + "dataset_names": ["AE", "DM", "VS"], "classes": [ { "name": "Events", @@ -239,7 +239,7 @@ def test_variable_metadata_with_library_metadata_dataset_builder_variable_only_i standard_substandard = None standard_data = { "_links": {"model": {"href": "/mdr/sdtm/2-0"}}, - "domains": ["AE", "DM", "VS"], + "dataset_names": ["AE", "DM", "VS"], "classes": [ { "name": "Events", diff --git a/tests/unit/test_operations/test_distinct.py b/tests/unit/test_operations/test_distinct.py index 6f92a760c..0e0f68b77 100644 --- a/tests/unit/test_operations/test_distinct.py +++ b/tests/unit/test_operations/test_distinct.py @@ -227,7 +227,7 @@ def __init__(self, name, filename): self.name = name self.filename = filename - data_service.data = [MockDataset("LB", "lb.xpt")] + data_service.get_datasets = lambda: [MockDataset("LB", "lb.xpt")] def mock_get_dataset(dataset_name, **kwargs): return referenced_data @@ -298,7 +298,7 @@ def __init__(self, name, filename): self.name = name self.filename = filename - data_service.data = [MockDataset("LB", "lb.xpt")] + data_service.get_datasets = lambda: [MockDataset("LB", "lb.xpt")] def mock_get_dataset(dataset_name, **kwargs): return referenced_data diff --git a/tests/unit/test_operations/test_domain_is_custom.py b/tests/unit/test_operations/test_domain_is_custom.py index fb4f06d3d..d1caf924c 100644 --- a/tests/unit/test_operations/test_domain_is_custom.py +++ b/tests/unit/test_operations/test_domain_is_custom.py @@ -86,7 +86,7 @@ def test_domain_is_custom( expected: bool, ): standard_metadata = { - "domains": {"AE"}, + "dataset_names": {"AE"}, } operation_params.dataframe = dataframe operation_params.domain = domain diff --git a/tests/unit/test_operations/test_expected_variables.py b/tests/unit/test_operations/test_expected_variables.py index 5205b3b9a..7d547a980 100644 --- a/tests/unit/test_operations/test_expected_variables.py +++ b/tests/unit/test_operations/test_expected_variables.py @@ -11,6 +11,7 @@ from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.expected_variables import ExpectedVariables from cdisc_rules_engine.services.cache import InMemoryCacheService from cdisc_rules_engine.services.data_services import LocalDataService @@ -68,7 +69,7 @@ standard_metadata = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -189,7 +190,7 @@ def test_get_expected_variables(operation_params: OperationParams, dataset_type) data_service.get_dataset_class = Mock(return_value=mock_dataset_class) def mock_cached_method(*args, **kwargs): - return operation_params.dataframe + return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", diff --git a/tests/unit/test_operations/test_get_dataset_filtered_variables.py b/tests/unit/test_operations/test_get_dataset_filtered_variables.py index 30c080619..dff836b6c 100644 --- a/tests/unit/test_operations/test_get_dataset_filtered_variables.py +++ b/tests/unit/test_operations/test_get_dataset_filtered_variables.py @@ -96,7 +96,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -189,7 +189,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -275,7 +275,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -352,7 +352,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -442,7 +442,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"FA"}, + "dataset_names": {"FA"}, "classes": [ { "name": FINDINGS_ABOUT, @@ -643,7 +643,7 @@ def test_get_dataset_filtered_variables_dask( standard_metadata = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -758,7 +758,7 @@ def test_get_dataset_filtered_variables_empty_dataset( standard_metadata = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -870,7 +870,7 @@ def test_get_dataset_filtered_variables_invalid_key(operation_params: OperationP standard_metadata = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", diff --git a/tests/unit/test_operations/test_get_model_filtered_variables.py b/tests/unit/test_operations/test_get_model_filtered_variables.py index cd0553455..526605a9a 100644 --- a/tests/unit/test_operations/test_get_model_filtered_variables.py +++ b/tests/unit/test_operations/test_get_model_filtered_variables.py @@ -108,7 +108,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -236,7 +236,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", diff --git a/tests/unit/test_operations/test_label_referenced_variable_metadata.py b/tests/unit/test_operations/test_label_referenced_variable_metadata.py index 62ec54006..a70e9d8d5 100644 --- a/tests/unit/test_operations/test_label_referenced_variable_metadata.py +++ b/tests/unit/test_operations/test_label_referenced_variable_metadata.py @@ -6,6 +6,7 @@ from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.label_referenced_variable_metadata import ( LabelReferencedVariableMetadata, ) @@ -68,7 +69,7 @@ def test_get_label_referenced_variable_metadata( } standard_metadata = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -191,7 +192,7 @@ def test_get_label_referenced_variable_metadata( ) def mock_cached_method(*args, **kwargs): - return operation_params.dataframe + return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", diff --git a/tests/unit/test_operations/test_library_column_order.py b/tests/unit/test_operations/test_library_column_order.py index 87ea10bb2..b13f9d8e8 100644 --- a/tests/unit/test_operations/test_library_column_order.py +++ b/tests/unit/test_operations/test_library_column_order.py @@ -12,6 +12,7 @@ from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.library_column_order import LibraryColumnOrder from cdisc_rules_engine.services.cache import InMemoryCacheService from cdisc_rules_engine.services.data_services import LocalDataService @@ -69,7 +70,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -201,7 +202,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -335,7 +336,7 @@ def test_get_column_order_from_library( ) def mock_cached_method(*args, **kwargs): - return operation_params.dataframe + return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", diff --git a/tests/unit/test_operations/test_library_model_column_order.py b/tests/unit/test_operations/test_library_model_column_order.py index 087eac3db..bc69b7e43 100644 --- a/tests/unit/test_operations/test_library_model_column_order.py +++ b/tests/unit/test_operations/test_library_model_column_order.py @@ -76,7 +76,7 @@ standard_metadata = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -231,7 +231,7 @@ def mock_get_raw_metadata(*args, **kwargs): }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": FINDINGS_ABOUT, diff --git a/tests/unit/test_operations/test_name_referenced_variable_metadata.py b/tests/unit/test_operations/test_name_referenced_variable_metadata.py index d9574b6e0..0014bacdf 100644 --- a/tests/unit/test_operations/test_name_referenced_variable_metadata.py +++ b/tests/unit/test_operations/test_name_referenced_variable_metadata.py @@ -6,6 +6,7 @@ from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.name_referenced_variable_metadata import ( NameReferencedVariableMetadata, ) @@ -67,7 +68,7 @@ def test_get_name_referenced_variable_metadata( } standard_metadata = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -190,7 +191,7 @@ def test_get_name_referenced_variable_metadata( ) def mock_cached_method(*args, **kwargs): - return operation_params.dataframe + return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", diff --git a/tests/unit/test_operations/test_parent_library_model_column_order.py b/tests/unit/test_operations/test_parent_library_model_column_order.py index 6cef24791..0f40379ec 100644 --- a/tests/unit/test_operations/test_parent_library_model_column_order.py +++ b/tests/unit/test_operations/test_parent_library_model_column_order.py @@ -89,7 +89,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE"}, + "dataset_names": {"AE"}, "classes": [ { "name": "Events", @@ -140,6 +140,7 @@ def test_get_parent_column_order_from_library( operation_params.standard = "sdtmig" operation_params.standard_version = "3-4" operation_params.datasets = datasets + operation_params.dataset_path = "suppae.xpt" # save model metadata to cache cache = InMemoryCacheService.get_instance() @@ -271,7 +272,7 @@ def mock_get_raw_metadata(dataset_name, **kwargs): }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": {"AE", "EC"}, + "dataset_names": {"AE", "EC"}, "classes": [ { "name": FINDINGS_ABOUT, @@ -350,6 +351,7 @@ def test_get_parent_findings_class_column_order_from_library( operation_params.datasets = [ SDTMDatasetMetadata(**dataset) for dataset in datasets ] + operation_params.dataset_path = "suppae.xpt" # save model metadata to cache cache = InMemoryCacheService.get_instance() diff --git a/tests/unit/test_operations/test_permissible_variables.py b/tests/unit/test_operations/test_permissible_variables.py index 18da9e0b9..be9838326 100644 --- a/tests/unit/test_operations/test_permissible_variables.py +++ b/tests/unit/test_operations/test_permissible_variables.py @@ -11,6 +11,7 @@ from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.permissible_variables import PermissibleVariables from cdisc_rules_engine.services.cache import InMemoryCacheService from cdisc_rules_engine.services.data_services import LocalDataService @@ -70,7 +71,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -204,7 +205,7 @@ }, { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -332,7 +333,7 @@ def test_get_permissible_variables( ) def mock_cached_method(*args, **kwargs): - return operation_params.dataframe + return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", diff --git a/tests/unit/test_operations/test_related_domain_is_custom.py b/tests/unit/test_operations/test_related_domain_is_custom.py index 80b3ef63e..c01ea774e 100644 --- a/tests/unit/test_operations/test_related_domain_is_custom.py +++ b/tests/unit/test_operations/test_related_domain_is_custom.py @@ -71,7 +71,7 @@ def test_related_domain_is_custom( """ library_metadata = LibraryMetadataContainer( - standard_metadata={"domains": standard_domains} + standard_metadata={"dataset_names": standard_domains} ) params = DummyParams(datasets=study_datasets, domain=domain) diff --git a/tests/unit/test_operations/test_required_variables.py b/tests/unit/test_operations/test_required_variables.py index 16c33f741..3a92a945b 100644 --- a/tests/unit/test_operations/test_required_variables.py +++ b/tests/unit/test_operations/test_required_variables.py @@ -13,6 +13,7 @@ from cdisc_rules_engine.constants.classes import GENERAL_OBSERVATIONS_CLASS from cdisc_rules_engine.enums.variable_roles import VariableRoles from cdisc_rules_engine.models.operation_params import OperationParams +from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.operations.required_variables import RequiredVariables from cdisc_rules_engine.services.cache import InMemoryCacheService from cdisc_rules_engine.services.data_services import LocalDataService @@ -71,7 +72,7 @@ } standard_metadata = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -199,7 +200,7 @@ def test_get_required_variables(operation_params: OperationParams, dataset_type) ) def mock_cached_method(*args, **kwargs): - return operation_params.dataframe + return SDTMDatasetMetadata(first_record={"DOMAIN": "AE"}) with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", diff --git a/tests/unit/test_operations/test_standard_domains.py b/tests/unit/test_operations/test_standard_domains.py index 605eba7ca..dac5a2423 100644 --- a/tests/unit/test_operations/test_standard_domains.py +++ b/tests/unit/test_operations/test_standard_domains.py @@ -35,11 +35,9 @@ def _create_operation(operation_params, standard_metadata, dataset_type): "domains_input, expected_domains", [ ({"AE", "FA", "LB", "QS", "CM", "DM"}, ["AE", "CM", "DM", "FA", "LB", "QS"]), - (["AE", "FA", "LB"], ["AE", "FA", "LB"]), - (("AE", "FA", "LB"), ["AE", "FA", "LB"]), - (["QS", "AE", "FA", "LB", "CM"], ["AE", "CM", "FA", "LB", "QS"]), + ({"AE", "FA", "LB"}, ["AE", "FA", "LB"]), + ({"QS", "AE", "FA", "LB", "CM"}, ["AE", "CM", "FA", "LB", "QS"]), (set(), []), - ([], []), ], ) @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) @@ -49,7 +47,7 @@ def test_standard_domains_returns_sorted_list( domains_input, expected_domains, ): - standard_metadata = {"domains": domains_input} + standard_metadata = {"dataset_names": domains_input} operation = _create_operation(operation_params, standard_metadata, dataset_type) result = operation.execute() domain_list = result[operation_params.operation_id].iloc[0] @@ -60,7 +58,7 @@ def test_standard_domains_returns_sorted_list( "standard_metadata, expected_length", [ ({}, 0), - ({"domains": None}, 0), + ({"dataset_names": None}, 0), ], ) @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) @@ -75,22 +73,3 @@ def test_standard_domains_handles_missing_or_none_domains( domain_list = result[operation_params.operation_id].iloc[0] assert isinstance(domain_list, list) assert len(domain_list) == expected_length - - -@pytest.mark.parametrize( - "standard_metadata", - [ - ({"domains": {}}), - ({"domains": 123}), - ({"domains": "invalid"}), - ], -) -@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) -def test_standard_domains_raises_error_for_invalid_type( - operation_params: OperationParams, - dataset_type, - standard_metadata, -): - operation = _create_operation(operation_params, standard_metadata, dataset_type) - with pytest.raises(TypeError): - operation.execute() diff --git a/tests/unit/test_rules_engine.py b/tests/unit/test_rules_engine.py index c2f0450dd..5de91eb06 100644 --- a/tests/unit/test_rules_engine.py +++ b/tests/unit/test_rules_engine.py @@ -2380,7 +2380,7 @@ def test_validate_variables_order_against_library_metadata( } standard_data = { "_links": {"model": {"href": "/mdr/sdtm/1-5"}}, - "domains": { + "dataset_names": { "HO", "CO", "SU", @@ -2477,7 +2477,7 @@ def test_validate_variables_order_against_library_metadata( ) def mock_cached_method(*args, **kwargs): - return mock_get_dataset.return_value + return dataset_metadata with patch( "cdisc_rules_engine.services.data_services.LocalDataService.get_raw_dataset_metadata", diff --git a/tests/unit/test_utilities/test_rule_processor.py b/tests/unit/test_utilities/test_rule_processor.py index 197856c64..a345e5785 100644 --- a/tests/unit/test_utilities/test_rule_processor.py +++ b/tests/unit/test_utilities/test_rule_processor.py @@ -4,9 +4,7 @@ import pandas as pd import pytest from conftest import mock_data_service -from cdisc_rules_engine.exceptions.custom_exceptions import ( - OperationError, -) +from cdisc_rules_engine.exceptions.custom_exceptions import DomainNotFoundError from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata from cdisc_rules_engine.models.rule_conditions import ConditionCompositeFactory from cdisc_rules_engine.models.rule_conditions.condition_composite import ( @@ -489,7 +487,7 @@ def test_perform_rule_operation(mock_data_service, dataset_implementation): df = dataset_implementation.from_dict( {"AESTDY": [11, 12, 40, 59, 59], "DOMAIN": ["AE", "AE", "AE", "AE", "AE"]} ) - datasets = [ + datasets_metadata = [ SDTMDatasetMetadata( filename="ae.xpt", full_path="test/ae.xpt", @@ -502,9 +500,8 @@ def test_perform_rule_operation(mock_data_service, dataset_implementation): result = processor.perform_rule_operations( rule, df, - "AE", - datasets, - "test/", + datasets_metadata[0], + datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -585,7 +582,7 @@ def test_perform_rule_operation_with_grouping( } ) - datasets = [ + datasets_metadata = [ SDTMDatasetMetadata( filename="ae.xpt", full_path="test/ae.xpt", @@ -599,9 +596,8 @@ def test_perform_rule_operation_with_grouping( data = processor.perform_rule_operations( rule, df, - "AE", - datasets, - "test/", + datasets_metadata[0], + datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -703,7 +699,7 @@ def test_perform_rule_operation_with_multi_key_grouping( } ) - datasets = [ + datasets_metadata = [ SDTMDatasetMetadata( filename="ae.xpt", full_path="test/ae.xpt", @@ -717,9 +713,8 @@ def test_perform_rule_operation_with_multi_key_grouping( data = processor.perform_rule_operations( rule, df, - "AE", - datasets, - "test/", + datasets_metadata[0], + datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -763,7 +758,7 @@ def test_perform_rule_operation_with_null_operations( df = dataset_implementation.from_dict( {"AESTDY": [11, 12, 40, 59], "USUBJID": [1, 200, 1, 200]} ) - datasets = [ + datasets_metadata = [ SDTMDatasetMetadata( filename="ae.xpt", full_path="test/ae.xpt", @@ -775,9 +770,8 @@ def test_perform_rule_operation_with_null_operations( new_data = processor.perform_rule_operations( rule, df, - "AE", - datasets, - "test/", + datasets_metadata[0], + datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -901,16 +895,19 @@ def test_perform_extract_metadata_operation( } ) processor = RuleProcessor(mock, InMemoryCacheService()) + datasets_metadata = [ + SDTMDatasetMetadata( + name="SUPPEC", + first_record={"RDOMAIN": "EC"}, + filename="suppec.xpt", + full_path="study/data_bundle/suppec.xpt", + ) + ] dataset_after_operation = processor.perform_rule_operations( rule=rule_equal_to_with_extract_metadata_operation, dataset=dataset, - domain="SUPPEC", - datasets=[ - SDTMDatasetMetadata( - name="SUPPEC", first_record={"RDOMAIN": "EC"}, filename="suppec.xpt" - ) - ], - dataset_path="study/data_bundle/suppec.xpt", + dataset_metadata=datasets_metadata[0], + datasets=datasets_metadata, standard="sdtmig", standard_version="3-1-2", standard_substandard=None, @@ -1165,23 +1162,25 @@ def test_operation_nonexistent_domain_raises_error(mock_data_service): } processor = RuleProcessor(mock_data_service, InMemoryCacheService()) datasets_metadata = [ - SDTMDatasetMetadata(name="LB", filename="lb.xpt", first_record={"DOMAIN": "LB"}) + SDTMDatasetMetadata( + name="LB", + filename="lb.xpt", + first_record={"DOMAIN": "LB"}, + full_path="lb.xpt", + ) ] - with pytest.raises(OperationError) as exc_info: + with pytest.raises(DomainNotFoundError) as exc_info: processor.perform_rule_operations( rule=rule, dataset=df.copy(), - domain="LB", + dataset_metadata=datasets_metadata[0], datasets=datasets_metadata, - dataset_path="lb.xpt", standard="sdtmig", standard_version="3-1-2", standard_substandard=None, ) error_message = str(exc_info.value) assert ( - "Failed to execute rule operation. Operation: distinct, " - "Target: AESEQ, Domain: AE, Error: Failed to execute rule operation. " - "Domain AE does not exist. Operation: distinct, Target: AESEQ, Core ID: None" + "Failed to execute rule operation. Domain AE does not exist. Operation: distinct, Target: AESEQ, Core ID: None" == error_message ) diff --git a/tests/unit/test_utilities/test_sdtm_utils.py b/tests/unit/test_utilities/test_sdtm_utils.py index c937db5b9..73a1da9f7 100644 --- a/tests/unit/test_utilities/test_sdtm_utils.py +++ b/tests/unit/test_utilities/test_sdtm_utils.py @@ -1,10 +1,11 @@ import pytest -from unittest.mock import Mock +from unittest.mock import Mock, patch from scripts.script_utils import ( get_library_metadata_from_cache, ) from cdisc_rules_engine.utilities.sdtm_utilities import ( + get_corresponding_datasets, get_variables_metadata_from_standard, get_variables_metadata_from_standard_model, ) @@ -38,21 +39,11 @@ def mock_datasets(): return [] -@pytest.fixture -def mock_dataset(): - """Mock dataset for tests.""" - return Mock() - - -def test_standard_domain_ae( - library_metadata, mock_data_service, mock_dataset, mock_datasets -): +def test_standard_domain_ae(library_metadata, mock_data_service, mock_datasets): dataset_metadata = SDTMDatasetMetadata(name="AE", first_record={"DOMAIN": "AE"}) variables = get_variables_metadata_from_standard( - "AE", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/ae.xpt", mock_datasets, @@ -62,15 +53,11 @@ def test_standard_domain_ae( assert any(var["name"] == "AESTDTC" for var in variables) -def test_standard_domain_dm( - library_metadata, mock_data_service, mock_dataset, mock_datasets -): +def test_standard_domain_dm(library_metadata, mock_data_service, mock_datasets): dataset_metadata = SDTMDatasetMetadata(name="DM", first_record={"DOMAIN": "DM"}) variables = get_variables_metadata_from_standard( - "DM", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/dm.xpt", mock_datasets, @@ -80,15 +67,11 @@ def test_standard_domain_dm( assert any(var["name"] == "SEX" for var in variables) -def test_findings_domain_lb( - library_metadata, mock_data_service, mock_dataset, mock_datasets -): +def test_findings_domain_lb(library_metadata, mock_data_service, mock_datasets): dataset_metadata = SDTMDatasetMetadata(name="LB", first_record={"DOMAIN": "LB"}) variables = get_variables_metadata_from_standard( - "LB", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/lb.xpt", mock_datasets, @@ -99,13 +82,11 @@ def test_findings_domain_lb( assert any(var["name"] == "LBORRES" for var in variables) -def test_supp_domain(library_metadata, mock_data_service, mock_dataset, mock_datasets): +def test_supp_domain(library_metadata, mock_data_service, mock_datasets): dataset_metadata = SDTMDatasetMetadata(name="SUPPAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard( - "SUPPAE", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/suppae.xpt", mock_datasets, @@ -115,13 +96,11 @@ def test_supp_domain(library_metadata, mock_data_service, mock_dataset, mock_dat assert any(var["name"] == "QLABEL" for var in variables) -def test_sq_domain(library_metadata, mock_data_service, mock_dataset, mock_datasets): +def test_sq_domain(library_metadata, mock_data_service, mock_datasets): dataset_metadata = SDTMDatasetMetadata(name="SQAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard( - "SQAE", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/sqae.xpt", mock_datasets, @@ -131,13 +110,11 @@ def test_sq_domain(library_metadata, mock_data_service, mock_dataset, mock_datas assert any(var["name"] == "QLABEL" for var in variables) -def test_ap_domain(library_metadata, mock_data_service, mock_dataset, mock_datasets): +def test_ap_domain(library_metadata, mock_data_service, mock_datasets): dataset_metadata = SDTMDatasetMetadata(name="APDM", first_record={"APID": "001"}) variables = get_variables_metadata_from_standard( - "APDM", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/apdm.xpt", mock_datasets, @@ -149,15 +126,15 @@ def test_ap_domain(library_metadata, mock_data_service, mock_dataset, mock_datas assert any(var["name"] == "DMDY" for var in variables) -def test_sqap_domain(library_metadata, mock_data_service, mock_dataset, mock_datasets): - dataset_metadata = SDTMDatasetMetadata(name="SQAP", first_record={"QNAM": "TEST"}) +def test_sqap_domain(library_metadata, mock_data_service, mock_datasets): + dataset_metadata = SDTMDatasetMetadata( + name="SQAPMH", first_record={"QNAM": "TEST", "RDOMAIN": "APMH"} + ) variables = get_variables_metadata_from_standard( - "SQAP", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, - "/path/to/sqap.xpt", + "/path/to/sqapmh.xpt", mock_datasets, ) assert any(var["name"] == "APID" for var in variables) @@ -165,16 +142,12 @@ def test_sqap_domain(library_metadata, mock_data_service, mock_dataset, mock_dat assert any(var["name"] == "RDOMAIN" for var in variables) -def test_findings_about_domain_fa( - library_metadata, mock_data_service, mock_dataset, mock_datasets -): +def test_findings_about_domain_fa(library_metadata, mock_data_service, mock_datasets): """Test Findings About domain includes FINDINGS class variables.""" dataset_metadata = SDTMDatasetMetadata(name="FA", first_record={"DOMAIN": "FA"}) variables = get_variables_metadata_from_standard( - "FA", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/fa.xpt", mock_datasets, @@ -188,7 +161,6 @@ def test_findings_domain_from_model(library_metadata, mock_data_service, mock_da mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="LB", first_record={"DOMAIN": "LB"}) variables = get_variables_metadata_from_standard_model( - domain="LB", dataframe=mock_dataframe, datasets=mock_datasets, dataset_path="/path/to/lb.xpt", @@ -205,7 +177,6 @@ def test_supp_domain_from_model(library_metadata, mock_data_service, mock_datase mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="SUPPAE", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard_model( - domain="SUPPAE", dataframe=mock_dataframe, datasets=mock_datasets, dataset_path="/path/to/suppae.xpt", @@ -222,7 +193,6 @@ def test_sqap_domain_from_model(library_metadata, mock_data_service, mock_datase mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="SQAP", first_record={"QNAM": "TEST"}) variables = get_variables_metadata_from_standard_model( - domain="SQAP", dataframe=mock_dataframe, datasets=mock_datasets, dataset_path="/path/to/suppae.xpt", @@ -239,7 +209,6 @@ def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets mock_dataframe = Mock() dataset_metadata = SDTMDatasetMetadata(name="APDM", first_record={"APID": "001"}) variables = get_variables_metadata_from_standard_model( - domain="APDM", dataframe=mock_dataframe, datasets=mock_datasets, dataset_path="/path/to/apdm.xpt", @@ -253,18 +222,13 @@ def test_ap_domain_from_model(library_metadata, mock_data_service, mock_datasets assert any(var["name"] == "DMDY" for var in variables) -def test_custom_domain_events_class( - library_metadata, mock_data_service, mock_dataset, mock_datasets -): +def test_custom_domain_events_class(library_metadata, mock_data_service, mock_datasets): """Test custom domain detection and variable metadata retrieval for EVENTS class.""" dataset_metadata = SDTMDatasetMetadata(name="ZZ", first_record={"DOMAIN": "ZZ"}) mock_data_service._handle_custom_domains = Mock(return_value="EVENTS") - mock_dataset.columns = ["STUDYID", "DOMAIN", "USUBJID", "ZZTERM", "ZZSEQ"] variables = get_variables_metadata_from_standard( - "ZZ", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/zz.xpt", mock_datasets, @@ -277,17 +241,14 @@ def test_custom_domain_events_class( def test_custom_domain_findings_class( - library_metadata, mock_data_service, mock_dataset, mock_datasets + library_metadata, mock_data_service, mock_datasets ): """Test custom domain detection and variable metadata retrieval for FINDINGS class.""" dataset_metadata = SDTMDatasetMetadata(name="XX", first_record={"DOMAIN": "XX"}) mock_data_service._handle_custom_domains = Mock(return_value="FINDINGS") - mock_dataset.columns = ["STUDYID", "DOMAIN", "USUBJID", "XXTESTCD", "XXORRES"] variables = get_variables_metadata_from_standard( - "XX", library_metadata, mock_data_service, - mock_dataset, dataset_metadata, "/path/to/xx.xpt", mock_datasets, @@ -298,3 +259,159 @@ def test_custom_domain_findings_class( assert any(var["name"] == "USUBJID" for var in variables) assert any(var["name"] == "XXTESTCD" for var in variables) assert any(var["name"] == "XXORRES" for var in variables) + + +mock_datasets_ss = [ + {"filename": "SS11.xpt", "first_record": {"DOMAIN": "SS"}}, +] + + +@patch( + "cdisc_rules_engine.utilities.sdtm_utilities.get_corresponding_datasets", + return_value=mock_datasets_ss, +) +def test_is_split_dataset_from_file(mock_get_corresponding_datasets): + result = ( + SDTMDatasetMetadata( + filename="SS11.xpt", first_record={"DOMAIN": "SS"} + ).is_split, + ) + assert result + + +datasets_tests = [ + ( + {"name": "SS", "first_record": {"RDOMAIN": "SS"}}, + False, + ), + ( + {"name": "SUPPSS", "first_record": {"RDOMAIN": "SS"}}, + True, + ), + ({"name": "SUPPSS1", "first_record": {"RDOMAIN": "SS"}}, True), + ({"name": "SQAPSSS1", "first_record": {"RDOMAIN": "APSS"}}, True), +] + + +@pytest.mark.parametrize("mock_dataset, expected", datasets_tests) +def test_is_supp_dataset(mock_dataset, expected): + result = SDTMDatasetMetadata(**mock_dataset).is_supp + assert ( + result == expected + ), f"Expected {expected} but got {result} for datasets {datasets_tests}" + + +is_ap_tests = [ + ({"first_record": {"DOMAIN": "APFA", "APID": "AP001"}}, True), + ({"first_record": {"DOMAIN": "APXX", "APID": "AP002"}}, True), + ({"first_record": {"DOMAIN": "APQS", "APID": "AP003"}}, True), + ({"first_record": {"DOMAIN": "APFAMH", "APID": "AP004"}}, True), + ({"first_record": {"DOMAIN": "AE"}}, False), + ({"first_record": {"DOMAIN": "LB"}}, False), + ({"first_record": {"DOMAIN": "AP"}}, False), + ({"first_record": {"DOMAIN": "APF"}}, False), + ({"first_record": None}, False), + ({"first_record": {}}, False), + ({}, False), + ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APQS"}}, True), + ({"name": "SQAPQSX", "first_record": {"RDOMAIN": "APQS"}}, True), + ({"name": "SQAPQSXX", "first_record": {"RDOMAIN": "APQS"}}, True), + ({"name": "SUPPQS", "first_record": {"RDOMAIN": "QS"}}, False), + ({"name": "SQAPQS", "first_record": {"RDOMAIN": "AP"}}, False), + ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APF"}}, False), + ({"first_record": {"APID": "AP001"}}, True), + ({"first_record": {"DOMAIN": "AP", "APID": "AP001"}}, True), + ({"first_record": {"DOMAIN": "APF", "APID": "AP001"}}, True), +] + + +@pytest.mark.parametrize("mock_dataset, expected", is_ap_tests) +def test_is_ap_dataset(mock_dataset, expected): + result = SDTMDatasetMetadata(**mock_dataset).is_ap + assert ( + result == expected + ), f"Expected {expected} but got {result} for dataset {mock_dataset}" + + +ap_suffix_tests = [ + ({"first_record": {"DOMAIN": "APFA", "APID": "AP001"}}, "FA"), + ({"first_record": {"DOMAIN": "APXX", "APID": "AP002"}}, "XX"), + ({"first_record": {"DOMAIN": "APQS", "APID": "AP003"}}, "QS"), + ({"first_record": {"DOMAIN": "APLB", "APID": "AP004"}}, "LB"), + ({"first_record": {"DOMAIN": "APFA", "APID": "AP005"}}, "FA"), + ({"first_record": {"DOMAIN": "AE"}}, ""), + ({"first_record": {"DOMAIN": "LB"}}, ""), + ({"first_record": {"DOMAIN": "AP"}}, ""), + ({"first_record": {"DOMAIN": "APF"}}, ""), + ({"first_record": None}, ""), + ({"first_record": {}}, ""), + ({}, ""), + ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APQS"}}, ""), + ({"name": "SQAPQSX", "first_record": {"RDOMAIN": "APQS"}}, ""), + ({"name": "SQAPQSXX", "first_record": {"RDOMAIN": "APQS"}}, ""), + ({"first_record": {"APID": "AP001"}}, ""), + ({"first_record": {"DOMAIN": "AP", "APID": "AP001"}}, ""), + ({"first_record": {"DOMAIN": "APF", "APID": "AP001"}}, ""), +] + + +@pytest.mark.parametrize("mock_dataset, expected", ap_suffix_tests) +def test_ap_suffix_property(mock_dataset, expected): + result = SDTMDatasetMetadata(**mock_dataset).ap_suffix + assert ( + result == expected + ), f"Expected {expected} but got {result} for dataset {mock_dataset}" + + +datasets = [ + SDTMDatasetMetadata(**dataset) + for dataset in [ + {"filename": "SS.xpt", "first_record": {"DOMAIN": "SS"}}, + {"filename": "SS12.xpt", "first_record": {"DOMAIN": "SS"}}, + {"filename": "AE.xpt", "first_record": {"DOMAIN": "AE"}}, + {"filename": "DD.xpt", "first_record": {"DOMAIN": "DD"}}, + {"filename": "EC.xpt", "first_record": {"DOMAIN": "EC"}}, + {"filename": "EX.xpt", "first_record": {"DOMAIN": "EX"}}, + {"filename": "FA.xpt", "first_record": {"DOMAIN": "FA"}}, + {"filename": "FT.xpt", "first_record": {"DOMAIN": "FT"}}, + {"filename": "RS.xpt", "first_record": {"DOMAIN": "RS"}}, + {"filename": "AB.xpt", "first_record": {"DOMAIN": "AB"}}, + {"filename": "AB12.xpt", "first_record": {"DOMAIN": "AB"}}, + ] +] + + +# Parameters for testing each domain +domain_test_cases = [ + ( + "SS", + [ + {"filename": "SS.xpt", "first_record": {"DOMAIN": "SS"}}, + {"filename": "SS12.xpt", "first_record": {"DOMAIN": "SS"}}, + ], + ), + ( + "AB", + [ + {"filename": "AB.xpt", "first_record": {"DOMAIN": "AB"}}, + {"filename": "AB12.xpt", "first_record": {"DOMAIN": "AB"}}, + ], + ), + ("AE", [{"filename": "AE.xpt", "first_record": {"DOMAIN": "AE"}}]), + ("DD", [{"filename": "DD.xpt", "first_record": {"DOMAIN": "DD"}}]), + ("EC", [{"filename": "EC.xpt", "first_record": {"DOMAIN": "EC"}}]), + ("EX", [{"filename": "EX.xpt", "first_record": {"DOMAIN": "EX"}}]), + ("FA", [{"filename": "FA.xpt", "first_record": {"DOMAIN": "FA"}}]), + ("FT", [{"filename": "FT.xpt", "first_record": {"DOMAIN": "FT"}}]), + ("RS", [{"filename": "RS.xpt", "first_record": {"DOMAIN": "RS"}}]), +] + + +@pytest.mark.parametrize("domain, expected_datasets", domain_test_cases) +def test_get_corresponding_datasets(domain, expected_datasets): + result_datasets = get_corresponding_datasets( + datasets, SDTMDatasetMetadata(first_record={"DOMAIN": domain}) + ) + assert result_datasets == [ + SDTMDatasetMetadata(**dataset) for dataset in expected_datasets + ], f"The function should return only datasets matching the '{domain}' domain" diff --git a/tests/unit/test_utilities/test_utils.py b/tests/unit/test_utilities/test_utils.py deleted file mode 100644 index 5cc8577f5..000000000 --- a/tests/unit/test_utilities/test_utils.py +++ /dev/null @@ -1,161 +0,0 @@ -import pytest -from unittest.mock import patch -from cdisc_rules_engine.utilities.utils import ( - get_corresponding_datasets, -) -from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata - -mock_datasets = [ - {"filename": "SS11.xpt", "first_record": {"DOMAIN": "SS"}}, -] - - -@patch( - "cdisc_rules_engine.utilities.utils.get_corresponding_datasets", - return_value=mock_datasets, -) -def test_is_split_dataset_from_file(mock_get_corresponding_datasets): - result = ( - SDTMDatasetMetadata( - filename="SS11.xpt", first_record={"DOMAIN": "SS"} - ).is_split, - ) - assert result - - -datasets_tests = [ - ( - {"name": "SS", "first_record": {"RDOMAIN": "SS"}}, - False, - ), - ( - {"name": "SUPPSS", "first_record": {"RDOMAIN": "SS"}}, - True, - ), - ({"name": "SUPPSS1", "first_record": {"RDOMAIN": "SS"}}, True), - ({"name": "SQAPSSS1", "first_record": {"RDOMAIN": "APSS"}}, True), -] - - -@pytest.mark.parametrize("mock_dataset, expected", datasets_tests) -def test_is_supp_dataset(mock_dataset, expected): - result = SDTMDatasetMetadata(**mock_dataset).is_supp - assert ( - result == expected - ), f"Expected {expected} but got {result} for datasets {mock_datasets}" - - -is_ap_tests = [ - ({"first_record": {"DOMAIN": "APFA", "APID": "AP001"}}, True), - ({"first_record": {"DOMAIN": "APXX", "APID": "AP002"}}, True), - ({"first_record": {"DOMAIN": "APQS", "APID": "AP003"}}, True), - ({"first_record": {"DOMAIN": "APFAMH", "APID": "AP004"}}, True), - ({"first_record": {"DOMAIN": "AE"}}, False), - ({"first_record": {"DOMAIN": "LB"}}, False), - ({"first_record": {"DOMAIN": "AP"}}, False), - ({"first_record": {"DOMAIN": "APF"}}, False), - ({"first_record": None}, False), - ({"first_record": {}}, False), - ({}, False), - ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APQS"}}, True), - ({"name": "SQAPQSX", "first_record": {"RDOMAIN": "APQS"}}, True), - ({"name": "SQAPQSXX", "first_record": {"RDOMAIN": "APQS"}}, True), - ({"name": "SUPPQS", "first_record": {"RDOMAIN": "QS"}}, False), - ({"name": "SQAPQS", "first_record": {"RDOMAIN": "AP"}}, False), - ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APF"}}, False), - ({"first_record": {"APID": "AP001"}}, True), - ({"first_record": {"DOMAIN": "AP", "APID": "AP001"}}, True), - ({"first_record": {"DOMAIN": "APF", "APID": "AP001"}}, True), -] - - -@pytest.mark.parametrize("mock_dataset, expected", is_ap_tests) -def test_is_ap_dataset(mock_dataset, expected): - result = SDTMDatasetMetadata(**mock_dataset).is_ap - assert ( - result == expected - ), f"Expected {expected} but got {result} for dataset {mock_dataset}" - - -ap_suffix_tests = [ - ({"first_record": {"DOMAIN": "APFA", "APID": "AP001"}}, "FA"), - ({"first_record": {"DOMAIN": "APXX", "APID": "AP002"}}, "XX"), - ({"first_record": {"DOMAIN": "APQS", "APID": "AP003"}}, "QS"), - ({"first_record": {"DOMAIN": "APLB", "APID": "AP004"}}, "LB"), - ({"first_record": {"DOMAIN": "APFA", "APID": "AP005"}}, "FA"), - ({"first_record": {"DOMAIN": "AE"}}, ""), - ({"first_record": {"DOMAIN": "LB"}}, ""), - ({"first_record": {"DOMAIN": "AP"}}, ""), - ({"first_record": {"DOMAIN": "APF"}}, ""), - ({"first_record": None}, ""), - ({"first_record": {}}, ""), - ({}, ""), - ({"name": "SQAPQS", "first_record": {"RDOMAIN": "APQS"}}, ""), - ({"name": "SQAPQSX", "first_record": {"RDOMAIN": "APQS"}}, ""), - ({"name": "SQAPQSXX", "first_record": {"RDOMAIN": "APQS"}}, ""), - ({"first_record": {"APID": "AP001"}}, ""), - ({"first_record": {"DOMAIN": "AP", "APID": "AP001"}}, ""), - ({"first_record": {"DOMAIN": "APF", "APID": "AP001"}}, ""), -] - - -@pytest.mark.parametrize("mock_dataset, expected", ap_suffix_tests) -def test_ap_suffix_property(mock_dataset, expected): - result = SDTMDatasetMetadata(**mock_dataset).ap_suffix - assert ( - result == expected - ), f"Expected {expected} but got {result} for dataset {mock_dataset}" - - -datasets = [ - SDTMDatasetMetadata(**dataset) - for dataset in [ - {"filename": "SS.xpt", "first_record": {"DOMAIN": "SS"}}, - {"filename": "SS12.xpt", "first_record": {"DOMAIN": "SS"}}, - {"filename": "AE.xpt", "first_record": {"DOMAIN": "AE"}}, - {"filename": "DD.xpt", "first_record": {"DOMAIN": "DD"}}, - {"filename": "EC.xpt", "first_record": {"DOMAIN": "EC"}}, - {"filename": "EX.xpt", "first_record": {"DOMAIN": "EX"}}, - {"filename": "FA.xpt", "first_record": {"DOMAIN": "FA"}}, - {"filename": "FT.xpt", "first_record": {"DOMAIN": "FT"}}, - {"filename": "RS.xpt", "first_record": {"DOMAIN": "RS"}}, - {"filename": "AB.xpt", "first_record": {"DOMAIN": "AB"}}, - {"filename": "AB12.xpt", "first_record": {"DOMAIN": "AB"}}, - ] -] - - -# Parameters for testing each domain -domain_test_cases = [ - ( - "SS", - [ - {"filename": "SS.xpt", "first_record": {"DOMAIN": "SS"}}, - {"filename": "SS12.xpt", "first_record": {"DOMAIN": "SS"}}, - ], - ), - ( - "AB", - [ - {"filename": "AB.xpt", "first_record": {"DOMAIN": "AB"}}, - {"filename": "AB12.xpt", "first_record": {"DOMAIN": "AB"}}, - ], - ), - ("AE", [{"filename": "AE.xpt", "first_record": {"DOMAIN": "AE"}}]), - ("DD", [{"filename": "DD.xpt", "first_record": {"DOMAIN": "DD"}}]), - ("EC", [{"filename": "EC.xpt", "first_record": {"DOMAIN": "EC"}}]), - ("EX", [{"filename": "EX.xpt", "first_record": {"DOMAIN": "EX"}}]), - ("FA", [{"filename": "FA.xpt", "first_record": {"DOMAIN": "FA"}}]), - ("FT", [{"filename": "FT.xpt", "first_record": {"DOMAIN": "FT"}}]), - ("RS", [{"filename": "RS.xpt", "first_record": {"DOMAIN": "RS"}}]), -] - - -@pytest.mark.parametrize("domain, expected_datasets", domain_test_cases) -def test_get_corresponding_datasets(domain, expected_datasets): - result_datasets = get_corresponding_datasets( - datasets, SDTMDatasetMetadata(first_record={"DOMAIN": domain}) - ) - assert result_datasets == [ - SDTMDatasetMetadata(**dataset) for dataset in expected_datasets - ], f"The function should return only datasets matching the '{domain}' domain"