Skip to content
14 changes: 2 additions & 12 deletions cdisc_rules_engine/dataset_builders/base_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -197,21 +197,11 @@ def get_library_variables_metadata(self) -> DatasetInterface:
and self.dataset_metadata.rdomain
):
domain = "SUPPQUAL"
elif (
not self.dataset_metadata.domain
and not self.dataset_metadata.rdomain
and "rel" in self.dataset_metadata.name.lower()
):
if self.dataset_metadata.name.lower().startswith(
"ap"
) and self.dataset_metadata.name.lower()[2:].startswith("rel"):
domain = self.dataset_metadata.name[2:]
else:
domain = self.dataset_metadata.name
else:
domain = self.dataset_metadata.domain
variables: List[dict] = sdtm_utilities.get_variables_metadata_from_standard(
domain=domain, library_metadata=self.library_metadata
domain=self.dataset_metadata.unsplit_name,
library_metadata=self.library_metadata,
)
variables_metadata: dict = self.library_metadata.variables_metadata.get(
domain, {}
Expand Down
23 changes: 3 additions & 20 deletions cdisc_rules_engine/operations/base_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,26 +224,9 @@ def _expand_operation_results_in_grouping(self, grouping_list):

def _get_variables_metadata_from_standard(self) -> List[dict]:
# TODO: Update to handle other standard types: adam, cdash, etc.
target_metadata = None
for ds in self.params.datasets:
if ds.unsplit_name == self.params.domain:
target_metadata = ds
break
if (
target_metadata
and hasattr(target_metadata, "is_supp")
and target_metadata.is_supp
):
domain_for_library = "SUPPQUAL"
elif target_metadata and "rel" in target_metadata.name.lower():
if target_metadata.name.lower().startswith(
"ap"
) and target_metadata.name.lower()[2:].startswith("rel"):
domain_for_library = target_metadata.name[2:]
else:
domain_for_library = target_metadata.name
else:
domain_for_library = self.params.domain

# self.params.domain is unsplit_name
domain_for_library = self.params.domain
return sdtm_utilities.get_variables_metadata_from_standard(
domain_for_library,
self.library_metadata,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ class LibraryModelColumnOrder(BaseOperation):
def _execute_operation(self):
"""
Fetches column order for a given domain from the CDISC library.
        self.params.domain is SDTMDatasetMetadata.unsplit_name
Returns it as a Series of lists like:
0 ["STUDYID", "DOMAIN", ...]
1 ["STUDYID", "DOMAIN", ...]
Expand Down
157 changes: 132 additions & 25 deletions cdisc_rules_engine/utilities/sdtm_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,23 @@ def get_tabulation_model_type_and_version(model_link: dict) -> Tuple:


def get_variables_metadata_from_standard(domain, library_metadata): # noqa
add_AP = False
original_domain = domain
if (
domain
and (domain.upper().startswith("SUPP") or domain.upper().startswith("SQ"))
and len(domain) > 2
):
if domain.upper().startswith("SQ"):
parent_domain = domain[2:]
if parent_domain.upper().startswith("AP"):
add_AP = True
domain = "SUPPQUAL"
elif domain and domain.upper().startswith("AP"):
domain = domain[2:]
original_domain = domain
add_AP = True

standard_details = library_metadata.standard_metadata
model_details = library_metadata.model_metadata
is_custom = domain not in standard_details.get("domains", {})
Expand All @@ -70,13 +87,22 @@ def get_variables_metadata_from_standard(domain, library_metadata): # noqa
class_variables_metadata,
timing_metadata,
) = get_allowed_class_variables(model_details, model_class_details)
if add_AP:
ap_class_details = get_class_metadata(model_details, "ASSOCIATED PERSONS")
ap_identifiers = ap_class_details.get("classVariables", [])
identifiers_metadata = [
v
for v in identifiers_metadata + ap_identifiers
if v.get("name") != "USUBJID"
]
identifiers_metadata.sort(key=lambda item: int(item["ordinal"]))
model_variables = []
for var_list in [
identifiers_metadata,
class_variables_metadata,
timing_metadata,
]:
replace_variable_wildcards(var_list, domain, model_variables)
replace_variable_wildcards(var_list, original_domain, model_variables)
# Custom domains only pull from model hierarchy
if is_custom:
variables_metadata = model_variables
Expand All @@ -90,13 +116,21 @@ def get_variables_metadata_from_standard(domain, library_metadata): # noqa
var["name"]: i for i, var in enumerate(variables_metadata)
}
for ig_var in ig_variables:
ig_var_name = ig_var["name"]
if "--" in ig_var["name"]:
ig_var_copy = copy.deepcopy(ig_var)
ig_var_copy["name"] = ig_var_copy["name"].replace(
"--", original_domain
)
ig_var_to_use = ig_var_copy
else:
ig_var_to_use = ig_var
ig_var_name = ig_var_to_use["name"]
if ig_var_name in model_vars_by_name:
variables_metadata[model_vars_by_name[ig_var_name]] = ig_var
variables_metadata[model_vars_by_name[ig_var_name]] = ig_var_to_use
else:
                # if a variable exists in the IG but not in the model,
                # insert it at the end of its section
ig_var_role = ig_var.get("role")
ig_var_role = ig_var_to_use.get("role")
if ig_var_role == "Identifier":
identifiers_length = len(identifiers_metadata)
insertion_point = identifiers_length
Expand All @@ -107,12 +141,28 @@ def get_variables_metadata_from_standard(domain, library_metadata): # noqa
insertion_point = (
len(variables_metadata) - timing_metadata_length
)
variables_metadata.insert(insertion_point, ig_var)
variables_metadata.insert(insertion_point, ig_var_to_use)
model_vars_by_name = {
var["name"]: i for i, var in enumerate(variables_metadata)
}
else:
variables_metadata = ig_variables
if add_AP:
ap_class_details = get_class_metadata(
model_details, "ASSOCIATED PERSONS"
)
ap_identifiers = ap_class_details.get("classVariables", [])
ig_variables = [
v
for v in ig_variables + ap_identifiers
if v.get("name") != "USUBJID"
]
ig_variables.sort(key=lambda item: int(item["ordinal"]))
variables_metadata = []
replace_variable_wildcards(
ig_variables, original_domain, variables_metadata
)
else:
variables_metadata = ig_variables
return variables_metadata


Expand Down Expand Up @@ -238,12 +288,22 @@ def get_variables_metadata_from_standard_model( # noqa
classes outside of general observation, we check the model for their definition
if they are not there, differ to the standard definition of the domain
"""
add_AP = False
original_domain = domain
if (
domain
and (domain.upper().startswith("SUPP") or domain.upper().startswith("SQ"))
and len(domain) > 2
):
if domain.upper().startswith("SQ"):
parent_domain = domain[2:]
if parent_domain.upper().startswith("AP"):
add_AP = True
domain = "SUPPQUAL"
elif domain and domain.upper().startswith("AP"):
domain = domain[2:]
original_domain = domain
add_AP = True
standard_details = library_metadata.standard_metadata
model_details = library_metadata.model_metadata

Expand All @@ -258,38 +318,85 @@ def get_variables_metadata_from_standard_model( # noqa
class_variables_metadata,
timing_metadata,
) = get_allowed_class_variables(model_details, model_class_details)
if add_AP:
ap_class_details = get_class_metadata(model_details, "ASSOCIATED PERSONS")
ap_identifiers = ap_class_details.get("classVariables", [])
identifiers_metadata = identifiers_metadata + ap_identifiers
# Remove USUBJID from identifiers and re-sort
identifiers_metadata = [
v for v in identifiers_metadata if v.get("name") != "USUBJID"
]
identifiers_metadata.sort(key=lambda item: int(item["ordinal"]))
variables_metadata = []
if identifiers_metadata:
variables_metadata = identifiers_metadata
variables_metadata = variables_metadata + class_variables_metadata
if timing_metadata:
variables_metadata = variables_metadata + timing_metadata
for var_list in [
identifiers_metadata,
class_variables_metadata,
timing_metadata,
]:
replace_variable_wildcards(var_list, original_domain, variables_metadata)
return variables_metadata
else:
# First, try to get class metadata and check for classVariables i.e. AP class
# First, try to get class metadata and check for classVariables
class_details = get_class_metadata(model_details, class_name)
class_variables = class_details.get("classVariables", [])
if class_variables:
if add_AP:
ap_class_details = get_class_metadata(
model_details, "ASSOCIATED PERSONS"
)
ap_identifiers = ap_class_details.get("classVariables", [])
# Filter out USUBJID from AP identifiers only, then add to class_variables
filtered_ap_identifiers = [
v for v in ap_identifiers if v.get("name") != "USUBJID"
]
class_variables = class_variables + filtered_ap_identifiers
class_variables.sort(key=lambda item: int(item["ordinal"]))
return class_variables
variables_metadata = []
replace_variable_wildcards(
class_variables, original_domain, variables_metadata
)
return variables_metadata
else:
# Second, check if domain exists in model datasets
domain_details = get_model_domain_metadata(model_details, domain)
if domain_details:
dataset_variables = domain_details.get("datasetVariables", [])
if dataset_variables:
dataset_variables.sort(key=lambda item: int(item["ordinal"]))
return dataset_variables
dataset_variables.sort(key=lambda item: int(item["ordinal"]))
if add_AP:
ap_class_details = get_class_metadata(
model_details, "ASSOCIATED PERSONS"
)
ap_identifiers = ap_class_details.get("classVariables", [])
dataset_variables = [
v
for v in dataset_variables + ap_identifiers
if v.get("name") != "USUBJID"
]
variables_metadata = []
replace_variable_wildcards(
dataset_variables, original_domain, variables_metadata
)
variables_metadata.sort(key=lambda item: int(item["ordinal"]))
return variables_metadata
# Third, fall back to standard datasets
for cls in standard_details.get("classes", []):
for dataset in cls.get("datasets", []):
if dataset.get("name") == domain:
dataset_variables = dataset.get("datasetVariables", [])
if dataset_variables:
dataset_variables.sort(
key=lambda item: int(item["ordinal"])
)
return dataset_variables
if IG_domain_details:
dataset_variables = IG_domain_details.get("datasetVariables", [])
dataset_variables.sort(key=lambda item: int(item["ordinal"]))
if add_AP:
ap_class_details = get_class_metadata(
model_details, "ASSOCIATED PERSONS"
)
ap_identifiers = ap_class_details.get("classVariables", [])
dataset_variables = [
v
for v in dataset_variables + ap_identifiers
if v.get("name") != "USUBJID"
]
variables_metadata = []
replace_variable_wildcards(
dataset_variables, original_domain, variables_metadata
)
return variables_metadata
return None


Expand Down
2 changes: 1 addition & 1 deletion env.example
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
CDISC_LIBRARY_API_KEY=your_api_key_here
DATASET_SIZE_THRESHOLD=10485760 # max dataset size in bytes to force dask implementation
MAX_REPORT_ROWS = 10 # integer for maximum number of issues per excel sheet (plus headers) in result report
MAX_REPORT_ROWS = 10 # integer for maximum number of issues per excel sheet (plus headers) in result report. Defaults to 10000.
MAX_ERRORS_PER_RULE = (10, True) # Tuple for maximum number of errors to report per rule during a validation run. Also has a per dataset flag described as second bool value in readme. example value
Loading
Loading