Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 18 additions & 7 deletions cdisc_rules_engine/dataset_builders/base_dataset_builder.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am getting an error when I run Marcelina's data.
On line 159

        """
        Gets Define XML variables metadata.
        """
        define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader(
            self.dataset_path, self.define_xml_path, self.data_service, self.cache
        )
        return define_xml_reader.extract_variables_metadata(
            domain_name=self.dataset_metadata.domain
        )

extract_variables_metadata is being called with the None for domain for SUPPEC. We need a check for self.dataset_metadata.is_supp() and if so call extract_variables_metadata with the rdomain property. You could also do a check if domain exists, and if not, use rdomain.

the other issue I see would occur downstream from this: _get_domain_metadata() in base_define_xml__reader is looking only based on domain and grabbing the first that matches. This will be an issue when differentiating Supp-- from Parents.
image
I suspect passing the name and the domain/rdomain may be necessary and using the Name and the Domain property of the itemgroupdef to pick which metadata to grab would be best plan of attack

Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
get_corresponding_datasets,
tag_source,
)
from typing import List, Iterable
from typing import List, Iterable, Optional
from cdisc_rules_engine.utilities import sdtm_utilities
from cdisc_rules_engine.utilities.rule_processor import RuleProcessor
from cdisc_rules_engine.models.dataset.dataset_interface import DatasetInterface
Expand Down Expand Up @@ -156,9 +156,17 @@ def get_define_xml_variables_metadata(self) -> List[dict]:
define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader(
self.dataset_path, self.define_xml_path, self.data_service, self.cache
)
return define_xml_reader.extract_variables_metadata(
domain_name=self.dataset_metadata.domain
)
# If domain is not set and this is a SUPP domain, use rdomain
domain = self.dataset_metadata.domain
if not domain and getattr(self.dataset_metadata, "is_supp", False):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is the problem I described in my comment above. "the other issue I see would occur downstream from this: _get_domain_metadata() in base_define_xml__reader is looking only based on domain and grabbing the first that matches. This will be an issue when differentiating Supp-- from Parents."
image
we need to update extract_variables_metadata to not only use domain but also name as if we use the rdomain--it will grab the parent as they both have domain set to the same thing. There is a name property 'SUPP--' that will be the same as self.dataset_metadata.name. This can be used in the case of supps to correctly retreive the supp data and not the parent dataset metadata

domain = getattr(self.dataset_metadata, "rdomain", None)
name = getattr(self.dataset_metadata, "name", None)
return define_xml_reader.extract_variables_metadata(
domain_name=domain, name=name
)
if not domain:
return []
return define_xml_reader.extract_variables_metadata(domain_name=domain)

def get_define_xml_value_level_metadata(self) -> List[dict]:
"""
Expand Down Expand Up @@ -205,13 +213,16 @@ def get_library_variables_metadata(self) -> DatasetInterface:
variables: List[dict] = sdtm_utilities.get_variables_metadata_from_standard(
domain=domain, library_metadata=self.library_metadata
)
variables_metadata: dict = self.library_metadata.variables_metadata.get(
domain, {}
)
for variable in variables:
variable["ccode"] = ""
if variable.get("codelistSubmissionValues"):
variable_metadata: Optional[dict] = variables_metadata.get(variable["name"])
if variable_metadata:
if "_links" in variable and "codelist" in variable["_links"]:
first_codelist = variable["_links"]["codelist"][0]
href = first_codelist["href"]
codelist_code = href.split("/")[-1]
codelist_code = first_codelist["href"].split("/")[-1]
variable["ccode"] = codelist_code
if "role" not in variable:
variable["role"] = ""
Expand Down
2 changes: 0 additions & 2 deletions cdisc_rules_engine/rules_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,6 @@ def validate_rule(
elif (
rule.get("rule_type")
== RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE.value
or rule.get("rule_type")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you please add details to the PR explaining why this check was removed and summarize your overall work?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After I fixed the dataset builder to correctly load library_variable_ccode the rule still wasn't working as expected, the equality checks just didn't work.

I've started diving in and noticed that after calling add_comparator_to_rule_conditions on this rule it becomes corrupt:

Image

Pay attention to the conditions 2 and 3 - comparator values become define_library_variable_ccode whereas they should be library_variable_ccode and define_variable_ccode

== RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE_XML_AND_LIBRARY.value
):
self.rule_processor.add_comparator_to_rule_conditions(
rule, comparator=None, target_prefix="define_"
Expand Down
21 changes: 15 additions & 6 deletions cdisc_rules_engine/services/define_xml/base_define_xml_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,13 +157,15 @@ def extract_domain_metadata(self, domain_name: str = None) -> dict:
return domain_metadata_dict

@cached("define-variables-metadata")
def extract_variables_metadata(self, domain_name: str = None) -> List[dict]:
def extract_variables_metadata(
self, domain_name: str = None, name: str = None
) -> List[dict]:
logger.info(
f"Extracting variables metadata from Define-XML. domain_name={domain_name}"
)
try:
metadata = self._odm_loader.MetaDataVersion()
domain_metadata = self._get_domain_metadata(metadata, domain_name)
domain_metadata = self._get_domain_metadata(metadata, domain_name, name)
variables_metadata = []
codelist_map = self._get_codelist_def_map(metadata.CodeList)
for index, itemref in enumerate(domain_metadata.ItemRef):
Expand Down Expand Up @@ -267,11 +269,18 @@ def _get_dataset_metadata(self, metadata, dataset_name):
f"Dataset {dataset_name} is not found in Define XML"
)

def _get_domain_metadata(self, metadata, domain_name):
def _get_domain_metadata(self, metadata, domain_name, name: str = None):
try:
domain_metadata = next(
item for item in metadata.ItemGroupDef if item.Domain == domain_name
)
if name:
domain_metadata = next(
item
for item in metadata.ItemGroupDef
if item.Domain == domain_name and item.Name == name
)
else:
domain_metadata = next(
item for item in metadata.ItemGroupDef if item.Domain == domain_name
)
return domain_metadata
except StopIteration:
raise DomainNotFoundInDefineXMLError(
Expand Down
11 changes: 10 additions & 1 deletion cdisc_rules_engine/utilities/decorators.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,20 @@ def cached(cache_key: str): # noqa: C901
"""

def format_cache_key(
key: str, args=[], study_id=None, data_bundle_id=None, domain_name=None
key: str,
args=[],
study_id=None,
data_bundle_id=None,
domain_name=None,
name=None,
):
"""
If a study_id and data_bundle_id are available,
cache_key = {study_id}/{data_bundle_id}/key
else the function just returns the provided cache key.
"""
if name:
key = f"{name}/" + key
if domain_name:
key = f"{domain_name}/" + key
if data_bundle_id:
Expand Down Expand Up @@ -85,6 +92,7 @@ def inner(*args, **kwargs):
if hasattr(instance, "domain")
else kwargs.get("domain_name")
)
name = instance.name if hasattr(instance, "name") else kwargs.get("name")
if (
hasattr(instance, "cache_service")
and instance.cache_service is not None
Expand All @@ -95,6 +103,7 @@ def inner(*args, **kwargs):
study_id=study_id,
data_bundle_id=data_bundle_id,
domain_name=domain_name,
name=name,
)
cached_data = instance.cache_service.get(key)
if cached_data is not None:
Expand Down
131 changes: 131 additions & 0 deletions tests/QARegressionTests/test_Issues/test_CoreIssue1421.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import os
import subprocess
import openpyxl
import pytest
from conftest import get_python_executable
from QARegressionTests.globals import (
dataset_details_sheet,
issue_datails_sheet,
rules_report_sheet,
issue_sheet_variable_column,
issue_sheet_coreid_column,
)


@pytest.mark.regression
def test_validate_define_xml_against_lib_metadata():
command = [
f"{get_python_executable()}",
"-m",
"core",
"validate",
"-s",
"sdtmig",
"-v",
"3-4",
"-dp",
os.path.join(
"tests",
"resources",
"CoreIssue1421",
"Dataset.json",
),
"-lr",
os.path.join("tests", "resources", "CoreIssue1421", "Rule.yml"),
"-dxp",
os.path.join("tests", "resources", "CoreIssue1421", "Define.xml"),
]
subprocess.run(command, check=True)

# Get the latest created Excel file
files = os.listdir()
excel_files = [
file
for file in files
if file.startswith("CORE-Report-") and file.endswith(".xlsx")
]
excel_file_path = sorted(excel_files)[-1]
# Open the Excel file
workbook = openpyxl.load_workbook(excel_file_path)

# Go to the "Issue Details" sheet
sheet = workbook[issue_datails_sheet]

variables_values_column = sheet[issue_sheet_variable_column]
variables_values = [
cell.value for cell in variables_values_column[1:] if cell.value is not None
]
assert len(variables_values) == 1
for value in variables_values:
assert len(value.split(",")) == 6

variables_names_column = sheet["H"]
variables_names_values = [
cell.value for cell in variables_names_column[1:] if cell.value is not None
]
assert len(variables_names_values) == 1
for value in variables_names_values:
assert len(value.split(",")) == 6

dataset_column = sheet["D"]
dataset_column_values = [
cell.value for cell in dataset_column[1:] if cell.value is not None
]
assert sorted(set(dataset_column_values)) == ["dm.xpt"]

core_id_column = sheet[issue_sheet_coreid_column]
core_id_column_values = [
cell.value for cell in core_id_column[1:] if cell.value is not None
]
assert set(core_id_column_values) == {"CDISC.SDTMIG.CG0999"}

# Go to the "Rules Report" sheet
rules_values = [
row for row in workbook[rules_report_sheet].iter_rows(values_only=True)
][1:]
rules_values = [row for row in rules_values if any(row)]
assert rules_values[0][0] == "CDISC.SDTMIG.CG0999"
assert "SUCCESS" in rules_values[0]
assert (
rules_values[0][4]
== "Issue with codelist definition in the Define-XML document."
)

# Go to the "Dataset Details" sheet
dataset_sheet = workbook[dataset_details_sheet]
dataset_values = [row for row in dataset_sheet.iter_rows(values_only=True)][1:]
dataset_values = [row for row in dataset_values if any(row)]
assert len(dataset_values) > 0
dataset_names = set(row[0] for row in dataset_values if row[0] is not None)
assert dataset_names == {"ae.xpt", "dm.xpt", "ec.xpt", "ex.xpt", "suppec.xpt"}
expected_records = {
"ae.xpt": 74,
"dm.xpt": 18,
"ec.xpt": 1590,
"ex.xpt": 1583,
"suppec.xpt": 13,
}
for row in dataset_values:
dataset_name = row[0]
records_count = row[-1]
assert records_count == expected_records[dataset_name]

# Go to the "Issue Summary" sheet
issue_summary_sheet = workbook["Issue Summary"]
summary_values = [row for row in issue_summary_sheet.iter_rows(values_only=True)][
1:
]
summary_values = [row for row in summary_values if any(row)]
assert len(summary_values) == 1
core_ids = set(row[1] for row in summary_values if row[1] is not None)
assert core_ids == {"CDISC.SDTMIG.CG0999"}
# Check Message and dataset columns
assert (
summary_values[0][2]
== "Issue with codelist definition in the Define-XML document."
)
assert summary_values[0][0] == "dm.xpt"

# Delete the excel file
if os.path.exists(excel_file_path):
os.remove(excel_file_path)
Loading
Loading