Skip to content
Merged

FB0405 #1589

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions cdisc_rules_engine/dataset_builders/base_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ def get_define_xml_item_group_metadata_for_dataset(
"define_dataset_is_non_standard"
"define_dataset_variables"
"define_dataset_key_sequence"
"define_dataset_has_no_data"
"""

define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader(
Expand All @@ -142,6 +143,7 @@ def get_define_xml_item_group_metadata_for_domain(self, domain: str) -> List[dic
"define_dataset_is_non_standard"
"define_dataset_variables"
"define_dataset_key_sequence"
"define_dataset_has_no_data"
"""

define_xml_reader = DefineXMLReaderFactory.get_define_xml_reader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@
from cdisc_rules_engine.dataset_builders.variables_metadata_values_dataset_builder import (
ValueCheckVariableMetadataDatasetBuilder,
)
from cdisc_rules_engine.dataset_builders.domain_list_with_define_builder import (
DomainListWithDefineDatasetBuilder,
)
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
from cdisc_rules_engine.enums.rule_types import RuleTypes

Expand All @@ -67,6 +70,7 @@ class DatasetBuilderFactory(FactoryInterface):
RuleTypes.DATASET_METADATA_CHECK_AGAINST_DEFINE.value: DatasetMetadataDefineDatasetBuilder,
RuleTypes.VARIABLE_METADATA_CHECK.value: VariablesMetadataDatasetBuilder,
RuleTypes.DOMAIN_PRESENCE_CHECK.value: DomainListDatasetBuilder,
RuleTypes.DOMAIN_PRESENCE_CHECK_AGAINST_DEFINE.value: DomainListWithDefineDatasetBuilder,
RuleTypes.DEFINE_ITEM_METADATA_CHECK.value: DefineVariablesDatasetBuilder,
RuleTypes.VARIABLE_METADATA_CHECK_AGAINST_DEFINE.value: VariablesMetadataWithDefineDatasetBuilder,
RuleTypes.DATASET_CONTENTS_CHECK_AGAINST_DEFINE_AND_LIBRARY.value: ContentsDatasetBuilder,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
from cdisc_rules_engine.models.dataset import DatasetInterface
from cdisc_rules_engine.services import logger
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder
import os
import numpy as np


Expand All @@ -17,6 +15,7 @@ def build(self):
dataset_name - Name of the dataset
dataset_label - Label for the dataset
dataset_domain - Domain of the dataset
dataset_columns - List of columns in the dataset
is_ap - Whether the domain is an AP domain
ap_suffix - The 2-character suffix from AP domains

Expand All @@ -30,12 +29,11 @@ def build(self):
define_dataset_is_non_standard - whether a dataset is a standard
define_dataset_variables - dataset variables
define_dataset_key_sequence - dataset key sequence

...,
define_dataset_has_no_data
"""
# 1. Build define xml dataframe
define_df = self._get_define_xml_dataframe()
# )

# 2. Build dataset dataframe
dataset_df = self._get_dataset_dataframe()
if define_df.empty or dataset_df.empty:
Expand All @@ -49,26 +47,11 @@ def build(self):
right_on=["define_dataset_name", "define_dataset_location"],
how="outer",
)

# 4. Remove NaN
merged._data = merged._data.astype(object).replace({np.nan: None})
# 5. remove unused rows, replace rows with target row
merged_cleaned = merged.dropna(subset=["dataset_name"])
dataset_filename = (
os.path.basename(self.dataset_metadata.full_path).lower()
if self.dataset_metadata.full_path
else None
)
matching_row: DatasetInterface = merged_cleaned[
merged_cleaned["dataset_location"].str.lower() == dataset_filename
]
if matching_row.empty:
# when using DASK dataset_filename refers to temp parquet filename
matching_row: DatasetInterface = merged_cleaned[
merged_cleaned["dataset_location"].str.lower()
== self.dataset_metadata.original_path.lower()
]
for column in merged.columns:
merged[column] = matching_row[column].iloc[0]

# 5. Return all rows (one per dataset)
return merged

def _get_define_xml_dataframe(self):
Expand All @@ -80,6 +63,7 @@ def _get_define_xml_dataframe(self):
"define_dataset_class",
"define_dataset_structure",
"define_dataset_is_non_standard",
"define_dataset_has_no_data",
]
define_metadata = self.get_define_metadata()
if not define_metadata:
Expand All @@ -92,6 +76,8 @@ def _ensure_required_columns(self, dataset_df, dataset_col_order):
dataset_df["dataset_size"] = None
if "is_ap" not in dataset_df.columns:
dataset_df["is_ap"] = False
if "dataset_columns" not in dataset_df.columns:
dataset_df["dataset_columns"] = None
if "ap_suffix" not in dataset_df.columns:
dataset_df["ap_suffix"] = ""
return self.dataset_implementation(dataset_df[dataset_col_order])
Expand All @@ -103,6 +89,7 @@ def _get_dataset_dataframe(self):
"dataset_name",
"dataset_label",
"dataset_domain",
"dataset_columns",
"is_ap",
"ap_suffix",
]
Expand All @@ -121,6 +108,12 @@ def _get_dataset_dataframe(self):
ds_metadata.data["dataset_domain"] = getattr(
dataset, "domain", None
)
if dataset.first_record:
ds_metadata.data["dataset_columns"] = [
list(dataset.first_record.keys())
]
else:
ds_metadata.data["dataset_columns"] = [[]]
except Exception as e:
logger.trace(e)
logger.error(f"Error: {e}. Error message: {str(e)}")
Expand All @@ -136,7 +129,6 @@ def _get_dataset_dataframe(self):
data_col_mapping = {
"filename": "dataset_location",
"label": "dataset_label",
"domain": "dataset_name",
}
dataset_df = datasets.rename(columns=data_col_mapping)
dataset_df = self._ensure_required_columns(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def build(self):
"define_dataset_is_non_standard"
"define_dataset_variables"
"define_dataset_key_sequence"
"define_dataset_has_no_data"
"""
item_group_metadata: List[dict] = (
self.get_define_xml_item_group_metadata_for_domain(
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from cdisc_rules_engine.dataset_builders.base_dataset_builder import BaseDatasetBuilder


class DomainListWithDefineDatasetBuilder(BaseDatasetBuilder):
    """Builds a study-level dataset listing every dataset declared in Define-XML."""

    def build(self):
        """
        Returns a dataframe with one row per dataset in Define-XML.

        Every Define-XML dataset entry is paired with the file name of the
        submitted dataset whose unsplit name equals ``define_dataset_name``.
        Entries without a matching submitted dataset get ``filename`` = None.
        When no Define-XML metadata is available, an empty dataset is returned.

        Columns:
        - domain: The domain name
        - filename: The file name if the dataset exists, None otherwise
        - define_dataset_name
        - define_dataset_label
        - define_dataset_location
        - define_dataset_domain
        - define_dataset_class
        - define_dataset_structure
        - define_dataset_is_non_standard
        - define_dataset_has_no_data
        - define_dataset_key_sequence
        - define_dataset_variables

        Dataset example:
          domain filename define_dataset_name define_dataset_has_no_data
        0 AE     ae.xpt   AE                  False
        1 EC     ec.xpt   EC                  False
        2 SE     None     SE                  True
        """
        # Lookup of submitted file names keyed by unsplit dataset name.
        domain_files = {ds.unsplit_name: ds.filename for ds in self.datasets}

        # Guard against missing metadata (None) so the loop below cannot
        # raise TypeError; an empty list behaves exactly as before.
        all_define_metadata = self.get_define_metadata() or []

        records = []
        for define_item in all_define_metadata:
            domain_name = define_item.get("define_dataset_name", "")
            records.append(
                {
                    "domain": domain_name,
                    # None when no submitted dataset matches this entry.
                    "filename": domain_files.get(domain_name),
                    # Define-XML metadata is spread last, as in the original.
                    **define_item,
                }
            )

        return self.dataset_implementation.from_records(records)
6 changes: 6 additions & 0 deletions cdisc_rules_engine/enums/domain_presence_values.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from cdisc_rules_engine.enums.base_enum import BaseEnum


class DomainPresenceValues(BaseEnum):
    """Placeholder values reported in error objects of domain presence rules.

    The error-generation code substitutes these values when the rule type
    contains "domain presence".
    """

    # Assigned to the `dataset` field of the error object and of each error.
    DATASET = "STUDY"
    # Assigned to the `row` field of each error (an empty string).
    RECORD = ""
1 change: 1 addition & 0 deletions cdisc_rules_engine/enums/rule_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ class RuleTypes(BaseEnum):
DEFINE_ITEM_GROUP_METADATA_CHECK = "Define Item Group Metadata Check"
DEFINE_ITEM_METADATA_CHECK = "Define Item Metadata Check"
DOMAIN_PRESENCE_CHECK = "Domain Presence Check"
DOMAIN_PRESENCE_CHECK_AGAINST_DEFINE = "Domain Presence Check against Define XML"
JSONATA = "JSONata"
VALUE_LEVEL_METADATA_CHECK_AGAINST_DEFINE = (
"Value Level Metadata Check against Define XML"
Expand Down
6 changes: 6 additions & 0 deletions cdisc_rules_engine/models/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
SOURCE_ROW_NUMBER,
)
from cdisc_rules_engine.enums.sensitivity import Sensitivity
from cdisc_rules_engine.enums.domain_presence_values import DomainPresenceValues
from cdisc_rules_engine.models.sdtm_dataset_metadata import SDTMDatasetMetadata
from cdisc_rules_engine.models.dataset_variable import DatasetVariable
from cdisc_rules_engine.models.validation_error_container import (
Expand Down Expand Up @@ -62,6 +63,11 @@ def generate_dataset_error_objects(self, message: str, results: pd.Series):
error_object = self.generate_targeted_error_object(
target_names, rows_with_error, message
)
if "domain presence" in self.rule.get("rule_type", "").lower():
error_object.dataset = DomainPresenceValues.DATASET.value
for error in error_object.errors:
error.dataset = DomainPresenceValues.DATASET.value
error.row = DomainPresenceValues.RECORD.value
self.output_container.append(error_object.to_representation())

@rule_action(params={"message": FIELD_TEXT})
Expand Down
8 changes: 8 additions & 0 deletions resources/schema/MetaVariables.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,16 @@ ItemGroupDef.leaf.href

## define_dataset_name

ItemGroupDef.Name

## define_dataset_domain

ItemGroupDef.Domain

## define_dataset_has_no_data

ItemGroupDef.HasNoData

## define_dataset_structure

ItemGroupDef.Structure
Expand Down
4 changes: 4 additions & 0 deletions resources/schema/Rule_Type.json
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@
"const": "Domain Presence Check",
"title": "Content domain presence at study level"
},
{
"const": "Domain Presence Check against Define XML",
"title": "Content domain presence at study level with define xml metadata at dataset level"
},
{
"const": "JSON Schema Check",
"title": "Apply JSON schema validation to a JSON file"
Expand Down
60 changes: 51 additions & 9 deletions resources/schema/Rule_Type.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Columns are the columns within the original dataset along with the following col
- `dataset_domain`
- `define_dataset_class`
- `define_dataset_domain`
- `define_dataset_has_no_data`
- `define_dataset_is_non_standard`
- `define_dataset_key_sequence`
- `define_dataset_label`
Expand All @@ -48,22 +49,26 @@ Columns are the columns within the original dataset along with the following col

## Dataset Metadata Check against Define XML

Returns a new dataset that contains one row per dataset in the study. The Define-XML dataset metadata is attached to each row.

#### Columns

- `dataset_size`
- `dataset_location`
- `dataset_name`
- `dataset_label`
- `dataset_domain`
- `define_dataset_name`
- `dataset_columns`
- `define_dataset_class`
- `define_dataset_domain`
- `define_dataset_has_no_data`
- `define_dataset_is_non_standard`
- `define_dataset_key_sequence`
- `define_dataset_label`
- `define_dataset_location`
- `define_dataset_domain`
- `define_dataset_class`
- `define_dataset_name`
- `define_dataset_structure`
- `define_dataset_is_non_standard`
- `define_dataset_variables`
- `define_dataset_key_sequence`

#### Rule Macro

Expand Down Expand Up @@ -92,14 +97,16 @@ any:

#### Columns

- `define_dataset_name`
- `define_dataset_class`
- `define_dataset_domain`
- `define_dataset_has_no_data`
- `define_dataset_is_non_standard`
- `define_dataset_key_sequence`
- `define_dataset_label`
- `define_dataset_location`
- `define_dataset_class`
- `define_dataset_name`
- `define_dataset_structure`
- `define_dataset_is_non_standard`
- `define_dataset_variables`
- `define_dataset_key_sequence`

## Define Item Metadata Check

Expand Down Expand Up @@ -179,6 +186,41 @@ all:
operator: not_exists
```

## Domain Presence Check against Define XML

#### Columns

One row per dataset defined in Define-XML:

- `domain`
- `filename` - The file name if the dataset exists, null otherwise
- `define_dataset_name`
- `define_dataset_label`
- `define_dataset_location`
- `define_dataset_domain`
- `define_dataset_class`
- `define_dataset_structure`
- `define_dataset_is_non_standard`
- `define_dataset_has_no_data`
- `define_dataset_key_sequence`
- `define_dataset_variables`

#### Example

Check whether the SE domain is defined in Define-XML without `HasNoData="Yes"` but the dataset file does not exist:

```yaml
all:
- name: define_dataset_name
operator: equal_to
value: "SE"
- name: define_dataset_has_no_data
operator: equal_to
value: False
- name: filename
operator: not_exists
```

## JSONata

Apply a JSONata query to a JSON file. [JSONata documentation](https://docs.jsonata.org)
Expand Down
Loading
Loading