Skip to content
Merged

Cg0562 #1422

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1200,6 +1200,7 @@ def is_inconsistent_across_dataset(self, other_value):
def is_unique_set(self, other_value):
target = self.replace_prefix(other_value.get("target"))
comparator = other_value.get("comparator")
regex_pattern = other_value.get("regex")
values = [target, comparator]
target_data = flatten_list(self.value, values)
target_names = []
Expand All @@ -1209,6 +1210,13 @@ def is_unique_set(self, other_value):
target_names.append(target_name)
target_names = list(set(target_names))
df_group = self.value[target_names].copy()
if regex_pattern:
for col in df_group.columns:
df_group[col] = df_group[col].apply(
lambda x: (
apply_regex(regex_pattern, x) if isinstance(x, str) and x else x
)
)
df_group = df_group.fillna("_NaN_")
group_sizes = df_group.groupby(target_names).size()
counts = df_group.apply(tuple, axis=1).map(group_sizes)
Expand Down
52 changes: 18 additions & 34 deletions cdisc_rules_engine/operations/base_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,52 +191,36 @@ def _rename_grouping_columns(self, data):
)

def _get_grouping_columns(self) -> List[str]:
if any(item.startswith("$") for item in self.params.grouping):
return self._expand_operation_results_in_grouping(self.params.grouping)
expanded = self._expand_operation_results_in_grouping(self.params.grouping)
if not self.params.grouping_aliases:
return expanded
else:
return (
self.params.grouping
if not self.params.grouping_aliases
else [
(
self.params.grouping_aliases[i]
if 0 <= i < len(self.params.grouping_aliases)
else v
)
for i, v in enumerate(self.params.grouping)
]
)
return [
(
self.params.grouping_aliases[i]
if 0 <= i < len(self.params.grouping_aliases)
else v
)
for i, v in enumerate(expanded)
]

def _expand_operation_results_in_grouping(self, grouping_list):
expanded = []
for item in grouping_list:
if item.startswith("$") and item in self.evaluation_dataset.columns:
if item in self.evaluation_dataset.columns:
operation_col = self.evaluation_dataset[item]
first_val = operation_col.iloc[0]
if operation_col.astype(str).nunique() == 1:
if isinstance(first_val, (list, tuple)):
expanded.extend(first_val)
else:
expanded.append(item)
if (
isinstance(first_val, (list, tuple))
and operation_col.astype(str).nunique() == 1
):
expanded.extend(first_val)
else:
expanded.extend(self._collect_values_from_column(operation_col))
expanded.append(item)
else:
expanded.append(item)
return list(dict.fromkeys(expanded))

def _collect_values_from_column(self, operation_col):
seen = []
for val in operation_col:
if val is not None:
if isinstance(val, (list, tuple)):
for v in val:
if v not in seen:
seen.append(v)
else:
if val not in seen:
seen.append(val)
return seen

def _get_variables_metadata_from_standard(self) -> List[dict]:
# TODO: Update to handle other standard types: adam, cdash, etc.
target_metadata = None
Expand Down
4 changes: 2 additions & 2 deletions cdisc_rules_engine/operations/record_count.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ def _build_effective_grouping(self) -> tuple[list, dict]:
)
effective_grouping = []
for col in grouping_cols:
if col in self.params.dataframe.columns:
sample_val = self.params.dataframe[col].iloc[0]
if col in self.evaluation_dataset.data.columns:
sample_val = self.evaluation_dataset[col].iloc[0]
if isinstance(sample_val, (list, tuple)):
# This is an operation result - expand the list
effective_grouping.extend(sample_val)
Expand Down
2 changes: 1 addition & 1 deletion cdisc_rules_engine/operations/variable_is_null.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def _execute_operation(self):
]
return self.data_service.dataset_implementation().convert_to_series(result)
else:
target_variable = self.params.target.replace("--", self.params.domain, 1)
target_variable = self.params.target
return self._is_target_variable_null(dataframe, target_variable)

def _is_target_variable_null(self, dataframe, target_variable: str) -> bool:
Expand Down
45 changes: 45 additions & 0 deletions cdisc_rules_engine/utilities/rule_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
LibraryMetadataContainer,
)

import copy
import os
from cdisc_rules_engine.constants.classes import (
FINDINGS_ABOUT,
Expand Down Expand Up @@ -597,6 +598,50 @@ def add_comparator_to_rule_conditions(
f"comparator={comparator}, conditions={rule['conditions']}"
)

def _preprocess_operation_params(
self, operation_params: OperationParams, domain_details: dict = None
) -> OperationParams:
# uses shallow copy to not overwrite for subsequent
# operations and avoids costly deepcopy of dataframe
params_copy = copy.copy(operation_params)
current_domain = params_copy.domain
if domain_details.is_supp:
current_domain = domain_details.rdomain
for param_name in vars(params_copy):
if param_name in ("datasets", "dataframe"):
continue
param_value = getattr(params_copy, param_name)
updated_value = self._replace_wildcards_in_value(
param_value, current_domain
)
if updated_value is not param_value:
updated_value = copy.deepcopy(updated_value)
setattr(params_copy, param_name, updated_value)
return params_copy

def _replace_wildcards_in_value(self, value, domain: str):
if value is None:
return value
if isinstance(value, str):
return value.replace("--", domain)
elif isinstance(value, list):
return [self._replace_wildcards_in_value(item, domain) for item in value]
elif isinstance(value, set):
return {self._replace_wildcards_in_value(item, domain) for item in value}
elif isinstance(value, dict):
return {
self._replace_wildcards_in_value(
k, domain
): self._replace_wildcards_in_value(v, domain)
for k, v in value.items()
}
elif isinstance(value, tuple):
return tuple(
self._replace_wildcards_in_value(item, domain) for item in value
)
else:
return value

@staticmethod
def duplicate_conditions_for_all_targets(
conditions: ConditionInterface, targets: List[str]
Expand Down
31 changes: 25 additions & 6 deletions resources/schema/Operator.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
# Check Operator

NOTE: Complementary operators have access to the same parameter arguments unless otherwise stated.

## Relational

Basic value comparisons and presence checks for evaluating equality, inequality, ranges, and whether values exist or are empty.
Expand Down Expand Up @@ -854,17 +856,34 @@ Relationship Integrity Check

> `name` can be a variable containing a list of columns and `value` does not need to be present

> The `regex` parameter allows you to extract portions of values using a regex pattern before checking uniqueness.

> Compare date only (YYYY-MM-DD) for uniqueness

```yaml
Rule Type: Dataset Contents Check against Define XML
Check:
all:
- name: define_dataset_key_sequence # contains list of dataset key columns
operator: is_unique_set
- name: "--REPNUM"
operator: is_not_unique_set
value:
- "USUBJID"
- "--TESTCD"
- "$TIMING_VARIABLES"
regex: '^\d{4}-\d{2}-\d{2}'
```

> Compare by first N characters of a string

```yaml
- name: "ITEM_ID"
operator: is_not_unique_set
value:
- "USUBJID"
- "CATEGORY"
regex: "^.{2}"
```

### is_not_unique_set

Complement of `is_unique_set`
Complement of `is_unique_set`.

> --SEQ is not unique within DOMAIN, USUBJID, and --TESTCD

Expand Down
36 changes: 36 additions & 0 deletions tests/unit/test_check_operators/test_value_set_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,42 @@ def test_is_not_unique_set(target, comparator, dataset_type, expected_result):
assert result.equals(df.convert_to_series(expected_result))


@pytest.mark.parametrize(
    "target, comparator, regex, dataset_type, expected_result",
    [
        (
            "ARM",
            "DTC",
            r"^\d{4}-\d{2}-\d{2}",
            PandasDataset,
            [False, False, False, False],
        ),
        ("ARM", "TAE", None, PandasDataset, [False, False, True, True]),
    ],
)
def test_is_unique_set_with_regex(
    target, comparator, regex, dataset_type, expected_result
):
    """is_unique_set honours the optional regex that pre-transforms values.

    With a date-only pattern the timestamp suffixes are ignored, so the
    (ARM, DTC) pairs are no longer unique; without a regex the raw values
    are grouped as-is.
    """
    raw_data = {
        "ARM": ["PLACEBO", "PLACEBO", "ACTIVE", "ACTIVE"],
        "TAE": [1, 1, 1, 2],
        "DTC": [
            "2024-01-15T10:30:00",
            "2024-01-15T14:45:00",
            "2024-01-16T10:30:00",
            "2024-01-16T14:45:00",
        ],
    }
    dataset = dataset_type.from_dict(raw_data)
    check_args = {"target": target, "comparator": comparator}
    if regex is not None:
        check_args["regex"] = regex
    operator = DataframeType({"value": dataset, "column_prefix_map": {"--": "AR"}})
    outcome = operator.is_unique_set(check_args)
    assert outcome.equals(dataset.convert_to_series(expected_result))


@pytest.mark.parametrize(
"target, comparator, dataset_type, expected_result",
[
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/test_operations/test_variable_is_null.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def test_variable_is_null(
config = ConfigService()
cache = CacheServiceFactory(config).get_cache_service()
operation_params.dataframe = data
operation_params.target = "--VAR"
operation_params.target = "AEVAR"
operation_params.domain = "AE"
mock_data_service.get_dataset.return_value = data
mock_data_service.dataset_implementation = data.__class__
Expand Down
Loading
Loading