Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ def build(self):
variable_label
variable_size
variable_data_type
variable_is_empty
variable_has_empty_values
define_variable_name,
define_variable_label,
define_variable_data_type,
Expand All @@ -29,7 +31,6 @@ def build(self):
define_variable_codelist_coded_values,
define_variable_codelist_coded_codes,
define_variable_mandatory,
variable_has_empty_values
library_variable_name,
library_variable_label,
library_variable_data_type,
Expand Down Expand Up @@ -82,24 +83,22 @@ def build(self):
right_on="library_variable_name",
).fillna("")

final_dataframe["variable_has_empty_values"] = final_dataframe.apply(
lambda row: self.variable_has_null_values(
(
row["variable_name"]
if row["variable_name"] != ""
else row["library_variable_name"]
final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = (
final_dataframe.apply(
lambda row: self.get_variable_null_stats(
row["variable_name"], dataset_contents
),
dataset_contents,
),
axis=1,
axis=1,
result_type="expand",
)
)

return final_dataframe

def variable_has_null_values(
def get_variable_null_stats(
self, variable: str, content: DatasetInterface
) -> bool:
) -> tuple[bool, bool]:
if variable not in content:
return True
series = content[variable]
return series.mask(series == "").isnull().any()
return True, True
series = content[variable].mask(content[variable] == "")
return series.isnull().any(), series.isnull().all()
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def build(self):
variable_size
variable_data_type
variable_has_empty_values
variable_is_empty
library_variable_name,
library_variable_label,
library_variable_data_type,
Expand Down Expand Up @@ -57,18 +58,20 @@ def build(self):
right_on="library_variable_name",
).fillna("")

data["variable_has_empty_values"] = data.apply(
lambda row: self.variable_has_null_values(
data[["variable_has_empty_values", "variable_is_empty"]] = data.apply(
lambda row: self.get_variable_null_stats(
row["variable_name"], dataset_contents
),
axis=1,
result_type="expand",
)

return data

def variable_has_null_values(
def get_variable_null_stats(
self, variable: str, content: DatasetInterface
) -> bool:
) -> tuple[bool, bool]:
if variable not in content:
return True
series = content[variable]
return series.mask(series == "").isnull().any()
return True, True
series = content[variable].mask(content[variable] == "")
return series.isnull().any(), series.isnull().all()
1 change: 1 addition & 0 deletions cdisc_rules_engine/models/operation_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class OperationParams:
original_target: str = None
regex: str = None
returntype: str = None
source: str = None
target: str = None
value_is_reference: bool = False
namespace: str = None
Expand Down
4 changes: 2 additions & 2 deletions cdisc_rules_engine/operations/base_operation.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,10 +173,10 @@ def _filter_data(self, data):
def _is_wildcard_pattern(self, value: str) -> bool:
if not isinstance(value, str):
return False
return value.endswith("%")
return value.endswith("&")

def _apply_wildcard_filter(self, series: pd.Series, pattern: str) -> pd.Series:
prefix = pattern.rstrip("%")
prefix = pattern.rstrip("&")
result = series.str.startswith(prefix, na=False)
return result

Expand Down
21 changes: 9 additions & 12 deletions cdisc_rules_engine/operations/variable_is_null.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,16 @@

class VariableIsNull(BaseOperation):
def _execute_operation(self):
# Always get the content dataframe. Similar to variable_exists check
dataframe = self.data_service.get_dataset(dataset_name=self.params.dataset_path)
if self.params.target.startswith("define_variable"):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new changes do not include the check for targets starting with define_variable. Is this case now handled by the evaluation dataset branch? Could you please explain briefly?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If a define is included, it will be a define rule type, and those variables will be accessible via the evaluation dataset built by that define rule type.

# Handle checks against define metadata
target_column = self.evaluation_dataset[self.params.target]
result = [
self._is_target_variable_null(dataframe, value)
for value in target_column
]
return self.data_service.dataset_implementation().convert_to_series(result)
if self.params.source == "submission":
if self.params.level == "row":
raise ValueError("level: row may only be used with source: evaluation")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we please add a unit test covering this case to ensure that ValueError is raised? We could also cover other combinations that can occur in real cases.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is done

dataframe = self.data_service.get_dataset(
dataset_name=self.params.dataset_path
)
else:
target_variable = self.params.target
return self._is_target_variable_null(dataframe, target_variable)
dataframe = self.evaluation_dataset

return self._is_target_variable_null(dataframe, self.params.target)

def _is_target_variable_null(self, dataframe, target_variable: str) -> bool:
if target_variable not in dataframe:
Expand Down
1 change: 1 addition & 0 deletions cdisc_rules_engine/utilities/rule_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,6 +393,7 @@ def perform_rule_operations(
original_target=original_target,
regex=operation.get("regex"),
returntype=operation.get("returntype"),
source=operation.get("source"),
standard=standard,
standard_substandard=standard_substandard,
standard_version=standard_version,
Expand Down
1 change: 1 addition & 0 deletions resources/schema/rule/MetaVariables.json
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@
},
{ "const": "variable_format" },
{ "const": "variable_has_empty_values" },
{ "const": "variable_is_empty" },
{ "const": "variable_label" },
{ "const": "variable_name" },
{
Expand Down
4 changes: 4 additions & 0 deletions resources/schema/rule/MetaVariables.md
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,10 @@ Variable format

True/False value indicating whether a variable has any empty values

## variable_is_empty

True/False value indicating whether a variable is completely empty

## variable_label

Variable long label
Expand Down
3 changes: 3 additions & 0 deletions resources/schema/rule/Operations.json
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,9 @@
"type": "string",
"enum": ["code", "value", "pref_term"]
},
"source": {
"type": "string"
},
"term_value": {
"type": "string"
},
Expand Down
15 changes: 10 additions & 5 deletions resources/schema/rule/Operations.md
Original file line number Diff line number Diff line change
Expand Up @@ -1013,7 +1013,7 @@ Operations:

### record_count

If no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables.
If no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. A filter value may end with the wildcard `&`, in which case it matches every value that starts with the preceding prefix (see the 4th example below). If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables.

If both filter and group are provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter that also match each unique set of the grouping variables.

Expand Down Expand Up @@ -1058,7 +1058,7 @@ Example: return the number of records where QNAM starts with "RACE" (matches RAC
- operation: record_count
id: $race_records_in_dataset
filter:
QNAM: "RACE%"
QNAM: "RACE&"
group:
- "USUBJID"
```
Expand Down Expand Up @@ -1291,7 +1291,7 @@ Match Datasets:

### variable_exists

Flag an error if MIDS is in the dataset currently being evaluated and the TM domain is not present in the study
This operation operates only on the original submission datasets, regardless of rule type. Flags an error if a column exists in the submission dataset currently being evaluated.

Rule Type: Domain Presence Check

Expand All @@ -1312,13 +1312,18 @@ Operations:
### variable_is_null

Returns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its values are null or empty.
The operation can work with both direct variable names and define metadata references (variables starting with "define_variable").
The operation supports two sources via the `source` parameter:

- **`submission`**: checks against the raw submission dataset
- **`evaluation`** (default): checks against the evaluation dataset built based on the rule type

```yaml
# Dataset level check - is this variable entirely null/missing from the source data?
Operations:
- operator: variable_is_null
name: USUBJID
id: $aeterm_is_null
id: $usubjid_is_null
source: submission
```

### get_xhtml_errors
Expand Down
4 changes: 3 additions & 1 deletion resources/schema/rule/Rule_Type.md
Original file line number Diff line number Diff line change
Expand Up @@ -555,6 +555,7 @@ Attach define xml metadata at variable level
- `variable_data_type`
- `variable_format`
- `variable_has_empty_values`
- `variable_is_empty`
- `library_variable_name`
- `library_variable_role`
- `library_variable_label`
Expand All @@ -572,6 +573,8 @@ Attach define xml metadata at variable level
- `variable_size`
- `variable_order_number`
- `variable_data_type`
- `variable_has_empty_values`
- `variable_is_empty`
- `define_variable_name`
- `define_variable_label`
- `define_variable_data_type`
Expand All @@ -597,7 +600,6 @@ Attach define xml metadata at variable level
- `library_variable_order_number`
- `library_variable_data_type`
- `library_variable_ccode`
- `variable_has_empty_values`

## JSON Schema Check

Expand Down
Loading
Loading