From a23ab00166ff38b4d71d8004fab029aba4411323 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Wed, 18 Feb 2026 15:42:40 -0500 Subject: [PATCH 1/5] update --- cdisc_rules_engine/models/operation_params.py | 1 + .../operations/base_operation.py | 4 +- .../operations/variable_is_null.py | 32 +++--- .../utilities/rule_processor.py | 1 + resources/schema/rule/Operations.json | 5 +- resources/schema/rule/Operations.md | 28 +++++- .../test_operations/test_variable_is_null.py | 99 ++++++++++++++++--- 7 files changed, 137 insertions(+), 33 deletions(-) diff --git a/cdisc_rules_engine/models/operation_params.py b/cdisc_rules_engine/models/operation_params.py index 0fbbf21a4..02d16cedc 100644 --- a/cdisc_rules_engine/models/operation_params.py +++ b/cdisc_rules_engine/models/operation_params.py @@ -56,6 +56,7 @@ class OperationParams: original_target: str = None regex: str = None returntype: str = None + source: str = None target: str = None value_is_reference: bool = False namespace: str = None diff --git a/cdisc_rules_engine/operations/base_operation.py b/cdisc_rules_engine/operations/base_operation.py index e8913d011..a045a323b 100644 --- a/cdisc_rules_engine/operations/base_operation.py +++ b/cdisc_rules_engine/operations/base_operation.py @@ -173,10 +173,10 @@ def _filter_data(self, data): def _is_wildcard_pattern(self, value: str) -> bool: if not isinstance(value, str): return False - return value.endswith("%") + return value.endswith("&") def _apply_wildcard_filter(self, series: pd.Series, pattern: str) -> pd.Series: - prefix = pattern.rstrip("%") + prefix = pattern.rstrip("&") result = series.str.startswith(prefix, na=False) return result diff --git a/cdisc_rules_engine/operations/variable_is_null.py b/cdisc_rules_engine/operations/variable_is_null.py index 18758ab38..f548ed953 100644 --- a/cdisc_rules_engine/operations/variable_is_null.py +++ b/cdisc_rules_engine/operations/variable_is_null.py @@ -3,22 +3,30 @@ class VariableIsNull(BaseOperation): def 
_execute_operation(self): - # Always get the content dataframe. Similar to variable_exists check - dataframe = self.data_service.get_dataset(dataset_name=self.params.dataset_path) - if self.params.target.startswith("define_variable"): - # Handle checks against define metadata - target_column = self.evaluation_dataset[self.params.target] - result = [ - self._is_target_variable_null(dataframe, value) - for value in target_column - ] - return self.data_service.dataset_implementation().convert_to_series(result) + if self.params.source == "submission": + if self.params.level == "row": + raise ValueError("level: row may only be used with source: evaluation") + dataframe = self.data_service.get_dataset( + dataset_name=self.params.dataset_path + ) else: - target_variable = self.params.target - return self._is_target_variable_null(dataframe, target_variable) + dataframe = self.evaluation_dataset + + if self.params.level == "row": + return self._is_variable_null_by_row(dataframe, self.params.target) + else: + return self._is_target_variable_null(dataframe, self.params.target) def _is_target_variable_null(self, dataframe, target_variable: str) -> bool: if target_variable not in dataframe: return True series = dataframe[target_variable] return (series.isnull() | (series == "")).all() + + def _is_variable_null_by_row(self, dataframe, target_variable: str): + if target_variable not in dataframe: + return self.data_service.dataset_implementation().convert_to_series( + [True] * len(dataframe) + ) + series = dataframe[target_variable] + return series.isnull() | (series == "") diff --git a/cdisc_rules_engine/utilities/rule_processor.py b/cdisc_rules_engine/utilities/rule_processor.py index c482087a3..85e128a61 100644 --- a/cdisc_rules_engine/utilities/rule_processor.py +++ b/cdisc_rules_engine/utilities/rule_processor.py @@ -393,6 +393,7 @@ def perform_rule_operations( original_target=original_target, regex=operation.get("regex"), returntype=operation.get("returntype"), + 
source=operation.get("source"), standard=standard, standard_substandard=standard_substandard, standard_version=standard_version, diff --git a/resources/schema/rule/Operations.json b/resources/schema/rule/Operations.json index 6f87ec119..e7f222c39 100644 --- a/resources/schema/rule/Operations.json +++ b/resources/schema/rule/Operations.json @@ -380,7 +380,7 @@ "properties": { "operator": { "const": "variable_is_null" } }, - "required": ["id", "operator"], + "required": ["id", "operator", "source", "level"], "type": "object" }, { @@ -557,6 +557,9 @@ "type": "string", "enum": ["code", "value", "pref_term"] }, + "source": { + "type": "string" + }, "term_value": { "type": "string" }, diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md index da0aecbad..9cceae356 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -1013,7 +1013,7 @@ Operations: ### record_count -If no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables. +If no filter or group is provided, returns the number of records in the dataset. If filter is provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter. Filter can have a wildcard `&` that when added to the end of the filter value will look for all instances of that prefix (see 4th example below). If group is provided, returns the number of rows matching each unique set of the grouping variables. These can be static column name(s) or can be derived from other operations like get_dataset_filtered_variables. 
If both filter and group are provided, returns the number of records in the dataset that contain the value(s) in the corresponding column(s) provided in the filter that also match each unique set of the grouping variables. @@ -1058,7 +1058,7 @@ Example: return the number of records where QNAM starts with "RACE" (matches RAC - operation: record_count id: $race_records_in_dataset filter: - QNAM: "RACE%" + QNAM: "RACE&" group: - "USUBJID" ``` @@ -1291,7 +1291,7 @@ Match Datasets: ### variable_exists -Flag an error if MIDS is in the dataset currently being evaluated and the TM domain is not present in the study +Operation operates only on original submission datasets regardless of rule type. Flags an error if a column exists is in the submission dataset currently being evaluated. Rule Type: Domain Presence Check @@ -1312,13 +1312,31 @@ Operations: ### variable_is_null Returns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its values are null or empty. -The operation can work with both direct variable names and define metadata references (variables starting with "define_variable"). +The operation supports two sources via the `source` parameter: + +- **`submission`** : checks against the raw submission dataset +- **`evaluation`** (default): checks against the evaluation dataset built based on the rule type + The operation supports two levels via the `level` parameter: +- **`dataset`** (default): returns a single boolean broadcast to all rows — `true` if the variable is missing or all values are all null or empty string +- **`row`**: returns a boolean Series — `true` for each row where the value in the target variable is null or empty string. May only be used on the evaluation dataset. ```yaml + +# Dataset level check - is this variable entirely null/missing from the source data? 
+Operations: + - operator: variable_is_null + name: USUBJID + id: $usubjid_is_null + source: submission + level: dataset + +# Row level check - is USUBJID blank on this row? Operations: - operator: variable_is_null name: USUBJID - id: $aeterm_is_null + id: $usubjid_is_null + source: evaluation + level: row ``` ### get_xhtml_errors diff --git a/tests/unit/test_operations/test_variable_is_null.py b/tests/unit/test_operations/test_variable_is_null.py index f2fe30cf6..960938ca6 100644 --- a/tests/unit/test_operations/test_variable_is_null.py +++ b/tests/unit/test_operations/test_variable_is_null.py @@ -71,7 +71,7 @@ ), ], ) -def test_variable_is_null( +def test_variable_is_null_submission( data, target_var, expected, mock_data_service, operation_params: OperationParams ): config = ConfigService() @@ -79,6 +79,8 @@ def test_variable_is_null( operation_params.dataframe = data operation_params.target = target_var operation_params.domain = "AE" + operation_params.source = "submission" + operation_params.level = "dataset" mock_data_service.get_dataset.return_value = data mock_data_service.dataset_implementation = data.__class__ result = VariableIsNull(operation_params, data, cache, mock_data_service).execute() @@ -87,22 +89,93 @@ def test_variable_is_null( assert val == expected -def test_define_crosscheck_variable_is_null(mock_data_service, operation_params): - define_metadata = PandasDataset.from_dict( - { - "define_variable_name": ["AEHLT", "AETERM"], - "define_variable_has_no_data": ["Yes", "No"], - } +def test_variable_is_null_evaluation_dataset_level(mock_data_service, operation_params): + evaluation_dataset = PandasDataset.from_dict({"VAR1": [None, None], "VAR2": [1, 2]}) + config = ConfigService() + cache = CacheServiceFactory(config).get_cache_service() + operation_params.dataframe = evaluation_dataset + operation_params.target = "VAR1" + operation_params.source = "evaluation" + operation_params.level = "dataset" + mock_data_service.dataset_implementation = 
PandasDataset + result = VariableIsNull( + operation_params, evaluation_dataset, cache, mock_data_service + ).execute() + assert operation_params.operation_id in result + for val in result[operation_params.operation_id]: + assert val is True + + +def test_variable_is_null_evaluation_row_level(mock_data_service, operation_params): + evaluation_dataset = PandasDataset.from_dict( + {"VAR1": [None, "A", None, "B"], "VAR2": [1, 2, 3, 4]} + ) + config = ConfigService() + cache = CacheServiceFactory(config).get_cache_service() + operation_params.dataframe = evaluation_dataset + operation_params.target = "VAR1" + operation_params.source = "evaluation" + operation_params.level = "row" + mock_data_service.dataset_implementation = PandasDataset + result = VariableIsNull( + operation_params, evaluation_dataset, cache, mock_data_service + ).execute() + assert operation_params.operation_id in result + assert result[operation_params.operation_id].to_list() == [True, False, True, False] + + +def test_variable_is_null_row_level_empty_string(mock_data_service, operation_params): + """Test row-level check treats empty string as null.""" + evaluation_dataset = PandasDataset.from_dict( + {"VAR1": ["", "A", None, "B"], "VAR2": [1, 2, 3, 4]} ) - dataset = PandasDataset.from_dict({"AEHLT": [None, None], "AETERM": [1, 2]}) config = ConfigService() cache = CacheServiceFactory(config).get_cache_service() - operation_params.dataframe = define_metadata - operation_params.target = "define_variable_name" - mock_data_service.get_dataset.return_value = dataset + operation_params.dataframe = evaluation_dataset + operation_params.target = "VAR1" + operation_params.source = "evaluation" + operation_params.level = "row" mock_data_service.dataset_implementation = PandasDataset result = VariableIsNull( - operation_params, PandasDataset(define_metadata.data), cache, mock_data_service + operation_params, evaluation_dataset, cache, mock_data_service ).execute() assert operation_params.operation_id in 
result - assert result[operation_params.operation_id].to_list() == [True, False] + assert result[operation_params.operation_id].to_list() == [True, False, True, False] + + +def test_variable_is_null_row_level_missing_variable( + mock_data_service, operation_params +): + """Test row-level check returns all True when variable is missing from dataset.""" + evaluation_dataset = PandasDataset.from_dict({"VAR2": ["A", "B", "C"]}) + config = ConfigService() + cache = CacheServiceFactory(config).get_cache_service() + operation_params.dataframe = evaluation_dataset + operation_params.target = "VAR1" + operation_params.source = "evaluation" + operation_params.level = "row" + mock_data_service.dataset_implementation = PandasDataset + result = VariableIsNull( + operation_params, evaluation_dataset, cache, mock_data_service + ).execute() + assert operation_params.operation_id in result + assert result[operation_params.operation_id].to_list() == [True, True, True] + + +def test_variable_is_null_row_level_raises_for_submission( + mock_data_service, operation_params +): + """Test that level=row raises an error when source=submission.""" + data = PandasDataset.from_dict({"VAR1": ["A", "B", "C"]}) + config = ConfigService() + cache = CacheServiceFactory(config).get_cache_service() + operation_params.dataframe = data + operation_params.target = "VAR1" + operation_params.source = "submission" + operation_params.level = "row" + mock_data_service.get_dataset.return_value = data + mock_data_service.dataset_implementation = PandasDataset + with pytest.raises( + ValueError, match="level: row may only be used with source: evaluation" + ): + VariableIsNull(operation_params, data, cache, mock_data_service).execute() From 868988e0df306b88141c1b22949517becdb46f28 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Wed, 18 Feb 2026 18:14:41 -0500 Subject: [PATCH 2/5] operator --- ...with_define_and_library_dataset_builder.py | 29 +- ...ariables_metadata_with_library_metadata.py | 17 +- 
.../operations/variable_is_null.py | 13 +- resources/schema/rule/MetaVariables.json | 1 + resources/schema/rule/MetaVariables.md | 4 + resources/schema/rule/Operations.json | 2 +- resources/schema/rule/Operations.md | 15 +- resources/schema/rule/Rule_Type.md | 4 +- resources/schema/rule/check_parameter.md | 293 ++++++++++++++++++ ...with_define_and_library_dataset_builder.py | 27 +- ...a_with_library_metadata_dataset_builder.py | 9 + .../test_operations/test_variable_is_null.py | 77 ----- 12 files changed, 341 insertions(+), 150 deletions(-) create mode 100644 resources/schema/rule/check_parameter.md diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py index 0faf1ad84..6151530a1 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_define_and_library_dataset_builder.py @@ -13,6 +13,8 @@ def build(self): variable_label variable_size variable_data_type + variable_is_empty + variable_has_empty_values define_variable_name, define_variable_label, define_variable_data_type, @@ -29,7 +31,6 @@ def build(self): define_variable_codelist_coded_values, define_variable_codelist_coded_codes, define_variable_mandatory, - variable_has_empty_values library_variable_name, library_variable_label, library_variable_data_type, @@ -82,24 +83,22 @@ def build(self): right_on="library_variable_name", ).fillna("") - final_dataframe["variable_has_empty_values"] = final_dataframe.apply( - lambda row: self.variable_has_null_values( - ( - row["variable_name"] - if row["variable_name"] != "" - else row["library_variable_name"] + final_dataframe[["variable_has_empty_values", "variable_is_empty"]] = ( + final_dataframe.apply( + lambda row: self.get_variable_null_stats( + row["variable_name"], dataset_contents ), - 
dataset_contents, - ), - axis=1, + axis=1, + result_type="expand", + ) ) return final_dataframe - def variable_has_null_values( + def get_variable_null_stats( self, variable: str, content: DatasetInterface - ) -> bool: + ) -> tuple[bool, bool]: if variable not in content: - return True - series = content[variable] - return series.mask(series == "").isnull().any() + return True, True + series = content[variable].mask(content[variable] == "") + return series.isnull().any(), series.isnull().all() diff --git a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py index 9c224b551..1d99c9e99 100644 --- a/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py +++ b/cdisc_rules_engine/dataset_builders/variables_metadata_with_library_metadata.py @@ -13,6 +13,7 @@ def build(self): variable_size variable_data_type variable_has_empty_values + variable_is_empty library_variable_name, library_variable_label, library_variable_data_type, @@ -57,18 +58,20 @@ def build(self): right_on="library_variable_name", ).fillna("") - data["variable_has_empty_values"] = data.apply( - lambda row: self.variable_has_null_values( + data[["variable_has_empty_values", "variable_is_empty"]] = data.apply( + lambda row: self.get_variable_null_stats( row["variable_name"], dataset_contents ), axis=1, + result_type="expand", ) + return data - def variable_has_null_values( + def get_variable_null_stats( self, variable: str, content: DatasetInterface - ) -> bool: + ) -> tuple[bool, bool]: if variable not in content: - return True - series = content[variable] - return series.mask(series == "").isnull().any() + return True, True + series = content[variable].mask(content[variable] == "") + return series.isnull().any(), series.isnull().all() diff --git a/cdisc_rules_engine/operations/variable_is_null.py b/cdisc_rules_engine/operations/variable_is_null.py index f548ed953..7e14bfcd4 
100644 --- a/cdisc_rules_engine/operations/variable_is_null.py +++ b/cdisc_rules_engine/operations/variable_is_null.py @@ -12,21 +12,10 @@ def _execute_operation(self): else: dataframe = self.evaluation_dataset - if self.params.level == "row": - return self._is_variable_null_by_row(dataframe, self.params.target) - else: - return self._is_target_variable_null(dataframe, self.params.target) + return self._is_target_variable_null(dataframe, self.params.target) def _is_target_variable_null(self, dataframe, target_variable: str) -> bool: if target_variable not in dataframe: return True series = dataframe[target_variable] return (series.isnull() | (series == "")).all() - - def _is_variable_null_by_row(self, dataframe, target_variable: str): - if target_variable not in dataframe: - return self.data_service.dataset_implementation().convert_to_series( - [True] * len(dataframe) - ) - series = dataframe[target_variable] - return series.isnull() | (series == "") diff --git a/resources/schema/rule/MetaVariables.json b/resources/schema/rule/MetaVariables.json index 2e3a690a7..3ac38aee2 100644 --- a/resources/schema/rule/MetaVariables.json +++ b/resources/schema/rule/MetaVariables.json @@ -146,6 +146,7 @@ }, { "const": "variable_format" }, { "const": "variable_has_empty_values" }, + { "const": "variable_is_empty" }, { "const": "variable_label" }, { "const": "variable_name" }, { diff --git a/resources/schema/rule/MetaVariables.md b/resources/schema/rule/MetaVariables.md index 91945918d..e15193cfa 100644 --- a/resources/schema/rule/MetaVariables.md +++ b/resources/schema/rule/MetaVariables.md @@ -238,6 +238,10 @@ Variable format True/False value indicating whether a variable has any empty values +## variable_is_empty + +True/False value indicating whether a variable is completely empty + ## variable_label Variable long label diff --git a/resources/schema/rule/Operations.json b/resources/schema/rule/Operations.json index e7f222c39..e86abf3cc 100644 --- 
a/resources/schema/rule/Operations.json +++ b/resources/schema/rule/Operations.json @@ -380,7 +380,7 @@ "properties": { "operator": { "const": "variable_is_null" } }, - "required": ["id", "operator", "source", "level"], + "required": ["id", "operator", "source"], "type": "object" }, { diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md index 9cceae356..693a58a44 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -1311,32 +1311,19 @@ Operations: ### variable_is_null -Returns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its values are null or empty. +Returns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its vaes are null or empty. The operation supports two sources via the `source` parameter: - **`submission`** : checks against the raw submission dataset - **`evaluation`** (default): checks against the evaluation dataset built based on the rule type - The operation supports two levels via the `level` parameter: -- **`dataset`** (default): returns a single boolean broadcast to all rows — `true` if the variable is missing or all values are all null or empty string -- **`row`**: returns a boolean Series — `true` for each row where the value in the target variable is null or empty string. May only be used on the evaluation dataset. ```yaml - # Dataset level check - is this variable entirely null/missing from the source data? Operations: - operator: variable_is_null name: USUBJID id: $usubjid_is_null source: submission - level: dataset - -# Row level check - is USUBJID blank on this row? 
-Operations: - - operator: variable_is_null - name: USUBJID - id: $usubjid_is_null - source: evaluation - level: row ``` ### get_xhtml_errors diff --git a/resources/schema/rule/Rule_Type.md b/resources/schema/rule/Rule_Type.md index bbf184d8f..4dd4d5034 100644 --- a/resources/schema/rule/Rule_Type.md +++ b/resources/schema/rule/Rule_Type.md @@ -555,6 +555,7 @@ Attach define xml metadata at variable level - `variable_data_type` - `variable_format` - `variable_has_empty_values` +- `variable_is_empty` - `library_variable_name` - `library_variable_role` - `library_variable_label` @@ -572,6 +573,8 @@ Attach define xml metadata at variable level - `variable_size` - `variable_order_number` - `variable_data_type` +- `variable_has_empty_values` +- `variable_is_empty` - `define_variable_name` - `define_variable_label` - `define_variable_data_type` @@ -597,7 +600,6 @@ Attach define xml metadata at variable level - `library_variable_order_number` - `library_variable_data_type` - `library_variable_ccode` -- `variable_has_empty_values` ## JSON Schema Check diff --git a/resources/schema/rule/check_parameter.md b/resources/schema/rule/check_parameter.md new file mode 100644 index 000000000..28e883ed6 --- /dev/null +++ b/resources/schema/rule/check_parameter.md @@ -0,0 +1,293 @@ +# Check Parameters + +## Overview + +Check parameters are configuration elements that define how validation rules are applied within the CDISC rules engine. These parameters control the behavior, scope, and criteria for data validation checks across clinical trial datasets. Each parameter serves a specific purpose in customizing rule logic to ensure data integrity and compliance with CDISC standards. + +The rules engine uses these parameters to construct validation logic that can be applied to various datasets to identify data inconsistencies, missing values, invalid formats, and other quality issues. 
+ +## Parameter Definitions + +### comparator + +Specifies the column/variable name that provides the comparison values for validation rules. In the rules engine implementation, this parameter is processed through `replace_prefix()` and used to extract values for comparison operations. For example, in the `has_next_corresponding_record` operator, the comparator column's next row value is compared against the target column's current row value. + +```yaml +- name: "VSTESTCD" + operator: "has_next_corresponding_record" + target: "VSTESTCD" + comparator: "VSTESTCD" # Column providing comparison values + within: "USUBJID" + ordering: "VISITNUM" +``` + +### context + +### date_component + +Specifies which component of a date/datetime variable should be validated. Available options include: + +- year +- month +- day +- hour +- minute +- second +- microsecond + +### name + +Identifies the specific variable or field name that the validation rule targets. This parameter links the rule to the appropriate data element within the dataset. + +### negative + +Boolean parameter used with the `invalid_duration` operator to specify whether negative durations should be considered valid (True) or invalid (False). + +```yaml +- name: "BRTHDTC" + operator: "invalid_duration" + negative: False +``` + +In this example, the rule will flag any negative durations in the DURVAR variable as invalid. If `negative` were set to `true`, negative durations would be considered valid and not raise issues. + +### operator + +Defines the specific validation operation to be performed. This parameter determines the type of check the engine will execute (e.g., equal_to, not_null, within_range, invalid_duration). + +### order + +Specifies the sort order for validation results or data processing. Available options: + +- asc (ascending) +- dsc (descending) + +This parameter helps organize validation outputs and can influence how rules are applied to ordered data. 
+ +### ordering + +Specifies the column name used to sort data before applying validation rules. In the rules engine, this parameter is processed through `replace_prefix()` and used in `sort_values()` operations to establish the correct sequence for row-by-row comparisons. Critical for operators that depend on data order, such as `has_next_corresponding_record`. + +Example usage: + +```yaml +ordering: "VISITNUM" # Sort by visit number to ensure proper sequence +``` + +### separator + +Specifies the delimiter character(s) used to split string values into parts for comparison. Used by operators that validate paired data formats to ensure both parts have equal precision or completeness. Default value is "/" (forward slash) if not specified. + +Example usage: + +```yaml +- name: "--DTC" + operator: "split_parts_have_equal_length" + separator: "/" # Split by forward slash for date intervals +``` + +Used with operators: + +- `split_parts_have_equal_length` - Validates that both parts of a split string have equal length +- `split_parts_have_unequal_length` - Validates that parts have different lengths (complement) + +### prefix + +Specifies a string prefix that should be present at the beginning of a variable's value. Used for format validation and standardization checks. + +### suffix + +Specifies a string suffix that should be present at the end of a variable's value. Complements prefix validation for comprehensive format checking. + +### regex + +Specifies a regular expression pattern used to extract portions of string values before performing validation operations. + +### target + +Specifies the primary column/variable name that the validation rule evaluates. In the rules engine implementation, this parameter is processed through `replace_prefix()` and represents the column whose values are being validated. The target column typically contains the data being checked for compliance or consistency. 
The results of validation rules are typically reported for the target variable. + +### value + +Contains the reference value or criteria against which the validation check is performed. The interpretation of this parameter depends on the `value_is_literal` setting. + +### value_is_literal + +Boolean parameter that signifies whether the string in the value key should be treated as a literal string. When value_is_literal is false or not specified, the string in the value key will be interpreted as a variable name in the dataset. + +> IDVAR = "VISIT" as a value, not IDVAR = VISIT as a variable in the dataset + +```yaml +- "name": "IDVAR" + "operator": "equal_to" + "value": "VISIT" + "value_is_literal": true +``` + +# Operation Parameters + +The rules engine uses operation parameters defined in the JSON schema to configure how operations execute. These parameters are specified by users in YAML operation definitions and work in conjunction with check parameters to provide comprehensive validation capabilities. + +### attribute_name + +Specifies the metadata attribute name to extract from variable definitions. Used in operations like `define_variable_metadata` to retrieve specific metadata properties such as variable labels, core status, or other attributes. + +### case_sensitive + +Boolean flag that controls whether string comparisons in operations should be case-sensitive. Default is `True`. If not explicitly specified, string comparisons will be case-sensitive. Used in external dictionary validation operations to allow flexible matching. + +Examples: + +Default behavior (case-sensitive): + +```yaml +- operator: valid_external_dictionary_value + # case_sensitive is not specified, so it defaults to True + case_sensitive: false # Enable case-insensitive matching +``` + +### codelist + +Specifies the name of a controlled terminology codelist for validation operations. Used with codelist-related operations to determine valid values. 
+ +### codelist_code + +Contains the specific code value within a codelist. Used in operations that need to reference particular codelist entries. + +### codelists + +List of multiple codelist names for operations that work with multiple controlled terminology codelists simultaneously. + +### ct_attribute + +Specifies the controlled terminology attribute to retrieve, such as "Term CCODE" or other CT-specific attributes. + +### ct_package_type + +Single CT package type identifier. Valid values include: "ADAM", "CDASH", "COA", "DDF", "DEFINE-XML", "GLOSSARY", "MRCT", "PROTOCOL", "QRS", "QS-FT", "SDTM", "SEND", "TMF". + +### ct_package_types + +List of CT package types to include in operations. References the `ct_package_type` enumeration. + +### ct_packages + +List of multiple controlled terminology packages. Used when operations need to work across multiple CT packages. + +### ct_version + +Specifies the version of controlled terminology to use in validation operations. + +### dictionary_term_type + +Classification of external dictionary terms. Valid values include: "LLT", "PT", "HLT", "HLGT", "SOC". Used in MedDRA and other external dictionary operations. + +### domain + +Specifies the domain or data structure context for the operation. References Dataset or DataStructure definitions. + +### external_dictionary_type + +Type of external dictionary to use. Currently supports "meddra" for MedDRA validation operations. + +### filter + +Dictionary containing filter criteria for conditional validation. When specified, operations work only on data subsets that meet the filter criteria. + +```yaml +filter: + AESEV: "SEVERE" # Only process severe adverse events + AEOUT: "FATAL" # Only process fatal outcomes +``` + +### filter_key + +String parameter used for filtering operations. Works with filter-related functionality. + +### filter_value + +String parameter that specifies the value to filter by. Used in conjunction with filter operations. 
+ +### group + +List of variable names used for grouping data before applying operations. Used to ensure operations are applied within appropriate data boundaries. + +### group_aliases + +Alternative names for grouping variables that allow grouped results to be merged back to datasets using different column names. + +```yaml +group: + - USUBJID + - IDVARVAL +group_aliases: + - USUBJID + - GROUPID # Rename IDVARVAL to GROUPID for merging +``` + +### id + +String identifier for the operation result. Used to reference operation outputs in subsequent rule logic. + +### key_name + +Specifies the metadata attribute name for filtering operations. Valid values include: "definition", "examples", "label", "name", "notes", "ordinal", "role", "simpleDatatype", "variableCcode". + +### key_value + +Works with `key_name` to specify the metadata value to filter by. Returns variables that match the specified key-value pair. + +```yaml +- operator: get_model_filtered_variables + key_name: "role" + key_value: "Timing" # Find variables with role = "Timing" +``` + +### level + +Specifies the level of data to retrieve from controlled terminology operations. Valid values are "codelist" or "term" to determine whether to return codelist-level or term-level data. + +### map + +List of mapping dictionaries for value transformations. Each dictionary contains input column names as properties and an `output` property specifying the result value. + +### name + +Variable name that the operation targets. Used to specify which column or variable the operation should process. + +### operator + +String that specifies the operation type to execute. Must match one of the defined operation types in the schema. + +### returntype + +Expected return type for operation results. Valid values are "code" (for NCI codes) or "value" (for submission values) in controlled terminology operations. + +### source + +Either "submission" or "evaluation" for which dataset to check the variable_is_null from. 
Evaluation is the dataset constructed by
+the rule type, while submission is the raw dataset submitted that is being evaluated.
+
+### term_code
+
+Terminology code value used in controlled terminology operations for code-based lookups.
+
+### term_value
+
+Terminology term value used in controlled terminology operations for value-based lookups.
+
+### version
+
+Version parameter used in codelist operations that require version-specific processing or validation.
+
+### within
+
+Specifies the column name used for grouping data before applying validation rules. In the rules engine implementation, this parameter is processed through `replace_prefix()` and used in `groupby()` operations to ensure validation logic is applied within appropriate data boundaries (e.g., by subject, by study).
+
+Example usage:
+
+```yaml
+within: "USUBJID" # Group by subject ID to keep validations within subject data
+```
+
+This parameter is essential for maintaining data integrity boundaries and preventing inappropriate cross-subject or cross-study comparisons.
diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py index 0810892c4..4f6244e02 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_define_and_library_dataset_builder.py @@ -10,7 +10,6 @@ from cdisc_rules_engine.services.data_services import LocalDataService from pathlib import Path import pandas as pd -import numpy as np from cdisc_rules_engine.dataset_builders.variables_metadata_with_define_and_library_dataset_builder import ( VariablesMetadataWithDefineAndLibraryDatasetBuilder, ) @@ -184,6 +183,7 @@ def test_build_combined_metadata( "library_variable_ccode", "library_variable_order_number", "variable_has_empty_values", + "variable_is_empty", } assert set(result.columns.tolist()) == expected_columns @@ -201,28 +201,7 @@ def test_build_combined_metadata( assert studyid_row["define_variable_role"] == "Identifier" assert studyid_row["library_variable_core"] == "Req" assert not studyid_row["variable_has_empty_values"] - - mandatory_vars = result[result["define_variable_mandatory"] == "Yes"] - assert not mandatory_vars.empty - assert all( - mandatory_vars["define_variable_name"].isin(["STUDYID", "USUBJID", "AETERM"]) - ) - - empty_value_vars = result[result["variable_has_empty_values"]] - assert not empty_value_vars.empty - assert "USUBJID" in empty_value_vars["variable_name"].values - assert "AETERM" in empty_value_vars["variable_name"].values - - non_empty_vars = ["STUDYID"] - for _, row in result.iterrows(): - if row["variable_name"] in non_empty_vars: - assert row["variable_has_empty_values"] is False - else: - assert row["variable_has_empty_values"] is True - - assert not result["variable_name"].isin([np.nan]).any() - assert not 
result["define_variable_name"].isin([np.nan]).any() - assert not result["library_variable_name"].isin([np.nan]).any() + assert not studyid_row["variable_is_empty"] usubjid_row = result[result["variable_name"] == "USUBJID"].iloc[0] assert usubjid_row["variable_size"] == 16.0 @@ -231,6 +210,7 @@ def test_build_combined_metadata( assert usubjid_row["define_variable_role"] == "Identifier" assert usubjid_row["library_variable_core"] == "Req" assert usubjid_row["variable_has_empty_values"] + assert not usubjid_row["variable_is_empty"] aeterm_row = result[result["variable_name"] == "AETERM"].iloc[0] assert aeterm_row["variable_size"] == 200.0 @@ -239,6 +219,7 @@ def test_build_combined_metadata( assert aeterm_row["define_variable_role"] == "Topic" assert aeterm_row["library_variable_core"] == "Req" assert aeterm_row["variable_has_empty_values"] + assert not aeterm_row["variable_is_empty"] assert len(result) == 3 diff --git a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py index 424161286..4ca4fee95 100644 --- a/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py +++ b/tests/unit/test_dataset_builders/test_variables_metadata_with_library_metadata_dataset_builder.py @@ -166,6 +166,7 @@ def test_variable_metadata_with_library_metadata_dataset_builder( "library_variable_ccode", "library_variable_order_number", "variable_has_empty_values", + "variable_is_empty", ] assert result["library_variable_name"].tolist() == [ "STUDYID", @@ -175,6 +176,7 @@ def test_variable_metadata_with_library_metadata_dataset_builder( ] assert result["variable_name"].tolist() == ["STUDYID", "USUBJID", "AETERM", "AESEQ"] assert result["variable_has_empty_values"].tolist() == [False, True, True, False] + assert result["variable_is_empty"].tolist() == [False, False, False, False] @patch( @@ -387,6 +389,7 @@ def 
test_variable_metadata_with_library_metadata_dataset_builder_variable_only_i "library_variable_data_type", "library_variable_ccode", "variable_has_empty_values", + "variable_is_empty", ] ) assert result["library_variable_name"].tolist() == [ @@ -407,3 +410,9 @@ def test_variable_metadata_with_library_metadata_dataset_builder_variable_only_i True, False, ] + assert result["variable_is_empty"].tolist() == [ + False, + False, + False, + False, + ] diff --git a/tests/unit/test_operations/test_variable_is_null.py b/tests/unit/test_operations/test_variable_is_null.py index 960938ca6..894ba8548 100644 --- a/tests/unit/test_operations/test_variable_is_null.py +++ b/tests/unit/test_operations/test_variable_is_null.py @@ -80,7 +80,6 @@ def test_variable_is_null_submission( operation_params.target = target_var operation_params.domain = "AE" operation_params.source = "submission" - operation_params.level = "dataset" mock_data_service.get_dataset.return_value = data mock_data_service.dataset_implementation = data.__class__ result = VariableIsNull(operation_params, data, cache, mock_data_service).execute() @@ -96,7 +95,6 @@ def test_variable_is_null_evaluation_dataset_level(mock_data_service, operation_ operation_params.dataframe = evaluation_dataset operation_params.target = "VAR1" operation_params.source = "evaluation" - operation_params.level = "dataset" mock_data_service.dataset_implementation = PandasDataset result = VariableIsNull( operation_params, evaluation_dataset, cache, mock_data_service @@ -104,78 +102,3 @@ def test_variable_is_null_evaluation_dataset_level(mock_data_service, operation_ assert operation_params.operation_id in result for val in result[operation_params.operation_id]: assert val is True - - -def test_variable_is_null_evaluation_row_level(mock_data_service, operation_params): - evaluation_dataset = PandasDataset.from_dict( - {"VAR1": [None, "A", None, "B"], "VAR2": [1, 2, 3, 4]} - ) - config = ConfigService() - cache = 
CacheServiceFactory(config).get_cache_service() - operation_params.dataframe = evaluation_dataset - operation_params.target = "VAR1" - operation_params.source = "evaluation" - operation_params.level = "row" - mock_data_service.dataset_implementation = PandasDataset - result = VariableIsNull( - operation_params, evaluation_dataset, cache, mock_data_service - ).execute() - assert operation_params.operation_id in result - assert result[operation_params.operation_id].to_list() == [True, False, True, False] - - -def test_variable_is_null_row_level_empty_string(mock_data_service, operation_params): - """Test row-level check treats empty string as null.""" - evaluation_dataset = PandasDataset.from_dict( - {"VAR1": ["", "A", None, "B"], "VAR2": [1, 2, 3, 4]} - ) - config = ConfigService() - cache = CacheServiceFactory(config).get_cache_service() - operation_params.dataframe = evaluation_dataset - operation_params.target = "VAR1" - operation_params.source = "evaluation" - operation_params.level = "row" - mock_data_service.dataset_implementation = PandasDataset - result = VariableIsNull( - operation_params, evaluation_dataset, cache, mock_data_service - ).execute() - assert operation_params.operation_id in result - assert result[operation_params.operation_id].to_list() == [True, False, True, False] - - -def test_variable_is_null_row_level_missing_variable( - mock_data_service, operation_params -): - """Test row-level check returns all True when variable is missing from dataset.""" - evaluation_dataset = PandasDataset.from_dict({"VAR2": ["A", "B", "C"]}) - config = ConfigService() - cache = CacheServiceFactory(config).get_cache_service() - operation_params.dataframe = evaluation_dataset - operation_params.target = "VAR1" - operation_params.source = "evaluation" - operation_params.level = "row" - mock_data_service.dataset_implementation = PandasDataset - result = VariableIsNull( - operation_params, evaluation_dataset, cache, mock_data_service - ).execute() - assert 
operation_params.operation_id in result - assert result[operation_params.operation_id].to_list() == [True, True, True] - - -def test_variable_is_null_row_level_raises_for_submission( - mock_data_service, operation_params -): - """Test that level=row raises an error when source=submission.""" - data = PandasDataset.from_dict({"VAR1": ["A", "B", "C"]}) - config = ConfigService() - cache = CacheServiceFactory(config).get_cache_service() - operation_params.dataframe = data - operation_params.target = "VAR1" - operation_params.source = "submission" - operation_params.level = "row" - mock_data_service.get_dataset.return_value = data - mock_data_service.dataset_implementation = PandasDataset - with pytest.raises( - ValueError, match="level: row may only be used with source: evaluation" - ): - VariableIsNull(operation_params, data, cache, mock_data_service).execute() From c19cc23ee7164c0ac8d78d9cbf4f238e98d82e68 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Wed, 18 Feb 2026 18:21:51 -0500 Subject: [PATCH 3/5] wildcard tests --- tests/unit/test_operations/test_record_count.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/unit/test_operations/test_record_count.py b/tests/unit/test_operations/test_record_count.py index 0abcf566b..278bad5d8 100644 --- a/tests/unit/test_operations/test_record_count.py +++ b/tests/unit/test_operations/test_record_count.py @@ -476,7 +476,7 @@ def test_operation_result_grouping_record_count(operation_params: OperationParam "operation_id": [3, 3, 3, 3, 3], } ), - {"QNAM": "RACE%"}, + {"QNAM": "RACE&"}, ), ( PandasDataset.from_dict( @@ -496,7 +496,7 @@ def test_operation_result_grouping_record_count(operation_params: OperationParam "operation_id": [0, 0, 0], } ), - {"QNAM": "VITAL%"}, + {"QNAM": "VITAL&"}, ), ( PandasDataset.from_dict( @@ -568,7 +568,7 @@ def test_wildcard_filtered_record_count( "operation_id": [2, 2, 2, 2, 2, 2], } ), - {"QNAM": "RACE%"}, + {"QNAM": "RACE&"}, ["USUBJID"], ), ( @@ -589,7 
+589,7 @@ def test_wildcard_filtered_record_count( "operation_id": [2, 2, 1, 0], } ), - {"QNAM": "RACE%"}, + {"QNAM": "RACE&"}, ["USUBJID"], ), ], From 203053b538dd282758ca0a8f53cc1220aa2b8eee Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Thu, 19 Feb 2026 16:49:37 -0500 Subject: [PATCH 4/5] changes --- resources/schema/rule/Operations.json | 2 +- resources/schema/rule/Operations.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/resources/schema/rule/Operations.json b/resources/schema/rule/Operations.json index e86abf3cc..5ec2f1f10 100644 --- a/resources/schema/rule/Operations.json +++ b/resources/schema/rule/Operations.json @@ -380,7 +380,7 @@ "properties": { "operator": { "const": "variable_is_null" } }, - "required": ["id", "operator", "source"], + "required": ["id", "operator"], "type": "object" }, { diff --git a/resources/schema/rule/Operations.md b/resources/schema/rule/Operations.md index 693a58a44..9eb3e89e2 100644 --- a/resources/schema/rule/Operations.md +++ b/resources/schema/rule/Operations.md @@ -1311,7 +1311,7 @@ Operations: ### variable_is_null -Returns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its vaes are null or empty. +Returns true if a variable is missing from the dataset or if all values within the variable are null or empty string. This operation first checks if the target variable exists in the dataset, and if it does exist, evaluates whether all its values are null or empty. 
The operation supports two sources via the `source` parameter: - **`submission`** : checks against the raw submission dataset From 2652643cfda4905b27a4afa788e8ad4f12e244e4 Mon Sep 17 00:00:00 2001 From: Samuel Johnson Date: Mon, 23 Feb 2026 07:57:38 -0500 Subject: [PATCH 5/5] value error test case --- .../test_operations/test_variable_is_null.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/unit/test_operations/test_variable_is_null.py b/tests/unit/test_operations/test_variable_is_null.py index 894ba8548..3b2c0e1ba 100644 --- a/tests/unit/test_operations/test_variable_is_null.py +++ b/tests/unit/test_operations/test_variable_is_null.py @@ -102,3 +102,23 @@ def test_variable_is_null_evaluation_dataset_level(mock_data_service, operation_ assert operation_params.operation_id in result for val in result[operation_params.operation_id]: assert val is True + + +def test_variable_is_null_raises_value_error_for_row_level_submission( + mock_data_service, operation_params +): + data = PandasDataset.from_dict({"VAR1": ["A", "B", "C"], "VAR2": [1, 2, 3]}) + config = ConfigService() + cache = CacheServiceFactory(config).get_cache_service() + operation_params.dataframe = data + operation_params.target = "VAR1" + operation_params.domain = "AE" + operation_params.source = "submission" + operation_params.level = "row" + mock_data_service.get_dataset.return_value = data + mock_data_service.dataset_implementation = PandasDataset + + with pytest.raises( + ValueError, match="level: row may only be used with source: evaluation" + ): + VariableIsNull(operation_params, data, cache, mock_data_service).execute()