diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index a626e9680..f28ff69b7 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -1642,60 +1642,142 @@ def value_has_multiple_references(self, other_value: dict): def value_does_not_have_multiple_references(self, other_value: dict): return ~self.value_has_multiple_references(other_value) - def check_target_ascending_in_sorted_group(self, group, target, comparator): + def _mark_invalid_null_positions(self, is_valid, group, null_mask, na_pos): + null_indices = group[null_mask].index.tolist() + non_null_indices = group[~null_mask].index.tolist() + index_order = group.index.tolist() + + if not null_indices or not non_null_indices: + return is_valid + + if na_pos == "last": + last_non_null = max(index_order.index(i) for i in non_null_indices) + first_null = min(index_order.index(i) for i in null_indices) + if first_null < last_non_null: + is_valid[null_mask] = False + else: + last_null = max(index_order.index(i) for i in null_indices) + first_non_null = min(index_order.index(i) for i in non_null_indices) + if last_null > first_non_null: + is_valid[null_mask] = False + + return is_valid + + def _verify_neighbor_consistency( + self, + is_valid, + non_null_rows, + target, + comparator, + ascending, + is_numeric_comparator, + ): + indices = non_null_rows.index.tolist() + comparator_vals = non_null_rows[comparator].tolist() + + for i, idx in enumerate(indices): + if not is_valid.loc[idx]: + continue + + curr = comparator_vals[i] + if self._is_null_or_empty(curr): + continue + + prev = next( + ( + comparator_vals[j] + for j in range(i - 1, -1, -1) + if not self._is_null_or_empty(comparator_vals[j]) + ), + None, + ) + nxt = next( + ( + comparator_vals[j] + for j in range(i + 1, len(comparator_vals)) + if not self._is_null_or_empty(comparator_vals[j]) + ), + None, + 
) + if not is_numeric_comparator and not is_valid_date(str(curr)): + continue + + if ascending: + if prev is not None and curr < prev: + is_valid.loc[idx] = False + elif nxt is not None and curr > nxt: + is_valid.loc[idx] = False + else: + if prev is not None and curr > prev: + is_valid.loc[idx] = False + elif nxt is not None and curr < nxt: + is_valid.loc[idx] = False + + return is_valid + + def check_target_ascending_in_sorted_group( + self, group, target, comparator, ascending, na_pos + ): """ Check if target values are in ascending order within a group already sorted by comparator. - - Null comparator or null target: mark that row as False - - Only check ascending order between rows where both are non-null """ is_valid = pd.Series(True, index=group.index) - target_values = group[target].tolist() - comparator_values = group[comparator].tolist() + is_numeric_comparator = pd.api.types.is_numeric_dtype(group[comparator]) is_numeric_target = pd.api.types.is_numeric_dtype(group[target]) - # Mark any row with null comparator or null target as False - for i in range(len(target_values)): - if pd.isna(comparator_values[i]) or pd.isna(target_values[i]): - is_valid.iloc[i] = False + null_mask = group[comparator].isna() | ( + group[comparator].astype(str).str.strip() == "" + ) + non_null_rows = group[~null_mask] - # Only check ascending order on rows where both target and comparator are non-null - valid_positions = [ - i - for i in range(len(target_values)) - if not pd.isna(comparator_values[i]) and not pd.isna(target_values[i]) + is_valid = self._mark_invalid_null_positions(is_valid, group, null_mask, na_pos) + overlap_check = self.check_date_overlaps(group, target, comparator) + overlap_invalid_mask = ~overlap_check + non_null_rows_for_order_check = non_null_rows[ + ~overlap_invalid_mask[non_null_rows.index] ] - for i in range(len(valid_positions) - 1): - curr_pos = valid_positions[i] - next_pos = valid_positions[i + 1] - current = target_values[curr_pos] - next_val = 
target_values[next_pos] - - if ( + non_null_sorted = non_null_rows_for_order_check.sort_values( + by=comparator, ascending=ascending + ) + actual_target = non_null_rows_for_order_check[target].tolist() + expected_target = non_null_sorted[target].tolist() + non_null_indices = non_null_rows_for_order_check.index.tolist() + + for i in range(len(actual_target)): + actual = actual_target[i] + expected_val = expected_target[i] + + if pd.isna(actual) and pd.isna(expected_val): + continue + elif pd.isna(actual) or pd.isna(expected_val): + is_valid.loc[non_null_indices[i]] = False + elif ( not is_numeric_target - and is_valid_date(current) - and is_valid_date(next_val) + and is_valid_date(actual) + and is_valid_date(expected_val) ): - date1, _ = parse_date(current) - date2, _ = parse_date(next_val) - if date1 > date2: - is_valid.iloc[curr_pos] = False - is_valid.iloc[next_pos] = False + date1, _ = parse_date(actual) + date2, _ = parse_date(expected_val) + if date1 != date2: + is_valid.loc[non_null_indices[i]] = False else: - if current > next_val: - is_valid.iloc[curr_pos] = False - is_valid.iloc[next_pos] = False - + if actual != expected_val: + is_valid.loc[non_null_indices[i]] = False + + non_null_target_sorted = non_null_rows.sort_values(by=target, ascending=True) + is_valid = self._verify_neighbor_consistency( + is_valid, + non_null_target_sorted, + target, + comparator, + ascending, + is_numeric_comparator, + ) return is_valid def check_date_overlaps(self, group, target, comparator): - """ - Check for date overlaps in comparator column. - When dates have different precisions and overlap, mark them as invalid. - Only applies to date columns - returns all True for numeric columns. - Skips null comparator values. 
- """ comparator_values = group[comparator].tolist() is_valid = pd.Series(True, index=group.index) is_numeric = pd.api.types.is_numeric_dtype(group[comparator]) @@ -1703,30 +1785,33 @@ def check_date_overlaps(self, group, target, comparator): if is_numeric: return is_valid - # Only check non-null comparator values valid_positions = [ i for i in range(len(comparator_values)) - if not pd.isna(comparator_values[i]) + if not ( + pd.isna(comparator_values[i]) or str(comparator_values[i]).strip() == "" + ) ] - for i in range(len(valid_positions) - 1): + for i in range(len(valid_positions)): curr_pos = valid_positions[i] - next_pos = valid_positions[i + 1] current = comparator_values[curr_pos] - next_val = comparator_values[next_pos] - - if is_valid_date(current) and is_valid_date(next_val): - date1, prec1 = parse_date(current) - date2, prec2 = parse_date(next_val) - + if not is_valid_date(current): + continue + _, prec1 = parse_date(current) + for j in range(len(valid_positions)): + if i == j: + continue + other_pos = valid_positions[j] + other = comparator_values[other_pos] + if not is_valid_date(other): + continue + _, prec2 = parse_date(other) if prec1 != prec2: - overlaps, less_precise = dates_overlap(date1, prec1, date2, prec2) + overlaps, _ = dates_overlap(current, prec1, other, prec2) if overlaps: - if date1.startswith(less_precise): - is_valid.iloc[curr_pos] = False - elif date2.startswith(less_precise): - is_valid.iloc[next_pos] = False + is_valid.iloc[curr_pos] = False + is_valid.iloc[other_pos] = False return is_valid @@ -1767,20 +1852,6 @@ def _process_grouped_result( @log_operator_execution @type_operator(FIELD_DATAFRAME) def target_is_sorted_by(self, other_value: dict): - """ - Check if target is in ascending order when rows are sorted by comparator. - - Nulls in either target or comparator are marked False and excluded - from the ascending order check. - - Process: - 1. Sort data by within columns (always ASC) and comparator (ASC/DESC) - 2. 
Within each group: - - Mark null comparator or null target rows as False - - Check remaining rows: is target ascending? - - Check for date overlaps in comparator (if dates) - 3. Map results back to original row order - """ target = other_value.get("target") within_columns = self._normalize_grouping_columns(other_value.get("within")) columns = other_value["comparator"] @@ -1790,23 +1861,22 @@ def target_is_sorted_by(self, other_value: dict): for col in columns: comparator: str = self.replace_prefix(col["name"]) ascending: bool = col["sort_order"].lower() != "desc" + na_pos: str = col.get("null_position", "last") selected_columns = list( dict.fromkeys([target, comparator, *within_columns]) ) - # Sort by within columns (always ASC) and comparator in specified order sorted_df = self.value[selected_columns].sort_values( - by=[*within_columns, comparator], - ascending=[True] * len(within_columns) + [ascending], + by=[*within_columns, target], + ascending=[True] * (len(within_columns) + 1), ) grouped_df = sorted_df.groupby(within_columns, sort=False) - # Check 1: Target is ascending in sorted groups, nulls marked False target_check = grouped_df.apply( lambda x: self.check_target_ascending_in_sorted_group( - x, target, comparator + x, target, comparator, ascending, na_pos ) ) target_check = self._process_grouped_result( @@ -1815,11 +1885,10 @@ def target_is_sorted_by(self, other_value: dict): within_columns, sorted_df, lambda group: self.check_target_ascending_in_sorted_group( - group, target, comparator + group, target, comparator, ascending, na_pos ), ) - # Check 2: No date overlaps in comparator (only for date columns) date_overlap_check = grouped_df.apply( lambda x: self.check_date_overlaps(x, target, comparator) ) @@ -1831,10 +1900,7 @@ def target_is_sorted_by(self, other_value: dict): lambda group: self.check_date_overlaps(group, target, comparator), ) - # Combine both checks combined_check = target_check & date_overlap_check - - # Map results back to original 
dataframe order result = result & combined_check.reindex(self.value.index, fill_value=True) if isinstance(result, (pd.DataFrame, dd.DataFrame)): diff --git a/resources/schema/rule/Operator.json b/resources/schema/rule/Operator.json index f18d9937f..4c6bb6c87 100644 --- a/resources/schema/rule/Operator.json +++ b/resources/schema/rule/Operator.json @@ -579,6 +579,10 @@ "items": { "properties": { "name": { "$ref": "Operator.json#/properties/name" }, + "null_position": { + "enum": ["first", "last"], + "type": "string" + }, "order": { "$ref": "Operator.json#/properties/order" } }, "type": "object" diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index a5ce269bf..5218ff867 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -1120,7 +1120,7 @@ Complement of `is_ordered_by` ### target_is_sorted_by -True if the values in `name` are ordered according to the values specified by `value` in ascending/descending order, grouped by the values in `within`. Each `value` requires a variable `name` and an ordering of 'asc' or 'desc' specified by `order`. `within` accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 'YYYY-MM-DD' format +True if the values in `name` are ordered according to the values specified by `value` in ascending/descending order, grouped by the values in `within`. Each `value` entry requires a variable `name`, a `sort_order` of 'asc' or 'desc', and an optional `null_position` of 'first' or 'last' (defaults to 'last') which controls where null/empty comparator values are placed in the expected ordering. `within` accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 'YYYY-MM-DD' format. Date value(s) with different precisions that overlap (e.g. 2005-10, 2005-10-03 and 2005-10-08) are all flagged as not sorted as their order cannot be inferred. 
```yaml Check: @@ -1133,6 +1133,7 @@ Check: value: - name: --STDTC sort_order: asc + null_position: last ``` ### target_is_not_sorted_by diff --git a/tests/unit/test_check_operators/test_relationship_integrity_checks.py b/tests/unit/test_check_operators/test_relationship_integrity_checks.py index 4b0da3398..8e8b78752 100644 --- a/tests/unit/test_check_operators/test_relationship_integrity_checks.py +++ b/tests/unit/test_check_operators/test_relationship_integrity_checks.py @@ -776,7 +776,7 @@ def test_target_is_sorted_by_dates(dataset_class): "2006-06-02", "2006-06-04", "2006-06-01", - "2006-06-05", + "2006-06", "2006-06-03", ], } @@ -784,7 +784,33 @@ def test_target_is_sorted_by_dates(dataset_class): result = DataframeType( {"value": df_invalid, "column_prefix_map": {"--": "SE"}} ).target_is_sorted_by(other_value) - assert result.equals(pd.Series([True, False, False, True, True])) + assert result.equals(pd.Series([False, False, False, False, False])) + + df_partial = dataset_class.from_dict( + { + "USUBJID": [ + "CDISC001", + "CDISC001", + "CDISC001", + "CDISC001", + "CDISC001", + "CDISC001", + ], + "SESEQ": [1, 2, 3, 4, 5, 6], + "SESTDTC": [ + "2006-05-01", + "2006-06-01", + "2006-06-03", + "2006-06", + "2006-06-05", + "2006-06-07", + ], + } + ) + result = DataframeType( + {"value": df_partial, "column_prefix_map": {"--": "SE"}} + ).target_is_sorted_by(other_value) + assert result.equals(pd.Series([True, False, False, False, False, False])) df_desc = dataset_class.from_dict( { @@ -958,26 +984,59 @@ def test_target_is_sorted_by_multiple_within_numeric(dataset_class): @pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) def test_target_is_sorted_by_with_nulls(dataset_class): - """Test target_is_sorted_by handles null values correctly. - Null in either target or comparator marks that row as False, - but does not affect the ordering check of surrounding non-null rows. 
- """ - df = dataset_class.from_dict( + df_null_last = dataset_class.from_dict( { - "USUBJID": [123, 456, 456, 123, 123], - "SESEQ": [1, 2, 1, None, None], - "SESTDTC": ["2006-06-02", None, "2006-06-01", None, "2006-06-03"], + "USUBJID": ["001", "001", "001", "001", "002", "002", "002"], + "SESEQ": [1, 2, 3, 4, 1, 2, 3], + "SESTDTC": [ + "2006-01-01", + "2006-01-02", + "", + "2006-01-04", + "2006-01-01", + "", + "2006-01-03", + ], } ) - other_value = { + other_value_last = { "target": "--SEQ", "within": "USUBJID", - "comparator": [{"name": "--STDTC", "sort_order": "ASC"}], + "comparator": [ + {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"} + ], } result = DataframeType( - {"value": df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals(pd.Series([True, False, True, False, False])) + {"value": df_null_last, "column_prefix_map": {"--": "SE"}} + ).target_is_sorted_by(other_value_last) + assert result.equals(pd.Series([True, True, False, True, True, False, True])) + + df_null_first = dataset_class.from_dict( + { + "USUBJID": ["001", "001", "001", "001", "002", "002", "002"], + "SESEQ": [1, 2, 3, 4, 1, 2, 3], + "SESTDTC": [ + "", + "2006-01-01", + "2006-01-02", + "2006-01-03", + "2006-01-01", + "", + "2006-01-03", + ], + } + ) + other_value_first = { + "target": "--SEQ", + "within": "USUBJID", + "comparator": [ + {"name": "--STDTC", "sort_order": "ASC", "null_position": "first"} + ], + } + result = DataframeType( + {"value": df_null_first, "column_prefix_map": {"--": "SE"}} + ).target_is_sorted_by(other_value_first) + assert result.equals(pd.Series([True, True, True, True, True, False, True])) @pytest.mark.parametrize(