diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py index 7c98f1942..5c00e5e5c 100644 --- a/cdisc_rules_engine/check_operators/dataframe_operators.py +++ b/cdisc_rules_engine/check_operators/dataframe_operators.py @@ -1357,14 +1357,16 @@ def is_unique_relationship(self, other_value): def is_ordered_set(self, other_value): target = other_value.get("target") value = other_value.get("comparator") - if not isinstance(value, str): - raise Exception("Comparator must be a single String value") + if not isinstance(value, (str, list)): + raise Exception("Comparator must be a String or list of Strings") + if isinstance(value, list) and not all(isinstance(v, str) for v in value): + raise Exception("All comparator values must be Strings") return self.value.is_column_sorted_within(value, target) @log_operator_execution @type_operator(FIELD_DATAFRAME) def is_not_ordered_set(self, other_value): - return not self.is_ordered_set(other_value) + return ~self.is_ordered_set(other_value) @log_operator_execution @type_operator(FIELD_DATAFRAME) @@ -1637,47 +1639,93 @@ def value_has_multiple_references(self, other_value: dict): def value_does_not_have_multiple_references(self, other_value: dict): return ~self.value_has_multiple_references(other_value) - def check_basic_sort_order(self, group, target, comparator, ascending): + def check_target_ascending_in_sorted_group(self, group, target, comparator): + """ + Check if target values are in ascending order within a group + already sorted by comparator. + - Null comparator or null target: mark that row as False + - Only check ascending order between rows where both are non-null + """ + is_valid = pd.Series(True, index=group.index) target_values = group[target].tolist() comparator_values = group[comparator].tolist() - is_sorted = pd.Series(True, index=group.index) - - def safe_compare(x, index): - if pd.isna(x): - is_sorted.loc[index] = False - return "9999-12-31" if ascending else "0001-01-01" - return x - - expected_order = sorted( - range(len(comparator_values)), - key=lambda k: safe_compare(comparator_values[k], group.index[k]), - reverse=not ascending, - ) - actual_order = sorted(range(len(target_values)), key=lambda k: target_values[k]) - - mismatches = np.array(expected_order) != np.array(actual_order) - is_sorted.iloc[mismatches] = False + is_numeric_target = pd.api.types.is_numeric_dtype(group[target]) + + # Mark any row with null comparator or null target as False + for i in range(len(target_values)): + if pd.isna(comparator_values[i]) or pd.isna(target_values[i]): + is_valid.iloc[i] = False + + # Only check ascending order on rows where both target and comparator are non-null + valid_positions = [ + i + for i in range(len(target_values)) + if not pd.isna(comparator_values[i]) and not pd.isna(target_values[i]) + ] + + for i in range(len(valid_positions) - 1): + curr_pos = valid_positions[i] + next_pos = valid_positions[i + 1] + current = target_values[curr_pos] + next_val = target_values[next_pos] + + if ( + not is_numeric_target + and is_valid_date(current) + and is_valid_date(next_val) + ): + date1, _ = parse_date(current) + date2, _ = parse_date(next_val) + if date1 > date2: + is_valid.iloc[curr_pos] = False + is_valid.iloc[next_pos] = False + else: + if current > next_val: + is_valid.iloc[curr_pos] = False + is_valid.iloc[next_pos] = False - return is_sorted + return is_valid def check_date_overlaps(self, group, target, comparator): + """ + Check for date overlaps in comparator column. + When dates have different precisions and overlap, mark them as invalid. + Only applies to date columns - returns all True for numeric columns. + Skips null comparator values. + """ comparator_values = group[comparator].tolist() - is_sorted = pd.Series(True, index=group.index) + is_valid = pd.Series(True, index=group.index) + is_numeric = pd.api.types.is_numeric_dtype(group[comparator]) + + if is_numeric: + return is_valid + + # Only check non-null comparator values + valid_positions = [ + i + for i in range(len(comparator_values)) + if not pd.isna(comparator_values[i]) + ] + + for i in range(len(valid_positions) - 1): + curr_pos = valid_positions[i] + next_pos = valid_positions[i + 1] + current = comparator_values[curr_pos] + next_val = comparator_values[next_pos] + + if is_valid_date(current) and is_valid_date(next_val): + date1, prec1 = parse_date(current) + date2, prec2 = parse_date(next_val) - for i in range(len(comparator_values) - 1): - if is_valid_date(comparator_values[i]) and is_valid_date( - comparator_values[i + 1] - ): - date1, prec1 = parse_date(comparator_values[i]) - date2, prec2 = parse_date(comparator_values[i + 1]) if prec1 != prec2: overlaps, less_precise = dates_overlap(date1, prec1, date2, prec2) - if overlaps and date1.startswith(less_precise): - is_sorted.iloc[i] = False - elif overlaps and date2.startswith(less_precise): - is_sorted.iloc[i + 1] = False + if overlaps: + if date1.startswith(less_precise): + is_valid.iloc[curr_pos] = False + elif date2.startswith(less_precise): + is_valid.iloc[next_pos] = False - return is_sorted + return is_valid def _process_grouped_result( self, @@ -1717,38 +1765,58 @@ def _process_grouped_result( @type_operator(FIELD_DATAFRAME) def target_is_sorted_by(self, other_value: dict): """ - Checking the sort order based on comparators, including date overlap checks + Check if target is in ascending order when rows are sorted by comparator. + + Nulls in either target or comparator are marked False and excluded + from the ascending order check. + + Process: + 1. Sort data by within columns (always ASC) and comparator (ASC/DESC) + 2. Within each group: + - Mark null comparator or null target rows as False + - Check remaining rows: is target ascending? + - Check for date overlaps in comparator (if dates) + 3. Map results back to original row order """ target = other_value.get("target") within_columns = self._normalize_grouping_columns(other_value.get("within")) columns = other_value["comparator"] + result = pd.Series([True] * len(self.value), index=self.value.index) + for col in columns: comparator: str = self.replace_prefix(col["name"]) ascending: bool = col["sort_order"].lower() != "desc" - na_pos: str = col["null_position"] + selected_columns = list( dict.fromkeys([target, comparator, *within_columns]) ) + + # Sort by within columns (always ASC) and comparator in specified order sorted_df = self.value[selected_columns].sort_values( by=[*within_columns, comparator], - ascending=ascending, - na_position=na_pos, + ascending=[True] * len(within_columns) + [ascending], ) - grouped_df = sorted_df.groupby(within_columns) - basic_sort_check = grouped_df.apply( - lambda x: self.check_basic_sort_order(x, target, comparator, ascending) + + grouped_df = sorted_df.groupby(within_columns, sort=False) + + # Check 1: Target is ascending in sorted groups, nulls marked False + target_check = grouped_df.apply( + lambda x: self.check_target_ascending_in_sorted_group( + x, target, comparator + ) ) - basic_sort_check = self._process_grouped_result( - basic_sort_check, + target_check = self._process_grouped_result( + target_check, grouped_df, within_columns, sorted_df, - lambda group: self.check_basic_sort_order( - group, target, comparator, ascending + lambda group: self.check_target_ascending_in_sorted_group( + group, target, comparator ), ) + # Check 2: No date overlaps in comparator (only for date columns) date_overlap_check = grouped_df.apply( lambda x: self.check_date_overlaps(x, target, comparator) ) @@ -1759,15 +1827,18 @@ def target_is_sorted_by(self, other_value: dict): sorted_df, lambda group: self.check_date_overlaps(group, target, comparator), ) - combined_check = basic_sort_check & date_overlap_check - result = result.reindex(sorted_df.index, fill_value=True) - result = result & combined_check - result = result.reindex(self.value.index, fill_value=True) + + # Combine both checks + combined_check = target_check & date_overlap_check + + # Map results back to original dataframe order + result = result & combined_check.reindex(self.value.index, fill_value=True) if isinstance(result, (pd.DataFrame, dd.DataFrame)): if isinstance(result, dd.DataFrame): result = result.compute() result = result.squeeze() + return result @log_operator_execution diff --git a/cdisc_rules_engine/models/dataset/dask_dataset.py b/cdisc_rules_engine/models/dataset/dask_dataset.py index b50d5f1f3..36736d00a 100644 --- a/cdisc_rules_engine/models/dataset/dask_dataset.py +++ b/cdisc_rules_engine/models/dataset/dask_dataset.py @@ -2,7 +2,6 @@ import dask.dataframe as dd import dask.array as da import pandas as pd -import numpy as np import re import dask from typing import List, Union @@ -59,19 +58,21 @@ def __getitem__(self, item): raise def is_column_sorted_within(self, group, column): - return ( - False - not in np.concatenate( - self._data.groupby(group, sort=False)[column] - .apply( - lambda partition: sorted(partition.sort_index().values) - == partition.sort_index().values - ) - .compute() - .values + if isinstance(group, str): + group = [group] + + def check_partition(partition): + sorted_vals = sorted(partition.values) + return pd.Series( + [a == b for a, b in zip(partition.values, sorted_vals)], + index=partition.index, ) - .ravel() - .tolist() + + return ( + self._data.compute() + .groupby(group)[column] + .transform(check_partition) + .sort_index() ) def __setitem__(self, key, value): diff --git a/cdisc_rules_engine/models/dataset/pandas_dataset.py b/cdisc_rules_engine/models/dataset/pandas_dataset.py index 6e4cbed4b..497120e3e 100644 --- a/cdisc_rules_engine/models/dataset/pandas_dataset.py +++ b/cdisc_rules_engine/models/dataset/pandas_dataset.py @@ -85,13 +85,17 @@ def get_grouped_size(self, by, **kwargs): return grouped_data.size() def is_column_sorted_within(self, group, column): - return ( - False - not in self.groupby(group)[column] - .apply(list) - .map(lambda x: sorted(x) == x) - .values - ) + if isinstance(group, str): + group = [group] + + def check_partition(partition): + sorted_vals = sorted(partition.values) + return pd.Series( + [a == b for a, b in zip(partition.values, sorted_vals)], + index=partition.index, + ) + + return self.groupby(group)[column].transform(check_partition) def concat(self, other: Union[DatasetInterface, List[DatasetInterface]], **kwargs): if isinstance(other, list): diff --git a/resources/schema/rule/Operator.json b/resources/schema/rule/Operator.json index 6abac2bfd..7055c3d09 100644 --- a/resources/schema/rule/Operator.json +++ b/resources/schema/rule/Operator.json @@ -569,10 +569,6 @@ "items": { "properties": { "name": { "$ref": "Operator.json#/properties/name" }, - "null_position": { - "enum": ["first", "last"], - "type": "string" - }, "order": { "$ref": "Operator.json#/properties/order" } }, "type": "object" diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md index f4659ffb3..1c01837ee 100644 --- a/resources/schema/rule/Operator.md +++ b/resources/schema/rule/Operator.md @@ -1059,7 +1059,7 @@ Complement of `has_next_corresponding_record` ### is_ordered_set -True if the dataset rows are in ascending order of the values within `name`, grouped by the values within `value` +True if the dataset rows are in ascending order of the values within `name`, grouped by the values within `value`. Value can either be a single column or multiple. ```yaml Check: @@ -1069,6 +1069,16 @@ Check: value: USUBJID ``` +```yaml +Check: + all: + - name: --SEQ + operator: is_ordered_set + value: + - USUBJID + - "--TESTCD" +``` + ### is_ordered_by True if the dataset rows are ordered by the values within `name`, given the ordering specified by `order` @@ -1087,7 +1097,7 @@ Complement of `is_ordered_by` ### target_is_sorted_by -True if the values in `name` are ordered according to the values specified by `value` grouped by the values in `within`. Each `value` requires a variable `name`, ordering specified by `order`, and the null position specified by `null_position`. `within` accepts either a single column or an ordered list of columns. +True if the values in `name` are ordered according to the values specified by `value` in ascending/descending order, grouped by the values in `within`. Each `value` requires a variable `name` and an ordering of 'asc' or 'desc' specified by `order`. `within` accepts either a single column or an ordered list of columns. Columns can be either number or Char Dates in ISO8601 'YYYY-MM-DD' format ```yaml Check: @@ -1100,7 +1110,6 @@ Check: value: - name: --STDTC sort_order: asc - null_position: last ``` ### target_is_not_sorted_by diff --git a/tests/unit/test_check_operators/test_relationship_integrity_checks.py b/tests/unit/test_check_operators/test_relationship_integrity_checks.py index a7edcab5c..4b0da3398 100644 --- a/tests/unit/test_check_operators/test_relationship_integrity_checks.py +++ b/tests/unit/test_check_operators/test_relationship_integrity_checks.py @@ -743,12 +743,9 @@ def test_has_next_corresponding_record(dataset_class): @pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) -def test_target_is_sorted_by(dataset_class): - """ - Unit test for target_is_sorted_by operator. - The test verifies if --SEQ is sorted based on set of user-defined columns - """ - valid_asc_df = dataset_class.from_dict( +def test_target_is_sorted_by_dates(dataset_class): + """Test target_is_sorted_by with date columns.""" + df = dataset_class.from_dict( { "USUBJID": ["CDISC001", "CDISC002", "CDISC002", "CDISC001", "CDISC001"], "SESEQ": [1, 2, 1, 3, 2], @@ -761,133 +758,20 @@ def test_target_is_sorted_by(dataset_class): ], } ) - - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"} - ], - } - result = DataframeType( - {"value": valid_asc_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - True, - True, - True, - True, - ] - ) - ) - - -@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) -def test_target_is_sorted_by_multiple_within(dataset_class): - usubjid = ["CDISC001", "CDISC001", "CDISC001", "CDISC001", "CDISC002", "CDISC002"] - midstype = ["A", "A", "B", "B", "A", "A"] - mids = ["A1", "A2", "B1", "B2", "A1", "A2"] - smstdtc = [ - "2006-06-01", - "2006-06-02", - "2006-06-03", - "2006-06-04", - "2007-01-01", - "2007-01-02", - ] - data = { - "USUBJID": usubjid, - "MIDSTYPE": midstype, - "MIDS": mids, - "SMSTDTC": smstdtc, - } - df = dataset_class.from_dict(data) other_value = { - "target": "MIDS", - "within": ["USUBJID", "MIDSTYPE"], - "comparator": [ - {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "last"} - ], - } - expected = [True] * len(usubjid) - result = DataframeType({"value": df}).target_is_sorted_by(other_value) - assert result.equals(df.convert_to_series(expected)) - - -@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) -def test_target_is_sorted_by_multiple_within_not_sorted(dataset_class): - usubjid = ["CDISC001", "CDISC001", "CDISC001", "CDISC001", "CDISC002", "CDISC002"] - midstype = ["A", "A", "B", "B", "A", "A"] - mids = ["A2", "A1", "B1", "B2", "A1", "A2"] - smstdtc = [ - "2006-06-01", - "2006-06-02", - "2006-06-03", - "2006-06-04", - "2007-01-01", - "2007-01-02", - ] - data = { - "USUBJID": usubjid, - "MIDSTYPE": midstype, - "MIDS": mids, - "SMSTDTC": smstdtc, - } - df = dataset_class.from_dict(data) - other_value = { - "target": "MIDS", - "within": ["USUBJID", "MIDSTYPE"], - "comparator": [ - {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "last"} - ], - } - expected = [False, False, True, True, True, True] - result = DataframeType({"value": df}).target_is_sorted_by(other_value) - assert result.equals(df.convert_to_series(expected)) - - valid_desc_df = dataset_class.from_dict( - { - "USUBJID": ["CDISC001", "CDISC002", "CDISC002", "CDISC001", "CDISC001"], - "SESEQ": [3, 2, 1, 2, 1], - "SESTDTC": [ - "2006-06-05", - "2006-06-04", - "2006-06-01", - "2006-06-03", - "2006-06-02", - ], - } - ) - - other_value: dict = { "target": "--SEQ", "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "DESC", "null_position": "last"} - ], + "comparator": [{"name": "--STDTC", "sort_order": "ASC"}], } result = DataframeType( - {"value": valid_desc_df, "column_prefix_map": {"--": "SE"}} + {"value": df, "column_prefix_map": {"--": "SE"}} ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - False, - False, - False, - True, - False, - ] - ) - ) + assert result.equals(pd.Series([True, True, True, True, True])) - valid_asc_df = dataset_class.from_dict( + df_invalid = dataset_class.from_dict( { - "USUBJID": [123, 456, 456, 123, 123], - "SESEQ": [1, 2, 1, 3, 2], + "USUBJID": ["CDISC001", "CDISC002", "CDISC002", "CDISC001", "CDISC001"], + "SESEQ": [1, 2, 3, 3, 2], "SESTDTC": [ "2006-06-02", "2006-06-04", @@ -897,29 +781,12 @@ def test_target_is_sorted_by_multiple_within_not_sorted(dataset_class): ], } ) - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"} - ], - } result = DataframeType( - {"value": valid_asc_df, "column_prefix_map": {"--": "SE"}} + {"value": df_invalid, "column_prefix_map": {"--": "SE"}} ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - True, - True, - True, - True, - ] - ) - ) + assert result.equals(pd.Series([True, False, False, True, True])) - valid_desc_df = dataset_class.from_dict( + df_desc = dataset_class.from_dict( { "USUBJID": [123, 456, 456, 123, 123], "SESEQ": [1, 2, 1, 3, 2], @@ -932,538 +799,185 @@ def test_target_is_sorted_by_multiple_within_not_sorted(dataset_class): ], } ) - other_value: dict = { + other_value_desc = { "target": "--SEQ", "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "DESC", "null_position": "last"} - ], + "comparator": [{"name": "--STDTC", "sort_order": "DESC"}], } result = DataframeType( - {"value": valid_desc_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - False, - False, - False, - False, - True, - ] - ) - ) - - invalid_df = dataset_class.from_dict( - { - "USUBJID": ["CDISC001", "CDISC002", "CDISC002", "CDISC001", "CDISC001"], - "SESEQ": [1, 2, 3, 3, 2], - "SESTDTC": [ - "2006-06-02", - "2006-06-04", - "2006-06-01", - "2006-06-05", - "2006-06-03", - ], - } - ) + {"value": df_desc, "column_prefix_map": {"--": "SE"}} + ).target_is_sorted_by(other_value_desc) + assert result.equals(pd.Series([False, False, False, False, False])) - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"} - ], - } - result = DataframeType( - {"value": invalid_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - False, - False, - True, - True, - ] - ) - ) - valid_mul_df = dataset_class.from_dict( +@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) +def test_target_is_sorted_by_numeric(dataset_class): + """Test target_is_sorted_by with numeric columns.""" + df = dataset_class.from_dict( { - "USUBJID": ["CDISC001", "CDISC002", "CDISC002", "CDISC001", "CDISC001"], - "SESEQ": [1, 2, 1, 3, 2], - "SESTDTC": [ - "2006-06-02", - "2006-06-04", - "2006-06-01", - "2006-06-05", - "2006-06-03", - ], - "STUDYID": [ - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - ], - "SEENDTC": [ - "2006-06-02", - "2006-06-04", - "2006-06-01", - "2006-06-05", - "2006-06-03", + "ARM": [ + "INSULIN", + "INSULIN", + "INSULIN", + "METFORMIN", + "METFORMIN", + "METFORMIN", ], + "TAETORD": [0, 1, 2, 0, 1, 2], + "VISIT": [1, 2, 3, 1, 2, 3], } ) - - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"}, - {"name": "--ENDTC", "sort_order": "ASC", "null_position": "last"}, - ], + other_value = { + "target": "TAETORD", + "within": "ARM", + "comparator": [{"name": "VISIT", "sort_order": "ASC"}], } - result = DataframeType( - {"value": valid_mul_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - True, - True, - True, - True, - ] - ) - ) + result = DataframeType({"value": df}).target_is_sorted_by(other_value) + assert result.equals(pd.Series([True, True, True, True, True, True])) - valid_mul_df = dataset_class.from_dict( + df_invalid = dataset_class.from_dict( { - "USUBJID": ["CDISC001", "CDISC002", "CDISC002", "CDISC001", "CDISC001"], - "SESEQ": [7, 1, 2, 8, 6], - "SESTDTC": [ - "2006-06-03", - "2006-06-04", - "2006-06-01", - "2006-06-05", - "2006-06-01", - ], - "STUDYID": [ - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - ], - "SEENDTC": [ - "2006-06-03", - "2006-06-04", - "2006-06-01", - "2006-06-05", - "2006-06-01", + "ARM": [ + "INSULIN", + "INSULIN", + "INSULIN", + "METFORMIN", + "METFORMIN", + "METFORMIN", ], + "TAETORD": [0, 2, 1, 0, 1, 2], + "VISIT": [1, 2, 3, 1, 2, 3], } ) + result = DataframeType({"value": df_invalid}).target_is_sorted_by(other_value) + assert result.equals(pd.Series([True, False, False, True, True, True])) - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "DESC", "null_position": "last"}, - {"name": "--ENDTC", "sort_order": "DESC", "null_position": "last"}, - ], - } - result = DataframeType( - {"value": valid_mul_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - True, - True, - False, - False, - ] - ) - ) - - valid_mul_df = dataset_class.from_dict( + df_desc = dataset_class.from_dict( { - "USUBJID": ["CDISC001", "CDISC001", "CDISC001", "CDISC001", "CDISC001"], - "SESEQ": [1, 2, 5, 8, 12], - "SESTDTC": [ - "2006-06-01", - "2006-06-02", - "2006-06-03", - "2006-06-04", - "2006-06-05", - ], - "STUDYID": [ - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - ], - "SEENDTC": [ - "2006-06-04", - "2006-06-05", - "2006-06-06", - "2006-06-07", - "2006-06-08", + "ARM": [ + "INSULIN", + "INSULIN", + "INSULIN", + "METFORMIN", + "METFORMIN", + "METFORMIN", ], + "TAETORD": [2, 1, 0, 0, 1, 3], + "VISIT": [3, 2, 1, 3, 2, 1], } ) - - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"}, - {"name": "--ENDTC", "sort_order": "DESC", "null_position": "last"}, - ], + other_value_desc = { + "target": "TAETORD", + "within": "ARM", + "comparator": [{"name": "VISIT", "sort_order": "DESC"}], } - result = DataframeType( - {"value": valid_mul_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - False, - False, - True, - False, - False, - ] - ) - ) + result = DataframeType({"value": df_desc}).target_is_sorted_by(other_value_desc) + assert result.equals(pd.Series([False, False, False, True, True, True])) - invalid_mul_df = dataset_class.from_dict( + +@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) +def test_target_is_sorted_by_multiple_within(dataset_class): + """Test target_is_sorted_by with multiple grouping columns.""" + df = dataset_class.from_dict( { - "USUBJID": ["CDISC001", "CDISC002", "CDISC002", "CDISC001", "CDISC001"], - "SESEQ": [1, 2, 1, 1, 2], - "SESTDTC": [ - "2006-06-02", - "2006-06-04", - "2006-06-01", - "2006-06-05", - "2006-06-03", - ], - "STUDYID": [ - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", - "CDISCPILOT1", + "USUBJID": [ + "CDISC001", + "CDISC001", + "CDISC001", + "CDISC001", + "CDISC002", + "CDISC002", ], - "SEENDTC": [ - "2006-06-02", - "2006-06-04", + "MIDSTYPE": ["A", "A", "B", "B", "A", "A"], + "MIDS": ["A1", "A2", "B1", "B2", "A1", "A2"], + "SMSTDTC": [ "2006-06-01", - "2006-06-05", + "2006-06-02", "2006-06-03", + "2006-06-04", + "2007-01-01", + "2007-01-02", ], } ) - - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"}, - {"name": "--ENDTC", "sort_order": "ASC", "null_position": "last"}, - ], - } - result = DataframeType( - {"value": invalid_mul_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - True, - True, - False, - False, - ] - ) - ) - - valid_na_df = dataset_class.from_dict( - { - "USUBJID": [123, 456, 456, 123, 123], - "SESEQ": [1, 2, 1, None, None], - "SESTDTC": ["2006-06-02", None, "2006-06-01", None, "2006-06-03"], - } - ) - - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"} - ], - } - result = DataframeType( - {"value": valid_na_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - False, - True, - False, - True, - ] - ) - ) - - invalid_na_df = dataset_class.from_dict( - { - "USUBJID": [123, 456, 456, 123, 123], - "SESEQ": [1, 2, 3, None, None], - "SESTDTC": ["2006-06-02", None, "2006-06-01", None, "2006-06-03"], - } - ) - - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"} - ], - } - result = DataframeType( - {"value": invalid_na_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - False, - False, - False, - True, - ] - ) - ) - - -_COMPARATOR_SMSTDTC = [ - {"name": "SMSTDTC", "sort_order": "ASC", "null_position": "last"} -] -_INVALID_INPUT_DF = { - "USUBJID": ["001", "002"], - "MIDS": ["A1", "A2"], - "SMSTDTC": ["2005-01-01", "2006-01-01"], -} - - -@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) -def test_target_is_sorted_by_and_complement_multi_within(dataset_class): - data = { - "USUBJID": ["001", "001", "003", "003"], - "MIDSTYPE": ["DIAGNOSIS", "DIAGNOSIS", "RELAPSE", "RELAPSE"], - "MIDS": ["DIAG1", "DIAG2", "RELAPSE2", "RELAPSE1"], - "SMSTDTC": ["2005-01-01", "2006-01-01", "2005-01-01", "2006-01-01"], - } other_value = { "target": "MIDS", "within": ["USUBJID", "MIDSTYPE"], - "comparator": _COMPARATOR_SMSTDTC, - } - sorted_expected = [True, True, False, False] - df = dataset_class.from_dict(data) - dt = DataframeType({"value": df}) - assert dt.target_is_sorted_by(other_value).equals( - df.convert_to_series(sorted_expected) - ) - assert dt.target_is_not_sorted_by(other_value).equals( - df.convert_to_series([not x for x in sorted_expected]) - ) - - -@pytest.mark.parametrize( - "other_value,exc_type,match", - [ - ( - {"target": "MIDS", "comparator": _COMPARATOR_SMSTDTC}, - ValueError, - "within parameter is required", - ), - ( - {"target": "MIDS", "within": [], "comparator": _COMPARATOR_SMSTDTC}, - ValueError, - "within must contain valid column names", - ), - ({"target": "MIDS", "within": "USUBJID"}, KeyError, "comparator"), - ( - { - "target": "MIDS", - "within": "USUBJID", - "comparator": [{"sort_order": "ASC", "null_position": "last"}], - }, - KeyError, - "name", - ), - ( - { - "target": "MIDS", - "within": "USUBJID", - "comparator": [ - { - "name": "NONEXISTENT", - "sort_order": "ASC", - "null_position": "last", - } - ], - }, - KeyError, - None, - ), - ], - ids=[ - "missing_within", - "empty_within", - "missing_comparator", - "malformed_comparator", - "missing_column", - ], -) -def test_target_is_sorted_by_invalid_input_raises(other_value, exc_type, match): - df = PandasDataset.from_dict(_INVALID_INPUT_DF) - with pytest.raises(exc_type, match=match): - DataframeType({"value": df}).target_is_sorted_by(other_value) - - -@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) -@pytest.mark.parametrize( - "data,expected", - [ - ({"USUBJID": [], "MIDS": [], "SMSTDTC": []}, None), - ({"USUBJID": ["001"], "MIDS": ["DIAG1"], "SMSTDTC": ["2005-01-01"]}, [True]), - ( - { - "USUBJID": ["001", "001"], - "MIDS": ["DIAG1", "DIAG2"], - "SMSTDTC": [None, None], - }, - [False, False], - ), - ], - ids=["empty_df", "single_row", "all_null_comparator"], -) -def test_target_is_sorted_by_edge_cases(dataset_class, data, expected): - df = dataset_class.from_dict(data) - other_value = { - "target": "MIDS", - "within": "USUBJID", - "comparator": _COMPARATOR_SMSTDTC, + "comparator": [{"name": "SMSTDTC", "sort_order": "ASC"}], } result = DataframeType({"value": df}).target_is_sorted_by(other_value) - if expected is None: - assert len(result) == 0 - else: - assert result.equals(df.convert_to_series(expected)) + assert result.equals(df.convert_to_series([True] * 6)) -@pytest.mark.parametrize("dataset_class", [PandasDataset]) -def test_target_is_sorted_by_datetime(dataset_class): - """ - Test target_is_sorted_by with datetime comparisons - """ - datetime_df = dataset_class.from_dict( +@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) +def test_target_is_sorted_by_multiple_within_numeric(dataset_class): + """Test target_is_sorted_by with multiple grouping columns and numeric comparator.""" + df = dataset_class.from_dict( { - "USUBJID": ["CDISC001", "CDISC001", "CDISC002", "CDISC002", "CDISC003"], - "SESEQ": [1, 2, 1, 2, 1], - "SESTDTC": [ - "2006-06-02 10:00", - "2006-06-02 14:30:00", - "2006-06-03 09:15", - "2006-06-03 11:45:00", - "2006-06-04 08:00:00", + "USUBJID": [ + "CDISC001", + "CDISC001", + "CDISC001", + "CDISC001", + "CDISC002", + "CDISC002", ], + "MIDSTYPE": ["A", "A", "B", "B", "A", "A"], + "MIDS": [1, 2, 1, 2, 1, 2], + "VISITNUM": [1, 2, 1, 2, 1, 2], } ) - - other_value: dict = { - "target": "--SEQ", - "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"} - ], + other_value = { + "target": "MIDS", + "within": ["USUBJID", "MIDSTYPE"], + "comparator": [{"name": "VISITNUM", "sort_order": "ASC"}], } - result = DataframeType( - {"value": datetime_df, "column_prefix_map": {"--": "SE"}} - ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - True, - True, - True, - True, - True, - ] - ) - ) - + result = DataframeType({"value": df}).target_is_sorted_by(other_value) + assert result.equals(df.convert_to_series([True] * 6)) -@pytest.mark.parametrize("dataset_class", [PandasDataset]) -def test_target_is_sorted_by_partial_dates(dataset_class): - """ - Test target_is_sorted_by with partial date comparisons - """ - partial_date_df = dataset_class.from_dict( + df_invalid = dataset_class.from_dict( { "USUBJID": [ "CDISC001", "CDISC001", "CDISC001", + "CDISC001", "CDISC002", "CDISC002", - "CDISC002", - ], - "SESEQ": [1, 2, 3, 1, 2, 3], - "SESTDTC": [ - "2006", - "2006-06", - "2006-06-15", - "2007", - "2007-01", - "2007-02-01", ], + "MIDSTYPE": ["A", "A", "B", "B", "A", "A"], + "MIDS": [2, 1, 1, 2, 2, 1], + "VISITNUM": [1, 2, 1, 2, 1, 2], } ) + result = DataframeType({"value": df_invalid}).target_is_sorted_by(other_value) + assert result.equals( + df_invalid.convert_to_series([False, False, True, True, False, False]) + ) - other_value: dict = { + +@pytest.mark.parametrize("dataset_class", [PandasDataset, DaskDataset]) +def test_target_is_sorted_by_with_nulls(dataset_class): + """Test target_is_sorted_by handles null values correctly. + Null in either target or comparator marks that row as False, + but does not affect the ordering check of surrounding non-null rows. + """ + df = dataset_class.from_dict( + { + "USUBJID": [123, 456, 456, 123, 123], + "SESEQ": [1, 2, 1, None, None], + "SESTDTC": ["2006-06-02", None, "2006-06-01", None, "2006-06-03"], + } + ) + other_value = { "target": "--SEQ", "within": "USUBJID", - "comparator": [ - {"name": "--STDTC", "sort_order": "ASC", "null_position": "last"} - ], + "comparator": [{"name": "--STDTC", "sort_order": "ASC"}], } result = DataframeType( - {"value": partial_date_df, "column_prefix_map": {"--": "SE"}} + {"value": df, "column_prefix_map": {"--": "SE"}} ).target_is_sorted_by(other_value) - assert result.equals( - pd.Series( - [ - False, - False, - True, - False, - True, - True, - ] - ) - ) + assert result.equals(pd.Series([True, False, True, False, False])) @pytest.mark.parametrize( diff --git a/tests/unit/test_check_operators/test_value_set_checks.py b/tests/unit/test_check_operators/test_value_set_checks.py index 459b78a76..638605083 100644 --- a/tests/unit/test_check_operators/test_value_set_checks.py +++ b/tests/unit/test_check_operators/test_value_set_checks.py @@ -2,6 +2,7 @@ import pytest from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset +import pandas as pd @pytest.mark.parametrize( @@ -97,10 +98,42 @@ def test_is_unique_set_with_regex( @pytest.mark.parametrize( "target, comparator, dataset_type, expected_result", [ - ("SESEQ", "USUBJID", PandasDataset, True), - ("UNORDERED", "USUBJID", PandasDataset, False), - ("SESEQ", "USUBJID", DaskDataset, True), - ("UNORDERED", "USUBJID", DaskDataset, False), + ( + "SESEQ", + "USUBJID", + PandasDataset, + pd.Series([True, True, True, True]), + ), + ( + "UNORDERED", + "USUBJID", + PandasDataset, + pd.Series([False, True, False, True]), + ), + ( + "SESEQ", + "USUBJID", + DaskDataset, + pd.Series([True, True, True, True]), + ), + ( + "UNORDERED", + "USUBJID", + DaskDataset, + pd.Series([False, True, False, True]), + ), + ( + "SESEQ", + ["USUBJID"], + PandasDataset, + pd.Series([True, True, True, True]), + ), + ( + "UNORDERED", + ["USUBJID"], + PandasDataset, + pd.Series([False, True, False, True]), + ), ], ) def test_is_ordered_set(target, comparator, dataset_type, expected_result): @@ -109,16 +142,36 @@ def test_is_ordered_set(target, comparator, dataset_type, expected_result): result = DataframeType({"value": df}).is_ordered_set( {"target": target, "comparator": comparator} ) - assert result == expected_result + pd.testing.assert_series_equal(result, expected_result, check_names=False) @pytest.mark.parametrize( "target, comparator, dataset_type, expected_result", [ - ("SESEQ", "USUBJID", PandasDataset, False), - ("UNORDERED", "USUBJID", PandasDataset, True), - ("SESEQ", "USUBJID", DaskDataset, False), - ("UNORDERED", "USUBJID", DaskDataset, True), + ( + "SESEQ", + "USUBJID", + PandasDataset, + pd.Series([False, False, False, False]), + ), + ( + "UNORDERED", + "USUBJID", + PandasDataset, + pd.Series([True, False, True, False]), + ), + ( + "SESEQ", + "USUBJID", + DaskDataset, + pd.Series([False, False, False, False]), + ), + ( + "UNORDERED", + "USUBJID", + DaskDataset, + pd.Series([True, False, True, False]), + ), ], ) def test_is_not_ordered_set(target, comparator, dataset_type, expected_result): @@ -127,7 +180,42 @@ def test_is_not_ordered_set(target, comparator, dataset_type, expected_result): result = DataframeType({"value": df}).is_not_ordered_set( {"target": target, "comparator": comparator} ) - assert result == expected_result + pd.testing.assert_series_equal(result, expected_result, check_names=False) + + +def test_is_ordered_set_multiple_comparators(): + data = { + "ARMCD": [ + "PLACEBO", + "PLACEBO", + "ZAN_LOW", + "ZAN_LOW", + "ZAN_HIGH", + "ZAN_HIGH", + "ZAN_HIGH", + "ZAN_HIGH", + ], + "ARM": [ + "Placebo", + "Placebo", + "Zanomaline Low Dose", + "Zanomaline Low Dose", + "Zanomaline High Dose", + "Zanomaline High Dose", + "Zanomaline High Dose", + "Zanomaline High Dose", + ], + "TAETORD": [1, 2, 1, 2, 1, 2, 3, 2], + } + df = PandasDataset.from_dict(data) + result = DataframeType({"value": df}).is_ordered_set( + {"target": "TAETORD", "comparator": ["ARMCD", "ARM"]} + ) + pd.testing.assert_series_equal( + result, + pd.Series([True, True, True, True, True, True, False, False]), + check_names=False, + ) @pytest.mark.parametrize(