diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py
index 9383c7703..e6bc63ce8 100644
--- a/cdisc_rules_engine/check_operators/helpers.py
+++ b/cdisc_rules_engine/check_operators/helpers.py
@@ -5,19 +5,56 @@
 import pytz
 from cdisc_rules_engine.services import logger
 import traceback
-
+from functools import lru_cache
+from enum import IntEnum
+import operator
 
 # Date regex pattern for validation
 date_regex = re.compile(
-    r"^((-?[0-9]{4}|-)(-(1[0-2]|0[1-9]|-)(-(3[01]|0[1-9]|[12][0-9]|-)"
-    r"(T(2[0-3]|[01][0-9]|-)(:([0-5][0-9]|-)((:([0-5][0-9]|-))?(\.[0-9]+)?"
-    r"((Z|[+-](:2[0-3]|[01][0-9]):[0-5][0-9]))?)?)?)?)?)?)(\/((-?[0-9]{4}|-)"
-    r"(-(1[0-2]|0[1-9]|-)(-(3[01]|0[1-9]|[12][0-9]|-)(T(2[0-3]|[01][0-9]|-)"
-    r"(:([0-5][0-9]|-)((:([0-5][0-9]|-))?(\.[0-9]+)?((Z|[+-](:2[0-3]|[01][0-9])"
-    r":[0-5][0-9]))?)?)?)?)?)?))?$"
+    r"^("
+    r"(?P<year>-?[0-9]{4}|-)(-{1,2}(?P<month>1[0-2]|0[1-9]|-))?"
+    r"(-{1,2}(?P<day>3[01]|0[1-9]|[12][0-9]|-))?"
+    r"(T(?P<hour>2[0-3]|[01][0-9]|-)(:((?P<minute>[0-5][0-9]|-))"
+    r"(:((?P<second>[0-5][0-9]|-))?(\.(?P<microsecond>[0-9]+))?)?)?"
+    r"(?P<timezone>Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?)?"
+    r"(\/"
+    r"(?P<interval_year>-?[0-9]{4}|-)(-{1,2}(?P<interval_month>1[0-2]|0[1-9]|-))?"
+    r"(-{1,2}(?P<interval_day>3[01]|0[1-9]|[12][0-9]|-))?"
+    r"(T(?P<interval_hour>2[0-3]|[01][0-9]|-)(:((?P<interval_minute>[0-5][0-9]|-))"
+    r"(:((?P<interval_second>[0-5][0-9]|-))?(\.(?P<interval_microsecond>[0-9]+))?)?)?"
+    r"(?P<interval_timezone>Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?)?"
+    r")?"
+    r"|"
+    r"-{4,8}T(?P<timeonly_hour>2[0-3]|[01][0-9]|-)(:((?P<timeonly_minute>[0-5][0-9]|-))"
+    r"(:((?P<timeonly_second>[0-5][0-9]|-))?(\.(?P<timeonly_microsecond>[0-9]+))?)?)?"
+    r"(?P<timeonly_timezone>Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?"
+ r")$" ) +class DatePrecision(IntEnum): + year = 0 + month = 1 + day = 2 + hour = 3 + minute = 4 + second = 5 + microsecond = 6 + + @property + def default_value(self): + default_values = { + DatePrecision.year: 1970, + DatePrecision.month: 1, + DatePrecision.day: 1, + DatePrecision.hour: 0, + DatePrecision.minute: 0, + DatePrecision.second: 0, + DatePrecision.microsecond: 0, + } + return default_values[self] + + def is_valid_date(date_string: str) -> bool: if date_string is None: return False @@ -56,20 +93,16 @@ def is_valid_duration(duration: str, negative) -> bool: match = re.match(pattern, duration) if not match: return False - years, months, days, time_designator, hours, minutes, seconds, weeks = ( match.groups() ) - if time_designator and not any([hours, minutes, seconds]): return False - components = [ c for c in [years, months, weeks, days, hours, minutes, seconds] if c is not None ] - # Check if decimal is only in the smallest unit decimal_found = False for i, component in enumerate(components): @@ -77,70 +110,143 @@ def is_valid_duration(duration: str, negative) -> bool: if decimal_found or i != len(components) - 1: return False decimal_found = True - return True -def get_year(date_string: str): - timestamp = get_date(date_string) - return timestamp.year +def _empty_datetime_components(): + return {precision: None for precision in DatePrecision} -def get_month(date_string: str): - timestamp = get_date(date_string) - return timestamp.month +def _extract_datetime_components(date_str: str) -> dict: + """Extract datetime components using regex pattern matching.""" + if not date_str or not isinstance(date_str, str): + return _empty_datetime_components() + match = date_regex.match(date_str) + if not match: + return _empty_datetime_components() + + matches = { + DatePrecision.year: match.group("year") or match.group("interval_year"), + DatePrecision.month: match.group("month") or match.group("interval_month"), + DatePrecision.day: match.group("day") or 
match.group("interval_day"),
+        DatePrecision.hour: (
+            match.group("hour")
+            or match.group("interval_hour")
+            or match.group("timeonly_hour")
+        ),
+        DatePrecision.minute: (
+            match.group("minute")
+            or match.group("interval_minute")
+            or match.group("timeonly_minute")
+        ),
+        DatePrecision.second: (
+            match.group("second")
+            or match.group("interval_second")
+            or match.group("timeonly_second")
+        ),
+        DatePrecision.microsecond: (
+            match.group("microsecond")
+            or match.group("interval_microsecond")
+            or match.group("timeonly_microsecond")
+        ),
+    }
+    components = {
+        precision: None if _check_date_component_missing(component) else component
+        for precision, component in matches.items()
+    }
+    return components
+
+@lru_cache(maxsize=1000)
+def detect_datetime_precision(date_str: str) -> DatePrecision | None:
+    """Return the finest precision reached before the first missing component.
+
+    Components are checked in DatePrecision order, year first. Returns None
+    when date_str does not match date_regex or when the year is missing.
+    """
+    if not _datestring_is_valid(date_str):
+        return None
+    components = _extract_datetime_components(date_str)
+    # Inspect the extracted values: iterating the dict itself yields its
+    # DatePrecision keys, which are never "missing", so the guard never fired.
+    if all(
+        _check_date_component_missing(component)
+        for component in components.values()
+    ):
+        return None
+    return _date_and_time_precision(components)
 
 
-def get_day(date_string: str):
-    timestamp = get_date(date_string)
-    return timestamp.day
+def _datestring_is_valid(date_str: str) -> bool:
+    return bool(date_str and isinstance(date_str, str) and date_regex.match(date_str))
 
 
-def get_hour(date_string: str):
-    timestamp = get_date(date_string)
-    return timestamp.hour
+def _check_date_component_missing(component) -> bool:
+    return component is None or component == "-" or component == ""
 
 
-def get_minute(date_string: str):
-    timestamp = get_date(date_string)
-    return timestamp.minute
+def _get_precision_before(precision: DatePrecision) -> DatePrecision | None:
+    prev_index = precision.value - 1
+    return DatePrecision(prev_index) if prev_index >= 0 else None
 
 
-def get_second(date_string: str):
-    timestamp = get_date(date_string)
-    return timestamp.second
+def _date_and_time_precision(
+    components: dict,
+) -> DatePrecision | None:
+    for precision in DatePrecision:
+
component = components[precision] if precision in components else None
+        if _check_date_component_missing(component):
+            return _get_precision_before(precision)
 
 
-def get_microsecond(date_string: str):
-    timestamp = get_date(date_string)
-    return timestamp.microsecond
+    return DatePrecision.microsecond
+
+
+def get_common_precision(dt1: str, dt2: str) -> DatePrecision | None:
+    p1 = detect_datetime_precision(dt1)
+    p2 = detect_datetime_precision(dt2)
+    if p1 is None or p2 is None:
+        return None
+    min_idx = min(p1.value, p2.value)
+    return DatePrecision(min_idx)
 
 
 def get_date_component(component: str, date_string: str):
-    component_func_map = {
-        "year": get_year,
-        "month": get_month,
-        "day": get_day,
-        "hour": get_hour,
-        "minute": get_minute,
-        "microsecond": get_microsecond,
-        "second": get_second,
-    }
-    component_function = component_func_map.get(component)
-    if component_function:
-        return component_function(date_string)
-    else:
-        return get_date(date_string)
+    date = get_date(date_string)
+    try:
+        return getattr(date, DatePrecision[component].name)
+    except (KeyError, ValueError):
+        return date
+
+
+def _parse_uncertain_date(date_string: str) -> datetime | None:
+    """Build a naive datetime from a partial or uncertain date string.
+
+    Components that are absent or written as "-" fall back to
+    DatePrecision.default_value. Returns None when the collected values do
+    not form a valid datetime.
+    """
+    components = _extract_datetime_components(date_string)
+    component_ints = []
+    for precision in DatePrecision:
+        raw_value = components.get(precision)
+        if raw_value is None:
+            component_ints.append(precision.default_value)
+        elif precision is DatePrecision.microsecond:
+            # The regex captures the digits after the decimal point, so ".5"
+            # means 500000 microseconds: pad or truncate to six digits.
+            component_ints.append(int(raw_value.ljust(6, "0")[:6]))
+        else:
+            component_ints.append(int(raw_value))
+    try:
+        return datetime(*component_ints)
+    except (ValueError, TypeError):
+        return None
 
 
 def get_date(date_string: str):
     """
     Returns a utc timestamp for comparison
     """
-    date = parse(date_string, default=datetime(1970, 1, 1))
+    uncertainty_substrings = ["/", "--", "-:"]
+    has_uncertainty = any(substr in date_string for substr in uncertainty_substrings)
+
+    if has_uncertainty:
+        # NOTE(review): this path localizes the parsed components as UTC and
+        # does not apply any UTC offset present in date_string.
+        uncertain_date = _parse_uncertain_date(date_string)
+        if uncertain_date is not None:
+            utc = pytz.UTC
+            return utc.localize(uncertain_date)
+
+    date = parse(
+        date_string,
+
default=datetime( + *[ + precision.default_value + for precision in list(DatePrecision)[ + DatePrecision.year : DatePrecision.day + 1 + ] + ] + ), + ) utc = pytz.UTC if date.tzinfo is not None and date.tzinfo.utcoffset(date) is not None: - # timezone aware return date.astimezone(utc) else: return utc.localize(date) @@ -185,15 +291,112 @@ def case_insensitive_is_in(value, values): return str(value).lower() in str(values).lower() -def compare_dates(component, target, comparator, operator): +def truncate_datetime_to_precision(date_string: str, precision: DatePrecision): + dt = get_date(date_string) + if precision is None: + return dt + replacements = { + precision_component.name: precision_component.default_value + for precision_component in list(DatePrecision)[precision.value + 1 :] + } + return dt.replace(**replacements) + + +def _dates_are_comparable(target: str, comparator: str) -> bool: if not target or not comparator: - # Comparison should return false if either is empty or None return False - else: - return operator( - get_date_component(component, target), - get_date_component(component, comparator), - ) + return is_valid_date(target) and is_valid_date(comparator) + + +def _has_explicit_component(component) -> bool: + return component not in (None, "auto") + + +def _compare_with_component(component, target, comparator, operator_func): + return operator_func( + get_date_component(component, target), + get_date_component(component, comparator), + ) + + +def _build_precision_context(target: str, comparator: str) -> dict: + return { + "target_precision": detect_datetime_precision(target), + "comparator_precision": detect_datetime_precision(comparator), + "precision": get_common_precision(target, comparator), + } + + +def _truncate_by_precision( + target: str, comparator: str, precision: DatePrecision | None +) -> tuple: + if precision is None: + return get_date(target), get_date(comparator) + return ( + truncate_datetime_to_precision(target, precision), + 
truncate_datetime_to_precision(comparator, precision),
+    )
+
+
+def _compare_with_inferred_precision(
+    operator_func,
+    target: str,
+    comparator: str,
+    truncated_target,
+    truncated_comparator,
+    context: dict,
+):
+    """Compare two dates whose precisions were detected rather than requested.
+
+    Equality and inequality require both inputs to have the same detected
+    precision. Ordering operators compare the values truncated to the common
+    precision; when those tie and the target is more precise than the
+    comparator, the untruncated dates decide.
+    """
+    target_precision = context["target_precision"]
+    comparator_precision = context["comparator_precision"]
+
+    if operator_func is operator.eq:
+        if target_precision != comparator_precision:
+            return False
+        return truncated_target == truncated_comparator
+
+    if operator_func is operator.ne:
+        if target_precision != comparator_precision:
+            return True
+        return truncated_target != truncated_comparator
+
+    result = operator_func(truncated_target, truncated_comparator)
+
+    # Compare against None explicitly: DatePrecision.year has value 0 and is
+    # therefore falsy, which used to skip this tie-break for year-only values.
+    if (
+        truncated_target == truncated_comparator
+        and target_precision is not None
+        and comparator_precision is not None
+        and target_precision.value > comparator_precision.value
+    ):
+        return operator_func(get_date(target), get_date(comparator))
+
+    return result
+
+
+def compare_dates(component, target, comparator, operator_func):
+    """Compare the target and comparator date strings with operator_func.
+
+    Returns False when either value is empty or not a valid date. A component
+    other than None or "auto" compares just that date component; "auto"
+    compares both dates truncated to their common precision; None applies the
+    inferred-precision rules.
+    """
+    if not _dates_are_comparable(target, comparator):
+        return False
+
+    if _has_explicit_component(component):
+        return _compare_with_component(component, target, comparator, operator_func)
+
+    context = _build_precision_context(target, comparator)
+    precision = context["precision"]
+    if precision is None:
+        return False
+
+    truncated_target, truncated_comparator = _truncate_by_precision(
+        target, comparator, precision
+    )
+
+    if component == "auto":
+        return operator_func(truncated_target, truncated_comparator)
+
+    return _compare_with_inferred_precision(
+        operator_func,
+        target,
+        comparator,
+        truncated_target,
+        truncated_comparator,
+        context,
+    )
 
 
 def apply_regex(regex: str, val: str):
diff --git a/resources/schema/Operator.json b/resources/schema/Operator.json
index 1efd47518..459a19607 100644
--- a/resources/schema/Operator.json
+++ b/resources/schema/Operator.json
@@ -545,7 +545,8 @@
           "hour",
           "minute",
           "second",
-          "microsecond"
+          "microsecond",
+          "auto"
         ],
         "type":
"string" }, diff --git a/resources/schema/Operator.md b/resources/schema/Operator.md index 16e0f5a89..b84d0e668 100644 --- a/resources/schema/Operator.md +++ b/resources/schema/Operator.md @@ -450,15 +450,26 @@ Date and time specific operations for comparing dates, validating date completen Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +The `date_component` parameter accepts: `"year"`, `"month"`, `"day"`, `"hour"`, `"minute"`, `"second"`, `"microsecond"`, or `"auto"`. + +When `date_component: "auto"` is used, the operator automatically detects the precision of both dates and compares at the common (less precise) level. + +```yaml +- name: "AESTDTC" + operator: "date_equal_to" + value: "RFSTDTC" + date_component: "auto" +``` + ### date_not_equal_to Complement of `date_equal_to` -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. ### date_greater_than -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. > Year part of BRTHDTC > 2021 @@ -471,7 +482,7 @@ Date comparison. Compare `name` to `value`. Compares partial dates if `date_comp ### date_greater_than_or_equal_to -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. > Year part of BRTHDTC >= 2021 @@ -484,7 +495,7 @@ Date comparison. Compare `name` to `value`. Compares partial dates if `date_comp ### date_less_than -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. 
+Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. > AEENDTC < AESTDTC @@ -514,7 +525,7 @@ Operations: ### date_less_than_or_equal_to -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. > AEENDTC <= AESTDTC diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index b66cde8b3..e2e4e2a16 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -1,9 +1,31 @@ from cdisc_rules_engine.check_operators.dataframe_operators import DataframeType +from cdisc_rules_engine.check_operators.helpers import ( + DatePrecision, + detect_datetime_precision, + is_valid_date, +) import pytest from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset +@pytest.mark.parametrize( + "value,expected_precision", + [ + ("2003-12-15T13:14:17.123", DatePrecision.microsecond), + ("2003-12-15T13:14:17", DatePrecision.second), + ("2003-12-15T13:14", DatePrecision.minute), + ("2003-12-15T13", DatePrecision.hour), + ("2003-12-15", DatePrecision.day), + ("2003-12", DatePrecision.month), + ("2003", DatePrecision.year), + ], +) +def test_detect_datetime_precision_with_truncated_values(value, expected_precision): + assert is_valid_date(value) + assert detect_datetime_precision(value) == expected_precision + + @pytest.mark.parametrize( "data,dataset_type,expected_result", [ @@ -50,6 +72,29 @@ def test_invalid_date(data, dataset_type, expected_result): assert result.equals(df.convert_to_series(expected_result)) +@pytest.mark.parametrize( + "value,expected_precision", + [ + 
("2003-12-15T13:15:17", DatePrecision.second), + ("2003-12-15T13:15", DatePrecision.minute), + ("2003-12-15T-:15", DatePrecision.day), + ("2003-12-15T13:-:17", DatePrecision.hour), + ("2003---15", DatePrecision.year), + ("--12-15", None), + ("-----T07:15", None), + ("-----T07:15:30", None), + ("-----T-:15", None), + ("-----T07:-:30", None), + ("2003-12-15T-:-:17", DatePrecision.day), + ("2003-12--", DatePrecision.month), + ("2003--", DatePrecision.year), + ], +) +def test_detect_datetime_precision_with_uncertain_components(value, expected_precision): + assert is_valid_date(value) + assert detect_datetime_precision(value) == expected_precision + + @pytest.mark.parametrize( "data,comparator,dataset_type,expected_result", [ @@ -223,6 +268,7 @@ def test_date_equal_to_date_components( ) def test_date_less_than(data, comparator, dataset_type, expected_result): df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) result = dataframe_type.date_less_than( {"target": "target", "comparator": comparator} @@ -747,3 +793,103 @@ def test_is_incomplete_date(target, dataset_type, expected_result): .is_incomplete_date({"target": target}) .equals(df.convert_to_series(expected_result)) ) + + +AUTO_PRECISION_CASES = { + "date_equal_to": [ + ("2025-06-25", "2025-06-25T17:22", "auto", True), + ("2025-06-24", "2025-06-25T17:22", "auto", False), + ("2025-06-26", "2025-06-25T17:22", "auto", False), + ("2025-06", "2025-06-25", "auto", True), + ("2025-07", "2025-06-25", "auto", False), + ("2025-05", "2025-06-25", "auto", False), + ("2025", "2025-06-25T17:22:30", "auto", True), + ("2024", "2025-06-25T17:22:30", "auto", False), + ("2026", "2025-06-25T17:22:30", "auto", False), + ("2025-06-25", "2025-06-25", "auto", True), + ("2025-06-24", "2025-06-25", "auto", False), + ("2025-06-26", "2025-06-25", "auto", False), + ("2025-06-25T17:22", "2025-06-25T17:22:30", "auto", True), + ("2025-06-25T17:21", "2025-06-25T17:22:30", "auto", False), + ("2025-06-25T", "2025-06-25", 
"auto", False), + ("2025-06-24T", "2025-06-25", "auto", False), + ("2003---15", "2003-12-15", "auto", True), + ("2003---15", "2003-11-15", "auto", True), + ("2003---15", "2004-12-15", "auto", False), + ("2003-12-15T-:15", "2003-12-15T13:15", "auto", True), + ("2003-12-15T-:15", "2003-12-15T14:15", "auto", True), + ("2003-12-15T-:15", "2003-12-16T13:15", "auto", False), + ("2003-12-15T13:-:17", "2003-12-15T13:30:17", "auto", True), + ("2003-12-15T13:-:17", "2003-12-15T14:30:17", "auto", False), + ], + "date_greater_than": [ + ("2025-06-26", "2025-06-25T17:22", None, True), + ("2025-06-24", "2025-06-25T17:22", None, False), + ("2025-06-25", "2025-06-25T17:22", None, False), + ("2025-07", "2025-06-25", None, True), + ("2025-05", "2025-06-25", None, False), + ("2025-06", "2025-06-25", None, False), + ("2026", "2025-06-25T17:22", None, True), + ("2024", "2025-06-25T17:22", None, False), + ("2025", "2025-06-25T17:22", None, False), + ], + "date_greater_than_or_equal_to": [ + ("2025-06-26", "2025-06-25T17:22", "auto", True), + ("2025-06-24", "2025-06-25T17:22", "auto", False), + ("2025-06-25", "2025-06-25T17:22", "auto", True), + ("2025-07", "2025-06-25", "auto", True), + ("2025-05", "2025-06-25", "auto", False), + ("2025-06", "2025-06-25", "auto", True), + ], + "date_less_than": [ + ("2025-06-24", "2025-06-25T17:22", None, True), + ("2025-06-26", "2025-06-25T17:22", None, False), + ("2025-06-25", "2025-06-25T17:22", None, False), + ("2025-05", "2025-06-25", None, True), + ("2025-07", "2025-06-25", None, False), + ("2025-06", "2025-06-25", None, False), + ("2024", "2025-06-25T17:22", None, True), + ("2026", "2025-06-25T17:22", None, False), + ("2025", "2025-06-25T17:22", None, False), + ], + "date_less_than_or_equal_to": [ + ("2025-06-24", "2025-06-25T17:22", "auto", True), + ("2025-06-26", "2025-06-25T17:22", "auto", False), + ("2025-06-25", "2025-06-25T17:22", "auto", True), + ("2025-05", "2025-06-25", "auto", True), + ("2025-07", "2025-06-25", "auto", False), + 
("2025-06", "2025-06-25", "auto", True), + ], + "date_not_equal_to": [ + ("2025-06-24", "2025-06-25T17:22", "auto", True), + ("2025-06-25", "2025-06-25T17:22", "auto", False), + ("2025-06-26", "2025-06-25T17:22", "auto", True), + ("2025-05", "2025-06-25", "auto", True), + ("2025-06", "2025-06-25", "auto", False), + ("2025-07", "2025-06-25", "auto", True), + ], +} + +AUTO_PRECISION_PARAMS = [ + (operator_name, target, comparator, date_component, expected_result) + for operator_name, scenarios in AUTO_PRECISION_CASES.items() + for target, comparator, date_component, expected_result in scenarios +] + + +@pytest.mark.parametrize( + "operator_name,target,comparator,date_component,expected_result", + AUTO_PRECISION_PARAMS, +) +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_auto_precision_operators( + operator_name, target, comparator, date_component, expected_result, dataset_type +): + df = dataset_type.from_dict({"target": [target]}) + dataframe_type = DataframeType({"value": df}) + operator_method = getattr(dataframe_type, operator_name) + params = {"target": "target", "comparator": comparator} + if date_component is not None: + params["date_component"] = date_component + result = operator_method(params) + assert result.equals(df.convert_to_series([expected_result]))