From d8614fed120cd94171731f4377533f34f06d58e6 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 21 Oct 2025 09:12:57 -0400 Subject: [PATCH 01/19] 1372-add-'auto'-precision-to-compare-datetimes-at=common-granularity --- README.md | 47 ++++ cdisc_rules_engine/check_operators/helpers.py | 109 +++++++++ resources/schema/Operator.json | 3 +- resources/schema/Operator.md | 29 ++- .../test_date_comparison_checks.py | 214 ++++++++++++++++++ 5 files changed, 396 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 948654624..86c2dbd95 100644 --- a/README.md +++ b/README.md @@ -266,6 +266,53 @@ The possible rule run statuses are: - `SUCCESS` - The rule ran and data was validated against the rule. May or may not produce results - `SKIPPED` - The rule was unable to be run. Usually due to missing required data, but could also be cause by rule execution errors. +## Date Comparison with Automatic Precision Detection + +When writing validation rules that compare dates or datetimes with different precision levels, you can use the `date_component: "auto"` parameter to automatically compare at the common precision level. This feature is particularly useful in clinical trial data validation where date fields may have varying levels of precision. + +### How It Works + +The system automatically detects the precision of both dates being compared (year, month, day, hour, minute, second, or microsecond) and performs the comparison at the less precise (common) level. 
+ +### Example Usage + +```yaml +Check: + all: + - name: "AESTDTC" + operator: "date_greater_than_or_equal_to" + value: "RFSTDTC" + date_component: "auto" +``` + +### Common Scenarios + +- **Date vs Datetime**: Comparing `RFSTDTC` (date only, e.g., "2025-06-25") with `AESTDTC` (datetime, e.g., "2025-06-25T17:22") → compared at day precision +- **Partial Dates**: Comparing `"2025-06"` (year-month) with `"2025-06-25"` (complete date) → compared at month precision +- **Mixed Precision Data**: Comparing `"2025"` (year only) with `"2025-06-25T17:22:30"` (full datetime) → compared at year precision + +### When to Use + +This feature is useful when: + +- Comparing date-only fields (like `RFSTDTC`) with datetime fields (like `AESTDTC`) +- Working with partial dates where precision varies (year-only or year-month formats) +- Handling data with varying precision across different records +- Dealing with CDISC uncertainty markers (e.g., `"2025-06--"` for unknown day) + +### Supported Operators + +All date comparison operators support the `auto` precision parameter: + +- `date_equal_to` +- `date_not_equal_to` +- `date_greater_than` +- `date_greater_than_or_equal_to` +- `date_less_than` +- `date_less_than_or_equal_to` + +For more details on date operators, see the [Operator documentation](resources/schema/Operator.md#date). + # Additional Core Commands **- update-cache** - update locally stored cache data (Requires an environment variable - `CDISC_LIBRARY_API_KEY`) This is stored in the .env folder in the root directory, the API key does not need quotations around it. 
diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 515dc2fee..928db5b34 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -116,6 +116,108 @@ def get_microsecond(date_string: str): return timestamp.microsecond +def _detect_time_precision(time_part: str) -> str: + """Helper to detect time precision from time component.""" + # Check for fractional seconds + if "." in time_part: + return "microsecond" + + # Count colons to determine time precision + colon_count = time_part.count(":") + if colon_count >= 2: + return "second" + elif colon_count == 1: + return "minute" + else: + return "hour" + + +def detect_date_precision(date_str: str) -> str: + """ + Detect the precision level of an ISO-8601 date string. + + Args: + date_str: ISO-8601 formatted date/datetime string + + Returns: + One of: "year", "month", "day", "hour", "minute", "second", "microsecond" + Returns None if date_str is empty or invalid + + Examples: + "2025" -> "year" + "2025-06" -> "month" + "2025-06-25" -> "day" + "2025-06-25T17:22" -> "minute" + "2025-06--" -> "month" (handles CDISC uncertainty markers) + """ + if not date_str or not isinstance(date_str, str): + return None + + # Handle CDISC uncertainty markers + if "--" in date_str or "-:" in date_str: + date_str = date_str.split("--")[0].split("-:")[0] + if not date_str or date_str.endswith("-"): + date_str = date_str.rstrip("-") + + # Check if datetime (has time component) + if "T" in date_str: + time_part = date_str.split("T")[1] + # Remove timezone (+HH:MM or Z) + time_part = time_part.split("+")[0].split("-")[-1].split("Z")[0] + return _detect_time_precision(time_part) + + # Date only - count dashes to determine precision + date_parts = [p for p in date_str.split("-") if p] + + if len(date_parts) >= 3: + return "day" + elif len(date_parts) == 2: + return "month" + elif len(date_parts) == 1: + return "year" + + return None 
+ + +def get_common_precision(dt1: str, dt2: str) -> str: + """ + Determine the common (less precise) precision level between two date strings. + + Args: + dt1: First date string + dt2: Second date string + + Returns: + The less precise precision level, or None if either date is invalid + + Examples: + ("2025-06-25", "2025-06-25T17:22") -> "day" + ("2025-06", "2025-06-25") -> "month" + ("2025", "2025-06-25T17:22:30") -> "year" + """ + precision_order = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", + ] + + p1 = detect_date_precision(dt1) + p2 = detect_date_precision(dt2) + + if not p1 or not p2: + return None + + # Return the less precise of the two + idx1 = precision_order.index(p1) + idx2 = precision_order.index(p2) + + return precision_order[min(idx1, idx2)] + + def get_date_component(component: str, date_string: str): component_func_map = { "year": get_year, @@ -190,6 +292,13 @@ def compare_dates(component, target, comparator, operator): # Comparison should return false if either is empty or None return False else: + # Handle automatic precision detection + if component == "auto": + component = get_common_precision(target, comparator) + # If precision detection fails, fall back to full comparison + if component is None: + component = None # Will trigger get_date() in get_date_component + return operator( get_date_component(component, target), get_date_component(component, comparator), diff --git a/resources/schema/Operator.json b/resources/schema/Operator.json index f2829a397..8422f7cc1 100644 --- a/resources/schema/Operator.json +++ b/resources/schema/Operator.json @@ -847,7 +847,8 @@ "hour", "minute", "second", - "microsecond" + "microsecond", + "auto" ], "type": "string" }, diff --git a/resources/schema/Operator.md b/resources/schema/Operator.md index 17a1967b3..e7f1a768b 100644 --- a/resources/schema/Operator.md +++ b/resources/schema/Operator.md @@ -399,15 +399,34 @@ Date and time specific operations for comparing dates, 
validating date completen Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +The `date_component` parameter accepts specific precision levels: `"year"`, `"month"`, `"day"`, `"hour"`, `"minute"`, `"second"`, `"microsecond"`, or `"auto"`. + +When `date_component: "auto"` is used, the operator automatically detects the precision of both dates being compared and performs the comparison at the common (less precise) level. This is useful when comparing dates with different precision levels, such as a date-only field with a datetime field. + +> Compare AESTDTC with RFSTDTC at automatically detected common precision + +```yaml +- name: "AESTDTC" + operator: "date_equal_to" + value: "RFSTDTC" + date_component: "auto" +``` + +Examples of auto precision detection: + +- `"2025-06-25"` compared with `"2025-06-25T17:22"` → compared at day precision +- `"2025-06"` compared with `"2025-06-25"` → compared at month precision +- `"2025"` compared with `"2025-06-25T17:22:30"` → compared at year precision + ### date_not_equal_to Complement of `date_equal_to` -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). ### date_greater_than -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). > Year part of BRTHDTC > 2021 @@ -420,7 +439,7 @@ Date comparison. Compare `name` to `value`. Compares partial dates if `date_comp ### date_greater_than_or_equal_to -Date comparison. Compare `name` to `value`. 
Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). > Year part of BRTHDTC >= 2021 @@ -433,7 +452,7 @@ Date comparison. Compare `name` to `value`. Compares partial dates if `date_comp ### date_less_than -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). > AEENDTC < AESTDTC @@ -463,7 +482,7 @@ Operations: ### date_less_than_or_equal_to -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). 
> AEENDTC <= AESTDTC diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index b66cde8b3..5c4bd4f05 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -747,3 +747,217 @@ def test_is_incomplete_date(target, dataset_type, expected_result): .is_incomplete_date({"target": target}) .equals(df.convert_to_series(expected_result)) ) + + +# Tests for automatic precision detection (date_component: "auto") + + +@pytest.mark.parametrize( + "data,comparator,dataset_type,expected_result", + [ + # Date vs datetime - should compare at day level + ( + {"target": ["2025-06-25", "2025-06-24", "2025-06-25", "2025-06-26"]}, + "2025-06-25T17:22", + PandasDataset, + [True, False, True, False], + ), + # Partial date (year-month) vs complete date - should compare at month level + ( + {"target": ["2025-06", "2025-07", "2025-06", "2025-05"]}, + "2025-06-25", + DaskDataset, + [True, False, True, False], + ), + # Year only vs complete date - should compare at year level + ( + {"target": ["2025", "2024", "2025", "2026"]}, + "2025-06-25T17:22:30", + PandasDataset, + [True, False, True, False], + ), + # Both have same precision - should work normally + ( + {"target": ["2025-06-25", "2025-06-24", "2025-06-26", "2025-06-25"]}, + "2025-06-25", + DaskDataset, + [True, False, False, True], + ), + # Datetime vs datetime with different time precision + ( + {"target": ["2025-06-25T17:22", "2025-06-25T17:21", "2025-06-25T17:22"]}, + "2025-06-25T17:22:30", + PandasDataset, + [True, False, True], + ), + ], +) +def test_date_equal_to_auto_precision(data, comparator, dataset_type, expected_result): + df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) + result = dataframe_type.date_equal_to( + {"target": "target", "comparator": comparator, "date_component": "auto"} + ) + assert 
result.equals(df.convert_to_series(expected_result)) + + +@pytest.mark.parametrize( + "data,comparator,dataset_type,expected_result", + [ + # Date vs datetime at day level + ( + {"target": ["2025-06-26", "2025-06-24", "2025-06-25"]}, + "2025-06-25T17:22", + PandasDataset, + [True, False, False], + ), + # Year-month vs complete date at month level + ( + {"target": ["2025-07", "2025-05", "2025-06"]}, + "2025-06-25", + DaskDataset, + [True, False, False], + ), + # Year only vs datetime + ( + {"target": ["2026", "2024", "2025"]}, + "2025-06-25T17:22", + PandasDataset, + [True, False, False], + ), + ], +) +def test_date_greater_than_auto_precision( + data, comparator, dataset_type, expected_result +): + df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) + result = dataframe_type.date_greater_than( + {"target": "target", "comparator": comparator, "date_component": "auto"} + ) + assert result.equals(df.convert_to_series(expected_result)) + + +@pytest.mark.parametrize( + "data,comparator,dataset_type,expected_result", + [ + # Date vs datetime at day level + ( + {"target": ["2025-06-26", "2025-06-24", "2025-06-25"]}, + "2025-06-25T17:22", + DaskDataset, + [True, False, True], + ), + # Year-month vs complete date at month level + ( + {"target": ["2025-07", "2025-05", "2025-06"]}, + "2025-06-25", + PandasDataset, + [True, False, True], + ), + ], +) +def test_date_greater_than_or_equal_to_auto_precision( + data, comparator, dataset_type, expected_result +): + df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) + result = dataframe_type.date_greater_than_or_equal_to( + {"target": "target", "comparator": comparator, "date_component": "auto"} + ) + assert result.equals(df.convert_to_series(expected_result)) + + +@pytest.mark.parametrize( + "data,comparator,dataset_type,expected_result", + [ + # Date vs datetime at day level + ( + {"target": ["2025-06-24", "2025-06-26", "2025-06-25"]}, + "2025-06-25T17:22", + 
PandasDataset, + [True, False, False], + ), + # Year-month vs complete date at month level + ( + {"target": ["2025-05", "2025-07", "2025-06"]}, + "2025-06-25", + DaskDataset, + [True, False, False], + ), + # Year only vs datetime + ( + {"target": ["2024", "2026", "2025"]}, + "2025-06-25T17:22", + PandasDataset, + [True, False, False], + ), + ], +) +def test_date_less_than_auto_precision(data, comparator, dataset_type, expected_result): + df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) + result = dataframe_type.date_less_than( + {"target": "target", "comparator": comparator, "date_component": "auto"} + ) + assert result.equals(df.convert_to_series(expected_result)) + + +@pytest.mark.parametrize( + "data,comparator,dataset_type,expected_result", + [ + # Date vs datetime at day level + ( + {"target": ["2025-06-24", "2025-06-26", "2025-06-25"]}, + "2025-06-25T17:22", + DaskDataset, + [True, False, True], + ), + # Year-month vs complete date at month level + ( + {"target": ["2025-05", "2025-07", "2025-06"]}, + "2025-06-25", + PandasDataset, + [True, False, True], + ), + ], +) +def test_date_less_than_or_equal_to_auto_precision( + data, comparator, dataset_type, expected_result +): + df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) + result = dataframe_type.date_less_than_or_equal_to( + {"target": "target", "comparator": comparator, "date_component": "auto"} + ) + assert result.equals(df.convert_to_series(expected_result)) + + +@pytest.mark.parametrize( + "data,comparator,dataset_type,expected_result", + [ + # Date vs datetime at day level + ( + {"target": ["2025-06-24", "2025-06-25", "2025-06-26"]}, + "2025-06-25T17:22", + PandasDataset, + [True, False, True], + ), + # Year-month vs complete date at month level + ( + {"target": ["2025-05", "2025-06", "2025-07"]}, + "2025-06-25", + DaskDataset, + [True, False, True], + ), + ], +) +def test_date_not_equal_to_auto_precision( + data, comparator, 
dataset_type, expected_result +): + df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) + result = dataframe_type.date_not_equal_to( + {"target": "target", "comparator": comparator, "date_component": "auto"} + ) + assert result.equals(df.convert_to_series(expected_result)) From 17c311cc00fa69a63ade4491a44a7227f29fc67f Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 27 Oct 2025 21:52:38 -0400 Subject: [PATCH 02/19] Add auto precision detection to date comparison operators via date_component parameter --- cdisc_rules_engine/check_operators/helpers.py | 58 +++++-------------- .../test_date_comparison_checks.py | 53 +++++++++-------- 2 files changed, 43 insertions(+), 68 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 928db5b34..464631d0a 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -118,11 +118,9 @@ def get_microsecond(date_string: str): def _detect_time_precision(time_part: str) -> str: """Helper to detect time precision from time component.""" - # Check for fractional seconds if "." in time_part: return "microsecond" - # Count colons to determine time precision colon_count = time_part.count(":") if colon_count >= 2: return "second" @@ -136,37 +134,25 @@ def detect_date_precision(date_str: str) -> str: """ Detect the precision level of an ISO-8601 date string. - Args: - date_str: ISO-8601 formatted date/datetime string - - Returns: - One of: "year", "month", "day", "hour", "minute", "second", "microsecond" - Returns None if date_str is empty or invalid - - Examples: - "2025" -> "year" - "2025-06" -> "month" - "2025-06-25" -> "day" - "2025-06-25T17:22" -> "minute" - "2025-06--" -> "month" (handles CDISC uncertainty markers) + Returns precision level or None if invalid. Handles CDISC uncertainty markers. 
""" if not date_str or not isinstance(date_str, str): return None - # Handle CDISC uncertainty markers if "--" in date_str or "-:" in date_str: date_str = date_str.split("--")[0].split("-:")[0] if not date_str or date_str.endswith("-"): date_str = date_str.rstrip("-") - # Check if datetime (has time component) if "T" in date_str: time_part = date_str.split("T")[1] - # Remove timezone (+HH:MM or Z) + + if not time_part: + return "day" + time_part = time_part.split("+")[0].split("-")[-1].split("Z")[0] return _detect_time_precision(time_part) - # Date only - count dashes to determine precision date_parts = [p for p in date_str.split("-") if p] if len(date_parts) >= 3: @@ -182,18 +168,6 @@ def detect_date_precision(date_str: str) -> str: def get_common_precision(dt1: str, dt2: str) -> str: """ Determine the common (less precise) precision level between two date strings. - - Args: - dt1: First date string - dt2: Second date string - - Returns: - The less precise precision level, or None if either date is invalid - - Examples: - ("2025-06-25", "2025-06-25T17:22") -> "day" - ("2025-06", "2025-06-25") -> "month" - ("2025", "2025-06-25T17:22:30") -> "year" """ precision_order = [ "year", @@ -211,7 +185,6 @@ def get_common_precision(dt1: str, dt2: str) -> str: if not p1 or not p2: return None - # Return the less precise of the two idx1 = precision_order.index(p1) idx2 = precision_order.index(p2) @@ -289,20 +262,15 @@ def case_insensitive_is_in(value, values): def compare_dates(component, target, comparator, operator): if not target or not comparator: - # Comparison should return false if either is empty or None return False - else: - # Handle automatic precision detection - if component == "auto": - component = get_common_precision(target, comparator) - # If precision detection fails, fall back to full comparison - if component is None: - component = None # Will trigger get_date() in get_date_component - - return operator( - get_date_component(component, target), - 
get_date_component(component, comparator), - ) + + if component == "auto": + component = get_common_precision(target, comparator) + + return operator( + get_date_component(component, target), + get_date_component(component, comparator), + ) def apply_regex(regex: str, val: str): diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index 5c4bd4f05..bbd2ec21f 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -757,43 +757,50 @@ def test_is_incomplete_date(target, dataset_type, expected_result): [ # Date vs datetime - should compare at day level ( - {"target": ["2025-06-25", "2025-06-24", "2025-06-25", "2025-06-26"]}, + ["2025-06-25", "2025-06-24", "2025-06-25", "2025-06-26"], "2025-06-25T17:22", PandasDataset, [True, False, True, False], ), # Partial date (year-month) vs complete date - should compare at month level ( - {"target": ["2025-06", "2025-07", "2025-06", "2025-05"]}, + ["2025-06", "2025-07", "2025-06", "2025-05"], "2025-06-25", DaskDataset, [True, False, True, False], ), # Year only vs complete date - should compare at year level ( - {"target": ["2025", "2024", "2025", "2026"]}, + ["2025", "2024", "2025", "2026"], "2025-06-25T17:22:30", PandasDataset, [True, False, True, False], ), # Both have same precision - should work normally ( - {"target": ["2025-06-25", "2025-06-24", "2025-06-26", "2025-06-25"]}, + ["2025-06-25", "2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25", DaskDataset, [True, False, False, True], ), # Datetime vs datetime with different time precision ( - {"target": ["2025-06-25T17:22", "2025-06-25T17:21", "2025-06-25T17:22"]}, + ["2025-06-25T17:22", "2025-06-25T17:21", "2025-06-25T17:22"], "2025-06-25T17:22:30", PandasDataset, [True, False, True], ), + # Empty time component edge case + ( + ["2025-06-25T", "2025-06-24T", "2025-06-25T"], + "2025-06-25", + 
PandasDataset, + [True, False, True], + ), ], ) def test_date_equal_to_auto_precision(data, comparator, dataset_type, expected_result): - df = dataset_type.from_dict(data) + df = dataset_type.from_dict({"target": data}) dataframe_type = DataframeType({"value": df}) result = dataframe_type.date_equal_to( {"target": "target", "comparator": comparator, "date_component": "auto"} @@ -806,21 +813,21 @@ def test_date_equal_to_auto_precision(data, comparator, dataset_type, expected_r [ # Date vs datetime at day level ( - {"target": ["2025-06-26", "2025-06-24", "2025-06-25"]}, + ["2025-06-26", "2025-06-24", "2025-06-25"], "2025-06-25T17:22", PandasDataset, [True, False, False], ), # Year-month vs complete date at month level ( - {"target": ["2025-07", "2025-05", "2025-06"]}, + ["2025-07", "2025-05", "2025-06"], "2025-06-25", DaskDataset, [True, False, False], ), # Year only vs datetime ( - {"target": ["2026", "2024", "2025"]}, + ["2026", "2024", "2025"], "2025-06-25T17:22", PandasDataset, [True, False, False], @@ -830,7 +837,7 @@ def test_date_equal_to_auto_precision(data, comparator, dataset_type, expected_r def test_date_greater_than_auto_precision( data, comparator, dataset_type, expected_result ): - df = dataset_type.from_dict(data) + df = dataset_type.from_dict({"target": data}) dataframe_type = DataframeType({"value": df}) result = dataframe_type.date_greater_than( {"target": "target", "comparator": comparator, "date_component": "auto"} @@ -843,14 +850,14 @@ def test_date_greater_than_auto_precision( [ # Date vs datetime at day level ( - {"target": ["2025-06-26", "2025-06-24", "2025-06-25"]}, + ["2025-06-26", "2025-06-24", "2025-06-25"], "2025-06-25T17:22", DaskDataset, [True, False, True], ), # Year-month vs complete date at month level ( - {"target": ["2025-07", "2025-05", "2025-06"]}, + ["2025-07", "2025-05", "2025-06"], "2025-06-25", PandasDataset, [True, False, True], @@ -860,7 +867,7 @@ def test_date_greater_than_auto_precision( def 
test_date_greater_than_or_equal_to_auto_precision( data, comparator, dataset_type, expected_result ): - df = dataset_type.from_dict(data) + df = dataset_type.from_dict({"target": data}) dataframe_type = DataframeType({"value": df}) result = dataframe_type.date_greater_than_or_equal_to( {"target": "target", "comparator": comparator, "date_component": "auto"} @@ -873,21 +880,21 @@ def test_date_greater_than_or_equal_to_auto_precision( [ # Date vs datetime at day level ( - {"target": ["2025-06-24", "2025-06-26", "2025-06-25"]}, + ["2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25T17:22", PandasDataset, [True, False, False], ), # Year-month vs complete date at month level ( - {"target": ["2025-05", "2025-07", "2025-06"]}, + ["2025-05", "2025-07", "2025-06"], "2025-06-25", DaskDataset, [True, False, False], ), # Year only vs datetime ( - {"target": ["2024", "2026", "2025"]}, + ["2024", "2026", "2025"], "2025-06-25T17:22", PandasDataset, [True, False, False], @@ -895,7 +902,7 @@ def test_date_greater_than_or_equal_to_auto_precision( ], ) def test_date_less_than_auto_precision(data, comparator, dataset_type, expected_result): - df = dataset_type.from_dict(data) + df = dataset_type.from_dict({"target": data}) dataframe_type = DataframeType({"value": df}) result = dataframe_type.date_less_than( {"target": "target", "comparator": comparator, "date_component": "auto"} @@ -908,14 +915,14 @@ def test_date_less_than_auto_precision(data, comparator, dataset_type, expected_ [ # Date vs datetime at day level ( - {"target": ["2025-06-24", "2025-06-26", "2025-06-25"]}, + ["2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25T17:22", DaskDataset, [True, False, True], ), # Year-month vs complete date at month level ( - {"target": ["2025-05", "2025-07", "2025-06"]}, + ["2025-05", "2025-07", "2025-06"], "2025-06-25", PandasDataset, [True, False, True], @@ -925,7 +932,7 @@ def test_date_less_than_auto_precision(data, comparator, dataset_type, expected_ def 
test_date_less_than_or_equal_to_auto_precision( data, comparator, dataset_type, expected_result ): - df = dataset_type.from_dict(data) + df = dataset_type.from_dict({"target": data}) dataframe_type = DataframeType({"value": df}) result = dataframe_type.date_less_than_or_equal_to( {"target": "target", "comparator": comparator, "date_component": "auto"} @@ -938,14 +945,14 @@ def test_date_less_than_or_equal_to_auto_precision( [ # Date vs datetime at day level ( - {"target": ["2025-06-24", "2025-06-25", "2025-06-26"]}, + ["2025-06-24", "2025-06-25", "2025-06-26"], "2025-06-25T17:22", PandasDataset, [True, False, True], ), # Year-month vs complete date at month level ( - {"target": ["2025-05", "2025-06", "2025-07"]}, + ["2025-05", "2025-06", "2025-07"], "2025-06-25", DaskDataset, [True, False, True], @@ -955,7 +962,7 @@ def test_date_less_than_or_equal_to_auto_precision( def test_date_not_equal_to_auto_precision( data, comparator, dataset_type, expected_result ): - df = dataset_type.from_dict(data) + df = dataset_type.from_dict({"target": data}) dataframe_type = DataframeType({"value": df}) result = dataframe_type.date_not_equal_to( {"target": "target", "comparator": comparator, "date_component": "auto"} From 419dd77b7ebfda7a4e2361467200143ff765fb72 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 27 Oct 2025 22:09:02 -0400 Subject: [PATCH 03/19] minor cleanup --- README.md | 46 +++---------------- cdisc_rules_engine/check_operators/helpers.py | 4 -- resources/schema/Operator.md | 22 +++------ .../test_date_comparison_checks.py | 24 ++-------- 4 files changed, 16 insertions(+), 80 deletions(-) diff --git a/README.md b/README.md index 86c2dbd95..0e43bdc82 100644 --- a/README.md +++ b/README.md @@ -268,50 +268,16 @@ The possible rule run statuses are: ## Date Comparison with Automatic Precision Detection -When writing validation rules that compare dates or datetimes with different precision levels, you can use the `date_component: "auto"` parameter to automatically 
compare at the common precision level. This feature is particularly useful in clinical trial data validation where date fields may have varying levels of precision. - -### How It Works - -The system automatically detects the precision of both dates being compared (year, month, day, hour, minute, second, or microsecond) and performs the comparison at the less precise (common) level. - -### Example Usage +Use `date_component: "auto"` to automatically compare dates at their common precision level. ```yaml -Check: - all: - - name: "AESTDTC" - operator: "date_greater_than_or_equal_to" - value: "RFSTDTC" - date_component: "auto" +- name: "AESTDTC" + operator: "date_greater_than_or_equal_to" + value: "RFSTDTC" + date_component: "auto" ``` -### Common Scenarios - -- **Date vs Datetime**: Comparing `RFSTDTC` (date only, e.g., "2025-06-25") with `AESTDTC` (datetime, e.g., "2025-06-25T17:22") → compared at day precision -- **Partial Dates**: Comparing `"2025-06"` (year-month) with `"2025-06-25"` (complete date) → compared at month precision -- **Mixed Precision Data**: Comparing `"2025"` (year only) with `"2025-06-25T17:22:30"` (full datetime) → compared at year precision - -### When to Use - -This feature is useful when: - -- Comparing date-only fields (like `RFSTDTC`) with datetime fields (like `AESTDTC`) -- Working with partial dates where precision varies (year-only or year-month formats) -- Handling data with varying precision across different records -- Dealing with CDISC uncertainty markers (e.g., `"2025-06--"` for unknown day) - -### Supported Operators - -All date comparison operators support the `auto` precision parameter: - -- `date_equal_to` -- `date_not_equal_to` -- `date_greater_than` -- `date_greater_than_or_equal_to` -- `date_less_than` -- `date_less_than_or_equal_to` - -For more details on date operators, see the [Operator documentation](resources/schema/Operator.md#date). 
+This compares `"2025-06-25"` with `"2025-06-25T17:22"` at day precision, `"2025-06"` with `"2025-06-25"` at month precision, etc. # Additional Core Commands diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 464631d0a..81fe227c9 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -26,8 +26,6 @@ def is_valid_date(date_string: str) -> bool: except Exception as e: uncertainty_substrings = ["/", "--", "-:"] if any([substr in date_string for substr in uncertainty_substrings]): - # date_string contains uncertainty - # will not parse with isoparse return date_regex.match(date_string) is not None else: logger.error( @@ -70,7 +68,6 @@ def is_valid_duration(duration: str, negative) -> bool: if c is not None ] - # Check if decimal is only in the smallest unit decimal_found = False for i, component in enumerate(components): if "." in component or "," in component: @@ -215,7 +212,6 @@ def get_date(date_string: str): date = parse(date_string, default=datetime(1970, 1, 1)) utc = pytz.UTC if date.tzinfo is not None and date.tzinfo.utcoffset(date) is not None: - # timezone aware return date.astimezone(utc) else: return utc.localize(date) diff --git a/resources/schema/Operator.md b/resources/schema/Operator.md index e7f1a768b..3211af3e4 100644 --- a/resources/schema/Operator.md +++ b/resources/schema/Operator.md @@ -399,11 +399,9 @@ Date and time specific operations for comparing dates, validating date completen Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. -The `date_component` parameter accepts specific precision levels: `"year"`, `"month"`, `"day"`, `"hour"`, `"minute"`, `"second"`, `"microsecond"`, or `"auto"`. +The `date_component` parameter accepts: `"year"`, `"month"`, `"day"`, `"hour"`, `"minute"`, `"second"`, `"microsecond"`, or `"auto"`. 
-When `date_component: "auto"` is used, the operator automatically detects the precision of both dates being compared and performs the comparison at the common (less precise) level. This is useful when comparing dates with different precision levels, such as a date-only field with a datetime field. - -> Compare AESTDTC with RFSTDTC at automatically detected common precision +When `date_component: "auto"` is used, the operator automatically detects the precision of both dates and compares at the common (less precise) level. ```yaml - name: "AESTDTC" @@ -412,21 +410,15 @@ When `date_component: "auto"` is used, the operator automatically detects the pr date_component: "auto" ``` -Examples of auto precision detection: - -- `"2025-06-25"` compared with `"2025-06-25T17:22"` → compared at day precision -- `"2025-06"` compared with `"2025-06-25"` → compared at month precision -- `"2025"` compared with `"2025-06-25T17:22:30"` → compared at year precision - ### date_not_equal_to Complement of `date_equal_to` -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. ### date_greater_than -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. > Year part of BRTHDTC > 2021 @@ -439,7 +431,7 @@ Date comparison. Compare `name` to `value`. Compares partial dates if `date_comp ### date_greater_than_or_equal_to -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. 
Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. > Year part of BRTHDTC >= 2021 @@ -452,7 +444,7 @@ Date comparison. Compare `name` to `value`. Compares partial dates if `date_comp ### date_less_than -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. > AEENDTC < AESTDTC @@ -482,7 +474,7 @@ Operations: ### date_less_than_or_equal_to -Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Also supports `date_component: "auto"` for automatic precision detection (see `date_equal_to` for details). +Date comparison. Compare `name` to `value`. Compares partial dates if `date_component` is specified. Supports `date_component: "auto"`. 
> AEENDTC <= AESTDTC diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index bbd2ec21f..0ee39a53e 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -749,48 +749,39 @@ def test_is_incomplete_date(target, dataset_type, expected_result): ) -# Tests for automatic precision detection (date_component: "auto") - - @pytest.mark.parametrize( "data,comparator,dataset_type,expected_result", [ - # Date vs datetime - should compare at day level ( ["2025-06-25", "2025-06-24", "2025-06-25", "2025-06-26"], "2025-06-25T17:22", PandasDataset, [True, False, True, False], ), - # Partial date (year-month) vs complete date - should compare at month level ( ["2025-06", "2025-07", "2025-06", "2025-05"], "2025-06-25", DaskDataset, [True, False, True, False], ), - # Year only vs complete date - should compare at year level ( ["2025", "2024", "2025", "2026"], "2025-06-25T17:22:30", PandasDataset, [True, False, True, False], ), - # Both have same precision - should work normally ( ["2025-06-25", "2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25", DaskDataset, [True, False, False, True], ), - # Datetime vs datetime with different time precision ( ["2025-06-25T17:22", "2025-06-25T17:21", "2025-06-25T17:22"], "2025-06-25T17:22:30", PandasDataset, [True, False, True], ), - # Empty time component edge case ( ["2025-06-25T", "2025-06-24T", "2025-06-25T"], "2025-06-25", @@ -811,21 +802,18 @@ def test_date_equal_to_auto_precision(data, comparator, dataset_type, expected_r @pytest.mark.parametrize( "data,comparator,dataset_type,expected_result", [ - # Date vs datetime at day level ( ["2025-06-26", "2025-06-24", "2025-06-25"], "2025-06-25T17:22", PandasDataset, [True, False, False], ), - # Year-month vs complete date at month level ( ["2025-07", "2025-05", "2025-06"], "2025-06-25", DaskDataset, [True, False, 
False], ), - # Year only vs datetime ( ["2026", "2024", "2025"], "2025-06-25T17:22", @@ -848,14 +836,12 @@ def test_date_greater_than_auto_precision( @pytest.mark.parametrize( "data,comparator,dataset_type,expected_result", [ - # Date vs datetime at day level ( ["2025-06-26", "2025-06-24", "2025-06-25"], "2025-06-25T17:22", DaskDataset, [True, False, True], ), - # Year-month vs complete date at month level ( ["2025-07", "2025-05", "2025-06"], "2025-06-25", @@ -878,21 +864,18 @@ def test_date_greater_than_or_equal_to_auto_precision( @pytest.mark.parametrize( "data,comparator,dataset_type,expected_result", [ - # Date vs datetime at day level ( ["2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25T17:22", PandasDataset, [True, False, False], ), - # Year-month vs complete date at month level ( ["2025-05", "2025-07", "2025-06"], "2025-06-25", DaskDataset, [True, False, False], ), - # Year only vs datetime ( ["2024", "2026", "2025"], "2025-06-25T17:22", @@ -913,14 +896,12 @@ def test_date_less_than_auto_precision(data, comparator, dataset_type, expected_ @pytest.mark.parametrize( "data,comparator,dataset_type,expected_result", [ - # Date vs datetime at day level ( ["2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25T17:22", DaskDataset, [True, False, True], ), - # Year-month vs complete date at month level ( ["2025-05", "2025-07", "2025-06"], "2025-06-25", @@ -943,14 +924,12 @@ def test_date_less_than_or_equal_to_auto_precision( @pytest.mark.parametrize( "data,comparator,dataset_type,expected_result", [ - # Date vs datetime at day level ( ["2025-06-24", "2025-06-25", "2025-06-26"], "2025-06-25T17:22", PandasDataset, [True, False, True], ), - # Year-month vs complete date at month level ( ["2025-05", "2025-06", "2025-07"], "2025-06-25", @@ -968,3 +947,6 @@ def test_date_not_equal_to_auto_precision( {"target": "target", "comparator": comparator, "date_component": "auto"} ) assert result.equals(df.convert_to_series(expected_result)) + + +"" From 
260a30584d7584f3ed193e7e6a6fbfc8664337a4 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 27 Oct 2025 22:33:22 -0400 Subject: [PATCH 04/19] Optimization --- cdisc_rules_engine/check_operators/helpers.py | 70 +++++----- .../test_date_comparison_checks.py | 125 ++++-------------- 2 files changed, 62 insertions(+), 133 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 81fe227c9..714fa33ff 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -5,6 +5,7 @@ import pytz from cdisc_rules_engine.services import logger import traceback +from functools import lru_cache # Date regex pattern for validation @@ -127,15 +128,25 @@ def _detect_time_precision(time_part: str) -> str: return "hour" -def detect_date_precision(date_str: str) -> str: - """ - Detect the precision level of an ISO-8601 date string. +def _detect_date_precision_simple(date_str: str) -> str: + date_parts = [p for p in date_str.split("-") if p] + if len(date_parts) >= 3: + return "day" + elif len(date_parts) == 2: + return "month" + elif len(date_parts) == 1: + return "year" + return None - Returns precision level or None if invalid. Handles CDISC uncertainty markers. 
- """ + +@lru_cache(maxsize=1000) +def detect_date_precision(date_str: str) -> str: if not date_str or not isinstance(date_str, str): return None + if "T" not in date_str and "--" not in date_str and "-:" not in date_str: + return _detect_date_precision_simple(date_str) + if "--" in date_str or "-:" in date_str: date_str = date_str.split("--")[0].split("-:")[0] if not date_str or date_str.endswith("-"): @@ -143,49 +154,44 @@ def detect_date_precision(date_str: str) -> str: if "T" in date_str: time_part = date_str.split("T")[1] - if not time_part: return "day" - time_part = time_part.split("+")[0].split("-")[-1].split("Z")[0] return _detect_time_precision(time_part) - date_parts = [p for p in date_str.split("-") if p] + return _detect_date_precision_simple(date_str) - if len(date_parts) >= 3: - return "day" - elif len(date_parts) == 2: - return "month" - elif len(date_parts) == 1: - return "year" - return None +PRECISION_ORDER = { + "year": 0, + "month": 1, + "day": 2, + "hour": 3, + "minute": 4, + "second": 5, + "microsecond": 6, +} +PRECISION_LEVELS = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", +] -def get_common_precision(dt1: str, dt2: str) -> str: - """ - Determine the common (less precise) precision level between two date strings. 
- """ - precision_order = [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", - ] +def get_common_precision(dt1: str, dt2: str) -> str: p1 = detect_date_precision(dt1) p2 = detect_date_precision(dt2) if not p1 or not p2: return None - idx1 = precision_order.index(p1) - idx2 = precision_order.index(p2) - - return precision_order[min(idx1, idx2)] + min_idx = min(PRECISION_ORDER[p1], PRECISION_ORDER[p2]) + return PRECISION_LEVELS[min_idx] def get_date_component(component: str, date_string: str): diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index 0ee39a53e..ccc96cd5f 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -750,203 +750,126 @@ def test_is_incomplete_date(target, dataset_type, expected_result): @pytest.mark.parametrize( - "data,comparator,dataset_type,expected_result", + "operator_name,data,comparator,expected_result", [ ( + "date_equal_to", ["2025-06-25", "2025-06-24", "2025-06-25", "2025-06-26"], "2025-06-25T17:22", - PandasDataset, [True, False, True, False], ), ( + "date_equal_to", ["2025-06", "2025-07", "2025-06", "2025-05"], "2025-06-25", - DaskDataset, [True, False, True, False], ), ( + "date_equal_to", ["2025", "2024", "2025", "2026"], "2025-06-25T17:22:30", - PandasDataset, [True, False, True, False], ), ( + "date_equal_to", ["2025-06-25", "2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25", - DaskDataset, [True, False, False, True], ), ( + "date_equal_to", ["2025-06-25T17:22", "2025-06-25T17:21", "2025-06-25T17:22"], "2025-06-25T17:22:30", - PandasDataset, [True, False, True], ), ( + "date_equal_to", ["2025-06-25T", "2025-06-24T", "2025-06-25T"], "2025-06-25", - PandasDataset, [True, False, True], ), - ], -) -def test_date_equal_to_auto_precision(data, comparator, dataset_type, expected_result): - df = 
dataset_type.from_dict({"target": data}) - dataframe_type = DataframeType({"value": df}) - result = dataframe_type.date_equal_to( - {"target": "target", "comparator": comparator, "date_component": "auto"} - ) - assert result.equals(df.convert_to_series(expected_result)) - - -@pytest.mark.parametrize( - "data,comparator,dataset_type,expected_result", - [ ( + "date_greater_than", ["2025-06-26", "2025-06-24", "2025-06-25"], "2025-06-25T17:22", - PandasDataset, [True, False, False], ), ( + "date_greater_than", ["2025-07", "2025-05", "2025-06"], "2025-06-25", - DaskDataset, [True, False, False], ), ( + "date_greater_than", ["2026", "2024", "2025"], "2025-06-25T17:22", - PandasDataset, [True, False, False], ), - ], -) -def test_date_greater_than_auto_precision( - data, comparator, dataset_type, expected_result -): - df = dataset_type.from_dict({"target": data}) - dataframe_type = DataframeType({"value": df}) - result = dataframe_type.date_greater_than( - {"target": "target", "comparator": comparator, "date_component": "auto"} - ) - assert result.equals(df.convert_to_series(expected_result)) - - -@pytest.mark.parametrize( - "data,comparator,dataset_type,expected_result", - [ ( + "date_greater_than_or_equal_to", ["2025-06-26", "2025-06-24", "2025-06-25"], "2025-06-25T17:22", - DaskDataset, [True, False, True], ), ( + "date_greater_than_or_equal_to", ["2025-07", "2025-05", "2025-06"], "2025-06-25", - PandasDataset, [True, False, True], ), - ], -) -def test_date_greater_than_or_equal_to_auto_precision( - data, comparator, dataset_type, expected_result -): - df = dataset_type.from_dict({"target": data}) - dataframe_type = DataframeType({"value": df}) - result = dataframe_type.date_greater_than_or_equal_to( - {"target": "target", "comparator": comparator, "date_component": "auto"} - ) - assert result.equals(df.convert_to_series(expected_result)) - - -@pytest.mark.parametrize( - "data,comparator,dataset_type,expected_result", - [ ( + "date_less_than", ["2025-06-24", 
"2025-06-26", "2025-06-25"], "2025-06-25T17:22", - PandasDataset, [True, False, False], ), ( + "date_less_than", ["2025-05", "2025-07", "2025-06"], "2025-06-25", - DaskDataset, [True, False, False], ), ( + "date_less_than", ["2024", "2026", "2025"], "2025-06-25T17:22", - PandasDataset, [True, False, False], ), - ], -) -def test_date_less_than_auto_precision(data, comparator, dataset_type, expected_result): - df = dataset_type.from_dict({"target": data}) - dataframe_type = DataframeType({"value": df}) - result = dataframe_type.date_less_than( - {"target": "target", "comparator": comparator, "date_component": "auto"} - ) - assert result.equals(df.convert_to_series(expected_result)) - - -@pytest.mark.parametrize( - "data,comparator,dataset_type,expected_result", - [ ( + "date_less_than_or_equal_to", ["2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25T17:22", - DaskDataset, [True, False, True], ), ( + "date_less_than_or_equal_to", ["2025-05", "2025-07", "2025-06"], "2025-06-25", - PandasDataset, [True, False, True], ), - ], -) -def test_date_less_than_or_equal_to_auto_precision( - data, comparator, dataset_type, expected_result -): - df = dataset_type.from_dict({"target": data}) - dataframe_type = DataframeType({"value": df}) - result = dataframe_type.date_less_than_or_equal_to( - {"target": "target", "comparator": comparator, "date_component": "auto"} - ) - assert result.equals(df.convert_to_series(expected_result)) - - -@pytest.mark.parametrize( - "data,comparator,dataset_type,expected_result", - [ ( + "date_not_equal_to", ["2025-06-24", "2025-06-25", "2025-06-26"], "2025-06-25T17:22", - PandasDataset, [True, False, True], ), ( + "date_not_equal_to", ["2025-05", "2025-06", "2025-07"], "2025-06-25", - DaskDataset, [True, False, True], ), ], ) -def test_date_not_equal_to_auto_precision( - data, comparator, dataset_type, expected_result +@pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) +def test_auto_precision_operators( + operator_name, data, 
comparator, expected_result, dataset_type ): df = dataset_type.from_dict({"target": data}) dataframe_type = DataframeType({"value": df}) - result = dataframe_type.date_not_equal_to( + operator_method = getattr(dataframe_type, operator_name) + result = operator_method( {"target": "target", "comparator": comparator, "date_component": "auto"} ) assert result.equals(df.convert_to_series(expected_result)) - - -"" From 2940244853e8d9d9c946dfb4f80bfca34e045f4a Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 28 Oct 2025 09:07:37 -0400 Subject: [PATCH 05/19] Unit Test Fix\Update --- cdisc_rules_engine/check_operators/helpers.py | 1 - .../test_date_comparison_checks.py | 34 ++++++++----------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 714fa33ff..fbc2c3c32 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -115,7 +115,6 @@ def get_microsecond(date_string: str): def _detect_time_precision(time_part: str) -> str: - """Helper to detect time precision from time component.""" if "." 
in time_part: return "microsecond" diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index ccc96cd5f..d9fe84c13 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -8,43 +8,39 @@ "data,dataset_type,expected_result", [ ( - {"target": ["2021", "2099", "2022", "2023"]}, + ["2021", "2099", "2022", "2023"], PandasDataset, [False, False, False, False], ), ( - {"target": ["90999", "20999", "2022", "2023"]}, + ["90999", "20999", "2022", "2023"], DaskDataset, [True, True, False, False], ), ( - { - "target": [ - "2022-03-11T092030", - "2022-03-11T09,20,30", - "2022-03-11T09@20@30", - "2022-03-11T09!20:30", - ] - }, + [ + "2022-03-11T092030", + "2022-03-11T09,20,30", + "2022-03-11T09@20@30", + "2022-03-11T09!20:30", + ], PandasDataset, [True, True, True, True], ), ( - { - "target": [ - "1997-07", - "1997-07-16", - "1997-07-16T19:20:30.45+01:00", - "2022-05-08T13:44:66", - ] - }, + [ + "1997-07", + "1997-07-16", + "1997-07-16T19:20:30.45+01:00", + "2022-05-08T13:44:66", + ], DaskDataset, [False, False, False, True], ), ], ) def test_invalid_date(data, dataset_type, expected_result): - df = dataset_type.from_dict(data) + df = dataset_type.from_dict({"target": data}) dataframe_type = DataframeType({"value": df}) result = dataframe_type.invalid_date({"target": "target"}) assert result.equals(df.convert_to_series(expected_result)) From 61dad95cd8d5619bcd0728f20880803302a4d669 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 3 Nov 2025 09:56:34 -0500 Subject: [PATCH 06/19] restore comments --- cdisc_rules_engine/check_operators/helpers.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index a8ac100f5..aa9a9c0e9 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ 
b/cdisc_rules_engine/check_operators/helpers.py @@ -27,6 +27,8 @@ def is_valid_date(date_string: str) -> bool: except Exception as e: uncertainty_substrings = ["/", "--", "-:"] if any([substr in date_string for substr in uncertainty_substrings]): + # date_string contains uncertainty + # will not parse with isoparse return date_regex.match(date_string) is not None else: logger.error( @@ -69,6 +71,7 @@ def is_valid_duration(duration: str, negative) -> bool: if c is not None ] + # Check if decimal is only in the smallest unit decimal_found = False for i, component in enumerate(components): if "." in component or "," in component: @@ -217,6 +220,7 @@ def get_date(date_string: str): date = parse(date_string, default=datetime(1970, 1, 1)) utc = pytz.UTC if date.tzinfo is not None and date.tzinfo.utcoffset(date) is not None: + # timezone aware return date.astimezone(utc) else: return utc.localize(date) From a3d7a1bd3ebca1fc4d69fd0b8d614d684d12dc7b Mon Sep 17 00:00:00 2001 From: Rakesh Date: Mon, 3 Nov 2025 18:17:56 -0500 Subject: [PATCH 07/19] restore test_invalid_date format, move precision constants to top, add test cases with 'T' in data list --- cdisc_rules_engine/check_operators/helpers.py | 41 ++++---- .../test_date_comparison_checks.py | 94 ++++++++++++++++--- 2 files changed, 99 insertions(+), 36 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index aa9a9c0e9..b76fd7e34 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -18,6 +18,26 @@ r":[0-5][0-9]))?)?)?)?)?)?))?$" ) +PRECISION_ORDER = { + "year": 0, + "month": 1, + "day": 2, + "hour": 3, + "minute": 4, + "second": 5, + "microsecond": 6, +} + +PRECISION_LEVELS = [ + "year", + "month", + "day", + "hour", + "minute", + "second", + "microsecond", +] + def is_valid_date(date_string: str) -> bool: if date_string is None: @@ -164,27 +184,6 @@ def detect_date_precision(date_str: str) -> 
str: return _detect_date_precision_simple(date_str) -PRECISION_ORDER = { - "year": 0, - "month": 1, - "day": 2, - "hour": 3, - "minute": 4, - "second": 5, - "microsecond": 6, -} - -PRECISION_LEVELS = [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", -] - - def get_common_precision(dt1: str, dt2: str) -> str: p1 = detect_date_precision(dt1) p2 = detect_date_precision(dt2) diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index d9fe84c13..7813517f8 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -8,39 +8,43 @@ "data,dataset_type,expected_result", [ ( - ["2021", "2099", "2022", "2023"], + {"target": ["2021", "2099", "2022", "2023"]}, PandasDataset, [False, False, False, False], ), ( - ["90999", "20999", "2022", "2023"], + {"target": ["90999", "20999", "2022", "2023"]}, DaskDataset, [True, True, False, False], ), ( - [ - "2022-03-11T092030", - "2022-03-11T09,20,30", - "2022-03-11T09@20@30", - "2022-03-11T09!20:30", - ], + { + "target": [ + "2022-03-11T092030", + "2022-03-11T09,20,30", + "2022-03-11T09@20@30", + "2022-03-11T09!20:30", + ] + }, PandasDataset, [True, True, True, True], ), ( - [ - "1997-07", - "1997-07-16", - "1997-07-16T19:20:30.45+01:00", - "2022-05-08T13:44:66", - ], + { + "target": [ + "1997-07", + "1997-07-16", + "1997-07-16T19:20:30.45+01:00", + "2022-05-08T13:44:66", + ] + }, DaskDataset, [False, False, False, True], ), ], ) def test_invalid_date(data, dataset_type, expected_result): - df = dataset_type.from_dict({"target": data}) + df = dataset_type.from_dict(data) dataframe_type = DataframeType({"value": df}) result = dataframe_type.invalid_date({"target": "target"}) assert result.equals(df.convert_to_series(expected_result)) @@ -754,6 +758,12 @@ def test_is_incomplete_date(target, dataset_type, expected_result): 
"2025-06-25T17:22", [True, False, True, False], ), + ( + "date_equal_to", + ["2025-06-25", "2025-06-24", "2025-06-25T17:22", "2025-06-26"], + "2025-06-25T17:22", + [True, False, True, False], + ), ( "date_equal_to", ["2025-06", "2025-07", "2025-06", "2025-05"], @@ -766,6 +776,12 @@ def test_is_incomplete_date(target, dataset_type, expected_result): "2025-06-25T17:22:30", [True, False, True, False], ), + ( + "date_equal_to", + ["2025", "2024", "2025-06-25T17:22:30", "2026"], + "2025-06-25T17:22:30", + [True, False, True, False], + ), ( "date_equal_to", ["2025-06-25", "2025-06-24", "2025-06-26", "2025-06-25"], @@ -778,6 +794,12 @@ def test_is_incomplete_date(target, dataset_type, expected_result): "2025-06-25T17:22:30", [True, False, True], ), + ( + "date_equal_to", + ["2025-06-25T17:22", "2025-06-25T17:21", "2025-06-25T17:22:30"], + "2025-06-25T17:22:30", + [True, False, True], + ), ( "date_equal_to", ["2025-06-25T", "2025-06-24T", "2025-06-25T"], @@ -790,6 +812,12 @@ def test_is_incomplete_date(target, dataset_type, expected_result): "2025-06-25T17:22", [True, False, False], ), + ( + "date_greater_than", + ["2025-06-26", "2025-06-24", "2025-06-25T17:22"], + "2025-06-25T17:22", + [True, False, False], + ), ( "date_greater_than", ["2025-07", "2025-05", "2025-06"], @@ -802,12 +830,24 @@ def test_is_incomplete_date(target, dataset_type, expected_result): "2025-06-25T17:22", [True, False, False], ), + ( + "date_greater_than", + ["2026", "2024", "2025-06-25T17:22"], + "2025-06-25T17:22", + [True, False, False], + ), ( "date_greater_than_or_equal_to", ["2025-06-26", "2025-06-24", "2025-06-25"], "2025-06-25T17:22", [True, False, True], ), + ( + "date_greater_than_or_equal_to", + ["2025-06-26", "2025-06-24", "2025-06-25T17:22"], + "2025-06-25T17:22", + [True, False, True], + ), ( "date_greater_than_or_equal_to", ["2025-07", "2025-05", "2025-06"], @@ -820,6 +860,12 @@ def test_is_incomplete_date(target, dataset_type, expected_result): "2025-06-25T17:22", [True, False, 
False], ), + ( + "date_less_than", + ["2025-06-24", "2025-06-26", "2025-06-25T17:22"], + "2025-06-25T17:22", + [True, False, False], + ), ( "date_less_than", ["2025-05", "2025-07", "2025-06"], @@ -832,12 +878,24 @@ def test_is_incomplete_date(target, dataset_type, expected_result): "2025-06-25T17:22", [True, False, False], ), + ( + "date_less_than", + ["2024", "2026", "2025-06-25T17:22"], + "2025-06-25T17:22", + [True, False, False], + ), ( "date_less_than_or_equal_to", ["2025-06-24", "2025-06-26", "2025-06-25"], "2025-06-25T17:22", [True, False, True], ), + ( + "date_less_than_or_equal_to", + ["2025-06-24", "2025-06-26", "2025-06-25T17:22"], + "2025-06-25T17:22", + [True, False, True], + ), ( "date_less_than_or_equal_to", ["2025-05", "2025-07", "2025-06"], @@ -850,6 +908,12 @@ def test_is_incomplete_date(target, dataset_type, expected_result): "2025-06-25T17:22", [True, False, True], ), + ( + "date_not_equal_to", + ["2025-06-24", "2025-06-25T17:22", "2025-06-26"], + "2025-06-25T17:22", + [True, False, True], + ), ( "date_not_equal_to", ["2025-05", "2025-06", "2025-07"], From ada41a2a2b433cad9f481dc1f888b003152e1e33 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Wed, 5 Nov 2025 12:18:22 -0500 Subject: [PATCH 08/19] Restructure auto precision tests into individual cases, remove duplicates --- cdisc_rules_engine/check_operators/helpers.py | 178 +++++++++---- .../test_date_comparison_checks.py | 247 ++++++------------ 2 files changed, 204 insertions(+), 221 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index b76fd7e34..323bdf31d 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -6,6 +6,8 @@ from cdisc_rules_engine.services import logger import traceback from functools import lru_cache +from enum import IntEnum +import operator # Date regex pattern for validation @@ -18,25 +20,19 @@ r":[0-5][0-9]))?)?)?)?)?)?))?$" ) -PRECISION_ORDER = { 
- "year": 0, - "month": 1, - "day": 2, - "hour": 3, - "minute": 4, - "second": 5, - "microsecond": 6, -} - -PRECISION_LEVELS = [ - "year", - "month", - "day", - "hour", - "minute", - "second", - "microsecond", -] + +class DatePrecision(IntEnum): + year = 0 + month = 1 + day = 2 + hour = 3 + minute = 4 + second = 5 + microsecond = 6 + + @classmethod + def get_name_by_index(cls, index: int) -> str: + return list(cls.__members__.keys())[index] def is_valid_date(date_string: str) -> bool: @@ -137,62 +133,124 @@ def get_microsecond(date_string: str): return timestamp.microsecond -def _detect_time_precision(time_part: str) -> str: - if "." in time_part: - return "microsecond" +def _extract_datetime_components(date_str: str) -> dict: + """Extract datetime components using regex pattern matching.""" + if not date_str or not isinstance(date_str, str): + return {} - colon_count = time_part.count(":") - if colon_count >= 2: - return "second" - elif colon_count == 1: - return "minute" - else: - return "hour" + if not date_regex.match(date_str): + return {} + if "--" in date_str or "-:" in date_str: + date_str = date_str.split("--")[0].split("-:")[0] + if not date_str or date_str.endswith("-"): + date_str = date_str.rstrip("-") -def _detect_date_precision_simple(date_str: str) -> str: - date_parts = [p for p in date_str.split("-") if p] - if len(date_parts) >= 3: - return "day" - elif len(date_parts) == 2: - return "month" - elif len(date_parts) == 1: - return "year" - return None + has_time = "T" in date_str + if has_time: + parts = date_str.split("T", 1) + date_part = parts[0] + time_part = ( + parts[1].split("+")[0].split("-")[0].split("Z")[0] + if len(parts) > 1 and parts[1] + else "" + ) + else: + date_part = date_str + time_part = "" + + date_components = date_part.split("-") + year = ( + date_components[0] + if len(date_components) > 0 and date_components[0] and date_components[0] != "-" + else None + ) + month = ( + date_components[1] + if len(date_components) > 1 and 
date_components[1] and date_components[1] != "-" + else None + ) + day = ( + date_components[2] + if len(date_components) > 2 and date_components[2] and date_components[2] != "-" + else None + ) + + hour = None + minute = None + second = None + microsecond = None + + if time_part: + time_components = time_part.split(":") + hour = ( + time_components[0] + if len(time_components) > 0 + and time_components[0] + and time_components[0] != "-" + else None + ) + minute = ( + time_components[1] + if len(time_components) > 1 + and time_components[1] + and time_components[1] != "-" + else None + ) + if len(time_components) > 2: + second_part = time_components[2] + if "." in second_part: + second, microsecond_part = second_part.split(".", 1) + second = second if second and second != "-" else None + microsecond = microsecond_part if microsecond_part else None + else: + second = second_part if second_part and second_part != "-" else None + + return { + "year": year, + "month": month, + "day": day, + "hour": hour, + "minute": minute, + "second": second, + "microsecond": microsecond, + } @lru_cache(maxsize=1000) -def detect_date_precision(date_str: str) -> str: +def detect_datetime_precision(date_str: str) -> str: if not date_str or not isinstance(date_str, str): return None - if "T" not in date_str and "--" not in date_str and "-:" not in date_str: - return _detect_date_precision_simple(date_str) + components = _extract_datetime_components(date_str) + if not components: + return None - if "--" in date_str or "-:" in date_str: - date_str = date_str.split("--")[0].split("-:")[0] - if not date_str or date_str.endswith("-"): - date_str = date_str.rstrip("-") + precision_names = list(DatePrecision.__members__.keys()) + last_precision = None - if "T" in date_str: - time_part = date_str.split("T")[1] - if not time_part: - return "day" - time_part = time_part.split("+")[0].split("-")[-1].split("Z")[0] - return _detect_time_precision(time_part) + for precision_name in precision_names: + 
if components.get(precision_name) is not None: + last_precision = precision_name + else: + if precision_name == "hour" and components.get("day") is not None: + return "day" + if precision_name == "hour" and components.get("year") is None: + return None + return last_precision if last_precision else precision_name - return _detect_date_precision_simple(date_str) + return "microsecond" def get_common_precision(dt1: str, dt2: str) -> str: - p1 = detect_date_precision(dt1) - p2 = detect_date_precision(dt2) + p1 = detect_datetime_precision(dt1) + p2 = detect_datetime_precision(dt2) if not p1 or not p2: return None - min_idx = min(PRECISION_ORDER[p1], PRECISION_ORDER[p2]) - return PRECISION_LEVELS[min_idx] + min_idx = min(DatePrecision[p1].value, DatePrecision[p2].value) + return DatePrecision.get_name_by_index(min_idx) def get_date_component(component: str, date_string: str): @@ -264,14 +322,18 @@ def case_insensitive_is_in(value, values): return str(value).lower() in str(values).lower() -def compare_dates(component, target, comparator, operator): +def compare_dates(component, target, comparator, operator_func): if not target or not comparator: return False + is_strict_comparison = operator_func in (operator.lt, operator.gt) + if component == "auto": component = get_common_precision(target, comparator) + elif is_strict_comparison and component is None: + component = get_common_precision(target, comparator) - return operator( + return operator_func( get_date_component(component, target), get_date_component(component, comparator), ) diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index 7813517f8..83cfbd985 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -750,186 +750,107 @@ def test_is_incomplete_date(target, dataset_type, expected_result): @pytest.mark.parametrize( - 
"operator_name,data,comparator,expected_result", + "operator_name,target,comparator,date_component,expected_result", [ - ( - "date_equal_to", - ["2025-06-25", "2025-06-24", "2025-06-25", "2025-06-26"], - "2025-06-25T17:22", - [True, False, True, False], - ), - ( - "date_equal_to", - ["2025-06-25", "2025-06-24", "2025-06-25T17:22", "2025-06-26"], - "2025-06-25T17:22", - [True, False, True, False], - ), - ( - "date_equal_to", - ["2025-06", "2025-07", "2025-06", "2025-05"], - "2025-06-25", - [True, False, True, False], - ), - ( - "date_equal_to", - ["2025", "2024", "2025", "2026"], - "2025-06-25T17:22:30", - [True, False, True, False], - ), - ( - "date_equal_to", - ["2025", "2024", "2025-06-25T17:22:30", "2026"], - "2025-06-25T17:22:30", - [True, False, True, False], - ), - ( - "date_equal_to", - ["2025-06-25", "2025-06-24", "2025-06-26", "2025-06-25"], - "2025-06-25", - [True, False, False, True], - ), - ( - "date_equal_to", - ["2025-06-25T17:22", "2025-06-25T17:21", "2025-06-25T17:22"], - "2025-06-25T17:22:30", - [True, False, True], - ), - ( - "date_equal_to", - ["2025-06-25T17:22", "2025-06-25T17:21", "2025-06-25T17:22:30"], - "2025-06-25T17:22:30", - [True, False, True], - ), - ( - "date_equal_to", - ["2025-06-25T", "2025-06-24T", "2025-06-25T"], - "2025-06-25", - [True, False, True], - ), - ( - "date_greater_than", - ["2025-06-26", "2025-06-24", "2025-06-25"], - "2025-06-25T17:22", - [True, False, False], - ), - ( - "date_greater_than", - ["2025-06-26", "2025-06-24", "2025-06-25T17:22"], - "2025-06-25T17:22", - [True, False, False], - ), - ( - "date_greater_than", - ["2025-07", "2025-05", "2025-06"], - "2025-06-25", - [True, False, False], - ), - ( - "date_greater_than", - ["2026", "2024", "2025"], - "2025-06-25T17:22", - [True, False, False], - ), - ( - "date_greater_than", - ["2026", "2024", "2025-06-25T17:22"], - "2025-06-25T17:22", - [True, False, False], - ), + ("date_equal_to", "2025-06-25", "2025-06-25T17:22", "auto", True), + ("date_equal_to", 
"2025-06-24", "2025-06-25T17:22", "auto", False), + ("date_equal_to", "2025-06-26", "2025-06-25T17:22", "auto", False), + ("date_equal_to", "2025-06", "2025-06-25", "auto", True), + ("date_equal_to", "2025-07", "2025-06-25", "auto", False), + ("date_equal_to", "2025-05", "2025-06-25", "auto", False), + ("date_equal_to", "2025", "2025-06-25T17:22:30", "auto", True), + ("date_equal_to", "2024", "2025-06-25T17:22:30", "auto", False), + ("date_equal_to", "2026", "2025-06-25T17:22:30", "auto", False), + ("date_equal_to", "2025-06-25", "2025-06-25", "auto", True), + ("date_equal_to", "2025-06-24", "2025-06-25", "auto", False), + ("date_equal_to", "2025-06-26", "2025-06-25", "auto", False), + ("date_equal_to", "2025-06-25T17:22", "2025-06-25T17:22:30", "auto", True), + ("date_equal_to", "2025-06-25T17:21", "2025-06-25T17:22:30", "auto", False), + ("date_equal_to", "2025-06-25T", "2025-06-25", "auto", True), + ("date_equal_to", "2025-06-24T", "2025-06-25", "auto", False), + ("date_greater_than", "2025-06-26", "2025-06-25T17:22", "auto", True), + ("date_greater_than", "2025-06-26", "2025-06-25T17:22", None, True), + ("date_greater_than", "2025-06-24", "2025-06-25T17:22", "auto", False), + ("date_greater_than", "2025-06-24", "2025-06-25T17:22", None, False), + ("date_greater_than", "2025-06-25", "2025-06-25T17:22", "auto", False), + ("date_greater_than", "2025-06-25", "2025-06-25T17:22", None, False), + ("date_greater_than", "2025-07", "2025-06-25", "auto", True), + ("date_greater_than", "2025-07", "2025-06-25", None, True), + ("date_greater_than", "2025-05", "2025-06-25", "auto", False), + ("date_greater_than", "2025-05", "2025-06-25", None, False), + ("date_greater_than", "2025-06", "2025-06-25", "auto", False), + ("date_greater_than", "2025-06", "2025-06-25", None, False), + ("date_greater_than", "2026", "2025-06-25T17:22", "auto", True), + ("date_greater_than", "2026", "2025-06-25T17:22", None, True), + ("date_greater_than", "2024", "2025-06-25T17:22", "auto", False), + 
("date_greater_than", "2024", "2025-06-25T17:22", None, False), + ("date_greater_than", "2025", "2025-06-25T17:22", "auto", False), + ("date_greater_than", "2025", "2025-06-25T17:22", None, False), ( "date_greater_than_or_equal_to", - ["2025-06-26", "2025-06-24", "2025-06-25"], + "2025-06-26", "2025-06-25T17:22", - [True, False, True], + "auto", + True, ), ( "date_greater_than_or_equal_to", - ["2025-06-26", "2025-06-24", "2025-06-25T17:22"], + "2025-06-24", "2025-06-25T17:22", - [True, False, True], + "auto", + False, ), ( "date_greater_than_or_equal_to", - ["2025-07", "2025-05", "2025-06"], - "2025-06-25", - [True, False, True], - ), - ( - "date_less_than", - ["2025-06-24", "2025-06-26", "2025-06-25"], - "2025-06-25T17:22", - [True, False, False], - ), - ( - "date_less_than", - ["2025-06-24", "2025-06-26", "2025-06-25T17:22"], - "2025-06-25T17:22", - [True, False, False], - ), - ( - "date_less_than", - ["2025-05", "2025-07", "2025-06"], - "2025-06-25", - [True, False, False], - ), - ( - "date_less_than", - ["2024", "2026", "2025"], - "2025-06-25T17:22", - [True, False, False], - ), - ( - "date_less_than", - ["2024", "2026", "2025-06-25T17:22"], - "2025-06-25T17:22", - [True, False, False], - ), - ( - "date_less_than_or_equal_to", - ["2025-06-24", "2025-06-26", "2025-06-25"], - "2025-06-25T17:22", - [True, False, True], - ), - ( - "date_less_than_or_equal_to", - ["2025-06-24", "2025-06-26", "2025-06-25T17:22"], - "2025-06-25T17:22", - [True, False, True], - ), - ( - "date_less_than_or_equal_to", - ["2025-05", "2025-07", "2025-06"], "2025-06-25", - [True, False, True], - ), - ( - "date_not_equal_to", - ["2025-06-24", "2025-06-25", "2025-06-26"], "2025-06-25T17:22", - [True, False, True], - ), - ( - "date_not_equal_to", - ["2025-06-24", "2025-06-25T17:22", "2025-06-26"], - "2025-06-25T17:22", - [True, False, True], - ), - ( - "date_not_equal_to", - ["2025-05", "2025-06", "2025-07"], - "2025-06-25", - [True, False, True], - ), + "auto", + True, + ), + 
("date_greater_than_or_equal_to", "2025-07", "2025-06-25", "auto", True), + ("date_greater_than_or_equal_to", "2025-05", "2025-06-25", "auto", False), + ("date_greater_than_or_equal_to", "2025-06", "2025-06-25", "auto", True), + ("date_less_than", "2025-06-24", "2025-06-25T17:22", "auto", True), + ("date_less_than", "2025-06-24", "2025-06-25T17:22", None, True), + ("date_less_than", "2025-06-26", "2025-06-25T17:22", "auto", False), + ("date_less_than", "2025-06-26", "2025-06-25T17:22", None, False), + ("date_less_than", "2025-06-25", "2025-06-25T17:22", "auto", False), + ("date_less_than", "2025-06-25", "2025-06-25T17:22", None, False), + ("date_less_than", "2025-05", "2025-06-25", "auto", True), + ("date_less_than", "2025-05", "2025-06-25", None, True), + ("date_less_than", "2025-07", "2025-06-25", "auto", False), + ("date_less_than", "2025-07", "2025-06-25", None, False), + ("date_less_than", "2025-06", "2025-06-25", "auto", False), + ("date_less_than", "2025-06", "2025-06-25", None, False), + ("date_less_than", "2024", "2025-06-25T17:22", "auto", True), + ("date_less_than", "2024", "2025-06-25T17:22", None, True), + ("date_less_than", "2026", "2025-06-25T17:22", "auto", False), + ("date_less_than", "2026", "2025-06-25T17:22", None, False), + ("date_less_than", "2025", "2025-06-25T17:22", "auto", False), + ("date_less_than", "2025", "2025-06-25T17:22", None, False), + ("date_less_than_or_equal_to", "2025-06-24", "2025-06-25T17:22", "auto", True), + ("date_less_than_or_equal_to", "2025-06-26", "2025-06-25T17:22", "auto", False), + ("date_less_than_or_equal_to", "2025-06-25", "2025-06-25T17:22", "auto", True), + ("date_less_than_or_equal_to", "2025-05", "2025-06-25", "auto", True), + ("date_less_than_or_equal_to", "2025-07", "2025-06-25", "auto", False), + ("date_less_than_or_equal_to", "2025-06", "2025-06-25", "auto", True), + ("date_not_equal_to", "2025-06-24", "2025-06-25T17:22", "auto", True), + ("date_not_equal_to", "2025-06-25", "2025-06-25T17:22", "auto", 
False), + ("date_not_equal_to", "2025-06-26", "2025-06-25T17:22", "auto", True), + ("date_not_equal_to", "2025-05", "2025-06-25", "auto", True), + ("date_not_equal_to", "2025-06", "2025-06-25", "auto", False), + ("date_not_equal_to", "2025-07", "2025-06-25", "auto", True), ], ) @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) def test_auto_precision_operators( - operator_name, data, comparator, expected_result, dataset_type + operator_name, target, comparator, date_component, expected_result, dataset_type ): - df = dataset_type.from_dict({"target": data}) + df = dataset_type.from_dict({"target": [target]}) dataframe_type = DataframeType({"value": df}) operator_method = getattr(dataframe_type, operator_name) - result = operator_method( - {"target": "target", "comparator": comparator, "date_component": "auto"} - ) - assert result.equals(df.convert_to_series(expected_result)) + params = {"target": "target", "comparator": comparator} + if date_component is not None: + params["date_component"] = date_component + result = operator_method(params) + assert result.equals(df.convert_to_series([expected_result])) From 1bc55050888e0b4aeaa87c7144ce529f21454f5c Mon Sep 17 00:00:00 2001 From: Rakesh Date: Sun, 9 Nov 2025 22:24:11 -0500 Subject: [PATCH 09/19] Implement auto precision datetime comparisons --- README.md | 13 - cdisc_rules_engine/check_operators/helpers.py | 440 ++++++++++++++---- .../test_date_comparison_checks.py | 200 ++++---- 3 files changed, 448 insertions(+), 205 deletions(-) diff --git a/README.md b/README.md index df1706436..b2070e719 100644 --- a/README.md +++ b/README.md @@ -280,19 +280,6 @@ The possible rule run statuses are: - `SUCCESS` - The rule ran and data was validated against the rule. May or may not produce results - `SKIPPED` - The rule was unable to be run. Usually due to missing required data, but could also be cause by rule execution errors. 
-## Date Comparison with Automatic Precision Detection - -Use `date_component: "auto"` to automatically compare dates at their common precision level. - -```yaml -- name: "AESTDTC" - operator: "date_greater_than_or_equal_to" - value: "RFSTDTC" - date_component: "auto" -``` - -This compares `"2025-06-25"` with `"2025-06-25T17:22"` at day precision, `"2025-06"` with `"2025-06-25"` at month precision, etc. - # Additional Core Commands **- update-cache** - update locally stored cache data (Requires an environment variable - `CDISC_LIBRARY_API_KEY`) This is stored in the .env folder in the root directory, the API key does not need quotations around it. diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 323bdf31d..126b39bd2 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -9,15 +9,21 @@ from enum import IntEnum import operator - # Date regex pattern for validation date_regex = re.compile( - r"^((-?[0-9]{4}|-)(-(1[0-2]|0[1-9]|-)(-(3[01]|0[1-9]|[12][0-9]|-)" - r"(T(2[0-3]|[01][0-9]|-)(:([0-5][0-9]|-)((:([0-5][0-9]|-))?(\.[0-9]+)?" - r"((Z|[+-](:2[0-3]|[01][0-9]):[0-5][0-9]))?)?)?)?)?)?)(\/((-?[0-9]{4}|-)" - r"(-(1[0-2]|0[1-9]|-)(-(3[01]|0[1-9]|[12][0-9]|-)(T(2[0-3]|[01][0-9]|-)" - r"(:([0-5][0-9]|-)((:([0-5][0-9]|-))?(\.[0-9]+)?((Z|[+-](:2[0-3]|[01][0-9])" - r":[0-5][0-9]))?)?)?)?)?)?))?$" + r"^(" + r"(-?[0-9]{4}|-)(-{1,2}(1[0-2]|0[1-9]|-))?(-{1,2}(3[01]|0[1-9]|[12][0-9]|-))?" + r"(T(2[0-3]|[01][0-9]|-)(:(([0-5][0-9]|-))(:(([0-5][0-9]|-))?(\.[0-9]+)?)?)?" + r"(Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?)?" + r"(\/" + r"(-?[0-9]{4}|-)(-{1,2}(1[0-2]|0[1-9]|-))?(-{1,2}(3[01]|0[1-9]|[12][0-9]|-))?" + r"(T(2[0-3]|[01][0-9]|-)(:(([0-5][0-9]|-))(:(([0-5][0-9]|-))?(\.[0-9]+)?)?)?" + r"(Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?)?" + r")?" + r"|" + r"-{4,8}T(2[0-3]|[01][0-9]|-)(:(([0-5][0-9]|-))(:(([0-5][0-9]|-))?(\.[0-9]+)?)?)?" 
+ r"(Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?" + r")$" ) @@ -34,6 +40,10 @@ class DatePrecision(IntEnum): def get_name_by_index(cls, index: int) -> str: return list(cls.__members__.keys())[index] + @classmethod + def names(cls) -> list: + return list(cls.__members__.keys()) + def is_valid_date(date_string: str) -> bool: if date_string is None: @@ -73,20 +83,16 @@ def is_valid_duration(duration: str, negative) -> bool: match = re.match(pattern, duration) if not match: return False - years, months, days, time_designator, hours, minutes, seconds, weeks = ( match.groups() ) - if time_designator and not any([hours, minutes, seconds]): return False - components = [ c for c in [years, months, weeks, days, hours, minutes, seconds] if c is not None ] - # Check if decimal is only in the smallest unit decimal_found = False for i, component in enumerate(components): @@ -94,7 +100,6 @@ def is_valid_duration(duration: str, negative) -> bool: if decimal_found or i != len(components) - 1: return False decimal_found = True - return True @@ -133,122 +138,258 @@ def get_microsecond(date_string: str): return timestamp.microsecond +def _empty_datetime_components() -> dict: + return { + "year": None, + "month": None, + "day": None, + "hour": None, + "minute": None, + "second": None, + "microsecond": None, + } + + +def _assign_date_components(components: dict, date_parts: list) -> None: + if not date_parts: + return + keys = ("year", "month", "day") + for index, key in enumerate(keys): + if len(date_parts) > index and date_parts[index] not in (None, "-"): + components[key] = date_parts[index] + + +def _assign_time_components(components: dict, time_part: str) -> None: + if not time_part: + return + tokens = time_part.split(":", 2) + if tokens and tokens[0] not in ("", "-"): + components["hour"] = tokens[0] + if len(tokens) > 1 and tokens[1] not in ("", "-"): + components["minute"] = tokens[1] + if len(tokens) == 3 and tokens[2]: + second_part = tokens[2] + if "." 
in second_part: + second_val, micro_val = second_part.split(".", 1) + if second_val not in ("", "-"): + components["second"] = second_val + if micro_val: + components["microsecond"] = micro_val + elif second_part not in ("", "-"): + components["second"] = second_part + + def _extract_datetime_components(date_str: str) -> dict: """Extract datetime components using regex pattern matching.""" + components = _empty_datetime_components() if not date_str or not isinstance(date_str, str): - return {} - + return components if not date_regex.match(date_str): - return {} + return components + date_parts, time_part, has_time = _parse_datetime_string(date_str) + _assign_date_components(components, date_parts) + if has_time: + _assign_time_components(components, time_part) + return components - if "--" in date_str or "-:" in date_str: - date_str = date_str.split("--")[0].split("-:")[0] - if not date_str or date_str.endswith("-"): - date_str = date_str.rstrip("-") + +def _parse_datetime_string(date_str: str): + if not date_str or not isinstance(date_str, str): + return [], "", False has_time = "T" in date_str + if has_time: - parts = date_str.split("T", 1) - date_part = parts[0] - time_part = ( - parts[1].split("+")[0].split("-")[0].split("Z")[0] - if len(parts) > 1 and parts[1] - else "" - ) + date_part, time_part = date_str.split("T", 1) + time_part = re.sub(r"(Z|[+\-]\d{2}:\d{2})$", "", time_part) else: date_part = date_str time_part = "" - date_components = date_part.split("-") - year = ( - date_components[0] - if len(date_components) > 0 and date_components[0] and date_components[0] != "-" - else None - ) - month = ( - date_components[1] - if len(date_components) > 1 and date_components[1] and date_components[1] != "-" - else None - ) - day = ( - date_components[2] - if len(date_components) > 2 and date_components[2] and date_components[2] != "-" - else None - ) + if not date_part or all(c == "-" for c in date_part): + return ["-", "-", "-"], time_part, has_time - hour = 
None - minute = None - second = None - microsecond = None - - if time_part: - time_components = time_part.split(":") - hour = ( - time_components[0] - if len(time_components) > 0 - and time_components[0] - and time_components[0] != "-" - else None - ) - minute = ( - time_components[1] - if len(time_components) > 1 - and time_components[1] - and time_components[1] != "-" - else None - ) - if len(time_components) > 2: - second_part = time_components[2] - if "." in second_part: - second, microsecond_part = second_part.split(".", 1) - second = second if second and second != "-" else None - microsecond = microsecond_part if microsecond_part else None - else: - second = second_part if second_part and second_part != "-" else None + segments = date_part.split("-") + components = [] + i = 0 - return { - "year": year, - "month": month, - "day": day, - "hour": hour, - "minute": minute, - "second": second, - "microsecond": microsecond, - } + while i < len(segments) and len(components) < 3: + segment = segments[i] + + if segment: + components.append(segment) + i += 1 + else: + empty_start = i + while i < len(segments) and not segments[i]: + i += 1 + + empty_count = i - empty_start + if empty_count >= 2: + components.append("-") + + while len(components) < 3: + components.append("-") + + if len(components) > 3: + components = components[:3] + + return components, time_part, has_time + + +def _check_date_component(date_components, index, precision_name): + if len(date_components) <= index: + if precision_name == "year": + return None + precision_names = DatePrecision.names() + prev_index = precision_names.index(precision_name) - 1 + return precision_names[prev_index] if prev_index >= 0 else None + + component = date_components[index] + + if not component or component == "-": + has_later_component = False + for later_idx in range(index + 1, min(3, len(date_components))): + if later_idx < len(date_components): + later_comp = date_components[later_idx] + if later_comp and later_comp 
!= "-": + has_later_component = True + break + + if has_later_component: + return None + + if precision_name == "year": + return None + precision_names = DatePrecision.names() + prev_index = precision_names.index(precision_name) - 1 + return precision_names[prev_index] if prev_index >= 0 else None + + return None + + +def _check_time_component(time_part, has_time, component_index): + if not has_time or not time_part: + return "day" + time_components = time_part.split(":") + if len(time_components) <= component_index: + precision_names = DatePrecision.names() + prev_index = precision_names.index("hour") + component_index - 1 + return precision_names[prev_index] if prev_index >= 0 else "day" + component = time_components[component_index] + if not component or component == "-": + precision_names = DatePrecision.names() + prev_index = precision_names.index("hour") + component_index - 1 + return precision_names[prev_index] if prev_index >= 0 else "day" + return None + + +def _check_second_component(time_part, has_time): + if not has_time or not time_part: + return "day" + time_components = time_part.split(":") + if len(time_components) <= 2: + return "minute" + second_part = time_components[2] + second = second_part.split(".", 1)[0] if "." in second_part else second_part + if not second or second == "-": + return "minute" + return None + + +def _check_microsecond_component(time_part, has_time): + if not has_time or not time_part: + return "day" + time_components = time_part.split(":") + if len(time_components) <= 2: + return "minute" + second_part = time_components[2] + if "." 
not in second_part: + return "second" + microsecond_part = second_part.split(".", 1)[1] + if not microsecond_part: + return "second" + return None @lru_cache(maxsize=1000) def detect_datetime_precision(date_str: str) -> str: - if not date_str or not isinstance(date_str, str): + if not _datestring_is_valid(date_str): + return None + + date_components, time_part, has_time = _parse_datetime_string(date_str) + + if _is_time_only_precision(date_components, has_time): + return _time_only_precision(time_part) + + return _date_and_time_precision(date_components, time_part, has_time) + + +def _datestring_is_valid(date_str: str) -> bool: + return bool(date_str and isinstance(date_str, str) and date_regex.match(date_str)) + + +def _is_time_only_precision(date_components: list, has_time: bool) -> bool: + return has_time and all(component == "-" for component in date_components) + + +def _time_only_precision(time_part: str) -> str: + if not time_part: return None - components = _extract_datetime_components(date_str) - if not components: + time_components = time_part.split(":") + if not time_components or not time_components[0] or time_components[0] == "-": return None + if len(time_components) <= 1 or not time_components[1] or time_components[1] == "-": + return "hour" + if len(time_components) <= 2: + return "minute" - precision_names = list(DatePrecision.__members__.keys()) - last_precision = None + return _precision_from_second_component(time_components[2]) - for precision_name in precision_names: - if components.get(precision_name) is not None: - last_precision = precision_name - else: - if precision_name == "hour" and components.get("day") is not None: - return "day" - if precision_name == "hour" and components.get("year") is None: - return None - return last_precision if last_precision else precision_name + +def _precision_from_second_component(second_part: str) -> str: + if "." 
in second_part: + second, microsecond = second_part.split(".", 1) + if not second or second == "-": + return "minute" + return "second" if not microsecond else "microsecond" + + if not second_part or second_part == "-": + return "minute" + return "second" + + +def _date_and_time_precision(date_components, time_part, has_time) -> str: + precision_checks = _precision_check_functions(date_components, time_part, has_time) + + for index, precision_name in enumerate(DatePrecision.names()): + result = precision_checks[precision_name](index, precision_name) + if result is not None: + return result return "microsecond" +def _precision_check_functions(date_components, time_part, has_time): + return { + "year": lambda i, name: _check_date_component(date_components, i, name), + "month": lambda i, name: _check_date_component(date_components, i, name), + "day": lambda i, name: _check_date_component(date_components, i, name), + "hour": lambda i, name: _check_time_component(time_part, has_time, 0), + "minute": lambda i, name: _check_time_component(time_part, has_time, 1), + "second": lambda i, name: _check_second_component(time_part, has_time), + "microsecond": lambda i, name: _check_microsecond_component( + time_part, has_time + ), + } + + def get_common_precision(dt1: str, dt2: str) -> str: p1 = detect_datetime_precision(dt1) p2 = detect_datetime_precision(dt2) - if not p1 or not p2: return None - min_idx = min(DatePrecision[p1].value, DatePrecision[p2].value) return DatePrecision.get_name_by_index(min_idx) @@ -322,23 +463,118 @@ def case_insensitive_is_in(value, values): return str(value).lower() in str(values).lower() -def compare_dates(component, target, comparator, operator_func): +def truncate_datetime_to_precision(date_string: str, precision: str): + dt = get_date(date_string) + if precision == "year": + return dt.replace(month=1, day=1, hour=0, minute=0, second=0, microsecond=0) + if precision == "month": + return dt.replace(day=1, hour=0, minute=0, second=0, 
microsecond=0) + if precision == "day": + return dt.replace(hour=0, minute=0, second=0, microsecond=0) + if precision == "hour": + return dt.replace(minute=0, second=0, microsecond=0) + if precision == "minute": + return dt.replace(second=0, microsecond=0) + if precision == "second": + return dt.replace(microsecond=0) + return dt + + +def _dates_are_comparable(target: str, comparator: str) -> bool: if not target or not comparator: return False + return is_valid_date(target) and is_valid_date(comparator) - is_strict_comparison = operator_func in (operator.lt, operator.gt) - if component == "auto": - component = get_common_precision(target, comparator) - elif is_strict_comparison and component is None: - component = get_common_precision(target, comparator) +def _has_explicit_component(component) -> bool: + return component not in (None, "auto") + +def _compare_with_component(component, target, comparator, operator_func): return operator_func( get_date_component(component, target), get_date_component(component, comparator), ) +def _build_precision_context(target: str, comparator: str) -> dict: + return { + "target_precision": detect_datetime_precision(target), + "comparator_precision": detect_datetime_precision(comparator), + "precision": get_common_precision(target, comparator), + } + + +def _truncate_by_precision(target: str, comparator: str, precision: str) -> tuple: + return ( + truncate_datetime_to_precision(target, precision), + truncate_datetime_to_precision(comparator, precision), + ) + + +def _compare_with_inferred_precision( + operator_func, + target: str, + comparator: str, + truncated_target, + truncated_comparator, + context: dict, +): + target_precision = context["target_precision"] + comparator_precision = context["comparator_precision"] + + if operator_func is operator.eq: + if target_precision != comparator_precision: + return False + return truncated_target == truncated_comparator + + if operator_func is operator.ne: + if target_precision != 
comparator_precision: + return True + return truncated_target != truncated_comparator + + result = operator_func(truncated_target, truncated_comparator) + + if truncated_target == truncated_comparator: + if target_precision and comparator_precision: + target_value = DatePrecision[target_precision].value + comparator_value = DatePrecision[comparator_precision].value + if target_value > comparator_value: + return operator_func(get_date(target), get_date(comparator)) + return result + + return result + + +def compare_dates(component, target, comparator, operator_func): + if not _dates_are_comparable(target, comparator): + return False + + if _has_explicit_component(component): + return _compare_with_component(component, target, comparator, operator_func) + + context = _build_precision_context(target, comparator) + precision = context["precision"] + if precision is None: + return False + + truncated_target, truncated_comparator = _truncate_by_precision( + target, comparator, precision + ) + + if component == "auto": + return operator_func(truncated_target, truncated_comparator) + + return _compare_with_inferred_precision( + operator_func, + target, + comparator, + truncated_target, + truncated_comparator, + context, + ) + + def apply_regex(regex: str, val: str): result = re.findall(regex, val) if result: diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index 83cfbd985..7470948ee 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -1,9 +1,30 @@ from cdisc_rules_engine.check_operators.dataframe_operators import DataframeType +from cdisc_rules_engine.check_operators.helpers import ( + detect_datetime_precision, + is_valid_date, +) import pytest from cdisc_rules_engine.models.dataset.dask_dataset import DaskDataset from cdisc_rules_engine.models.dataset.pandas_dataset import PandasDataset 
+@pytest.mark.parametrize( + "value,expected_precision", + [ + ("2003-12-15T13:14:17.123", "microsecond"), + ("2003-12-15T13:14:17", "second"), + ("2003-12-15T13:14", "minute"), + ("2003-12-15T13", "hour"), + ("2003-12-15", "day"), + ("2003-12", "month"), + ("2003", "year"), + ], +) +def test_detect_datetime_precision_with_truncated_values(value, expected_precision): + assert is_valid_date(value) + assert detect_datetime_precision(value) == expected_precision + + @pytest.mark.parametrize( "data,dataset_type,expected_result", [ @@ -50,6 +71,19 @@ def test_invalid_date(data, dataset_type, expected_result): assert result.equals(df.convert_to_series(expected_result)) +@pytest.mark.parametrize( + "value,expected_precision", + [ + ("2003---15", "day"), + ("--12-15", "day"), + ("-----T07:15", "minute"), + ], +) +def test_detect_datetime_precision_with_uncertain_components(value, expected_precision): + assert is_valid_date(value) + assert detect_datetime_precision(value) == expected_precision + + @pytest.mark.parametrize( "data,comparator,dataset_type,expected_result", [ @@ -223,6 +257,7 @@ def test_date_equal_to_date_components( ) def test_date_less_than(data, comparator, dataset_type, expected_result): df = dataset_type.from_dict(data) + dataframe_type = DataframeType({"value": df}) result = dataframe_type.date_less_than( {"target": "target", "comparator": comparator} @@ -749,98 +784,83 @@ def test_is_incomplete_date(target, dataset_type, expected_result): ) +AUTO_PRECISION_CASES = { + "date_equal_to": [ + ("2025-06-25", "2025-06-25T17:22", "auto", True), + ("2025-06-24", "2025-06-25T17:22", "auto", False), + ("2025-06-26", "2025-06-25T17:22", "auto", False), + ("2025-06", "2025-06-25", "auto", True), + ("2025-07", "2025-06-25", "auto", False), + ("2025-05", "2025-06-25", "auto", False), + ("2025", "2025-06-25T17:22:30", "auto", True), + ("2024", "2025-06-25T17:22:30", "auto", False), + ("2026", "2025-06-25T17:22:30", "auto", False), + ("2025-06-25", "2025-06-25", 
"auto", True), + ("2025-06-24", "2025-06-25", "auto", False), + ("2025-06-26", "2025-06-25", "auto", False), + ("2025-06-25T17:22", "2025-06-25T17:22:30", "auto", True), + ("2025-06-25T17:21", "2025-06-25T17:22:30", "auto", False), + ("2025-06-25T", "2025-06-25", "auto", False), + ("2025-06-24T", "2025-06-25", "auto", False), + ], + "date_greater_than": [ + ("2025-06-26", "2025-06-25T17:22", None, True), + ("2025-06-24", "2025-06-25T17:22", None, False), + ("2025-06-25", "2025-06-25T17:22", None, False), + ("2025-07", "2025-06-25", None, True), + ("2025-05", "2025-06-25", None, False), + ("2025-06", "2025-06-25", None, False), + ("2026", "2025-06-25T17:22", None, True), + ("2024", "2025-06-25T17:22", None, False), + ("2025", "2025-06-25T17:22", None, False), + ], + "date_greater_than_or_equal_to": [ + ("2025-06-26", "2025-06-25T17:22", "auto", True), + ("2025-06-24", "2025-06-25T17:22", "auto", False), + ("2025-06-25", "2025-06-25T17:22", "auto", True), + ("2025-07", "2025-06-25", "auto", True), + ("2025-05", "2025-06-25", "auto", False), + ("2025-06", "2025-06-25", "auto", True), + ], + "date_less_than": [ + ("2025-06-24", "2025-06-25T17:22", None, True), + ("2025-06-26", "2025-06-25T17:22", None, False), + ("2025-06-25", "2025-06-25T17:22", None, False), + ("2025-05", "2025-06-25", None, True), + ("2025-07", "2025-06-25", None, False), + ("2025-06", "2025-06-25", None, False), + ("2024", "2025-06-25T17:22", None, True), + ("2026", "2025-06-25T17:22", None, False), + ("2025", "2025-06-25T17:22", None, False), + ], + "date_less_than_or_equal_to": [ + ("2025-06-24", "2025-06-25T17:22", "auto", True), + ("2025-06-26", "2025-06-25T17:22", "auto", False), + ("2025-06-25", "2025-06-25T17:22", "auto", True), + ("2025-05", "2025-06-25", "auto", True), + ("2025-07", "2025-06-25", "auto", False), + ("2025-06", "2025-06-25", "auto", True), + ], + "date_not_equal_to": [ + ("2025-06-24", "2025-06-25T17:22", "auto", True), + ("2025-06-25", "2025-06-25T17:22", "auto", False), + 
("2025-06-26", "2025-06-25T17:22", "auto", True), + ("2025-05", "2025-06-25", "auto", True), + ("2025-06", "2025-06-25", "auto", False), + ("2025-07", "2025-06-25", "auto", True), + ], +} + +AUTO_PRECISION_PARAMS = [ + (operator_name, target, comparator, date_component, expected_result) + for operator_name, scenarios in AUTO_PRECISION_CASES.items() + for target, comparator, date_component, expected_result in scenarios +] + + @pytest.mark.parametrize( "operator_name,target,comparator,date_component,expected_result", - [ - ("date_equal_to", "2025-06-25", "2025-06-25T17:22", "auto", True), - ("date_equal_to", "2025-06-24", "2025-06-25T17:22", "auto", False), - ("date_equal_to", "2025-06-26", "2025-06-25T17:22", "auto", False), - ("date_equal_to", "2025-06", "2025-06-25", "auto", True), - ("date_equal_to", "2025-07", "2025-06-25", "auto", False), - ("date_equal_to", "2025-05", "2025-06-25", "auto", False), - ("date_equal_to", "2025", "2025-06-25T17:22:30", "auto", True), - ("date_equal_to", "2024", "2025-06-25T17:22:30", "auto", False), - ("date_equal_to", "2026", "2025-06-25T17:22:30", "auto", False), - ("date_equal_to", "2025-06-25", "2025-06-25", "auto", True), - ("date_equal_to", "2025-06-24", "2025-06-25", "auto", False), - ("date_equal_to", "2025-06-26", "2025-06-25", "auto", False), - ("date_equal_to", "2025-06-25T17:22", "2025-06-25T17:22:30", "auto", True), - ("date_equal_to", "2025-06-25T17:21", "2025-06-25T17:22:30", "auto", False), - ("date_equal_to", "2025-06-25T", "2025-06-25", "auto", True), - ("date_equal_to", "2025-06-24T", "2025-06-25", "auto", False), - ("date_greater_than", "2025-06-26", "2025-06-25T17:22", "auto", True), - ("date_greater_than", "2025-06-26", "2025-06-25T17:22", None, True), - ("date_greater_than", "2025-06-24", "2025-06-25T17:22", "auto", False), - ("date_greater_than", "2025-06-24", "2025-06-25T17:22", None, False), - ("date_greater_than", "2025-06-25", "2025-06-25T17:22", "auto", False), - ("date_greater_than", "2025-06-25", 
"2025-06-25T17:22", None, False), - ("date_greater_than", "2025-07", "2025-06-25", "auto", True), - ("date_greater_than", "2025-07", "2025-06-25", None, True), - ("date_greater_than", "2025-05", "2025-06-25", "auto", False), - ("date_greater_than", "2025-05", "2025-06-25", None, False), - ("date_greater_than", "2025-06", "2025-06-25", "auto", False), - ("date_greater_than", "2025-06", "2025-06-25", None, False), - ("date_greater_than", "2026", "2025-06-25T17:22", "auto", True), - ("date_greater_than", "2026", "2025-06-25T17:22", None, True), - ("date_greater_than", "2024", "2025-06-25T17:22", "auto", False), - ("date_greater_than", "2024", "2025-06-25T17:22", None, False), - ("date_greater_than", "2025", "2025-06-25T17:22", "auto", False), - ("date_greater_than", "2025", "2025-06-25T17:22", None, False), - ( - "date_greater_than_or_equal_to", - "2025-06-26", - "2025-06-25T17:22", - "auto", - True, - ), - ( - "date_greater_than_or_equal_to", - "2025-06-24", - "2025-06-25T17:22", - "auto", - False, - ), - ( - "date_greater_than_or_equal_to", - "2025-06-25", - "2025-06-25T17:22", - "auto", - True, - ), - ("date_greater_than_or_equal_to", "2025-07", "2025-06-25", "auto", True), - ("date_greater_than_or_equal_to", "2025-05", "2025-06-25", "auto", False), - ("date_greater_than_or_equal_to", "2025-06", "2025-06-25", "auto", True), - ("date_less_than", "2025-06-24", "2025-06-25T17:22", "auto", True), - ("date_less_than", "2025-06-24", "2025-06-25T17:22", None, True), - ("date_less_than", "2025-06-26", "2025-06-25T17:22", "auto", False), - ("date_less_than", "2025-06-26", "2025-06-25T17:22", None, False), - ("date_less_than", "2025-06-25", "2025-06-25T17:22", "auto", False), - ("date_less_than", "2025-06-25", "2025-06-25T17:22", None, False), - ("date_less_than", "2025-05", "2025-06-25", "auto", True), - ("date_less_than", "2025-05", "2025-06-25", None, True), - ("date_less_than", "2025-07", "2025-06-25", "auto", False), - ("date_less_than", "2025-07", "2025-06-25", None, 
False), - ("date_less_than", "2025-06", "2025-06-25", "auto", False), - ("date_less_than", "2025-06", "2025-06-25", None, False), - ("date_less_than", "2024", "2025-06-25T17:22", "auto", True), - ("date_less_than", "2024", "2025-06-25T17:22", None, True), - ("date_less_than", "2026", "2025-06-25T17:22", "auto", False), - ("date_less_than", "2026", "2025-06-25T17:22", None, False), - ("date_less_than", "2025", "2025-06-25T17:22", "auto", False), - ("date_less_than", "2025", "2025-06-25T17:22", None, False), - ("date_less_than_or_equal_to", "2025-06-24", "2025-06-25T17:22", "auto", True), - ("date_less_than_or_equal_to", "2025-06-26", "2025-06-25T17:22", "auto", False), - ("date_less_than_or_equal_to", "2025-06-25", "2025-06-25T17:22", "auto", True), - ("date_less_than_or_equal_to", "2025-05", "2025-06-25", "auto", True), - ("date_less_than_or_equal_to", "2025-07", "2025-06-25", "auto", False), - ("date_less_than_or_equal_to", "2025-06", "2025-06-25", "auto", True), - ("date_not_equal_to", "2025-06-24", "2025-06-25T17:22", "auto", True), - ("date_not_equal_to", "2025-06-25", "2025-06-25T17:22", "auto", False), - ("date_not_equal_to", "2025-06-26", "2025-06-25T17:22", "auto", True), - ("date_not_equal_to", "2025-05", "2025-06-25", "auto", True), - ("date_not_equal_to", "2025-06", "2025-06-25", "auto", False), - ("date_not_equal_to", "2025-07", "2025-06-25", "auto", True), - ], + AUTO_PRECISION_PARAMS, ) @pytest.mark.parametrize("dataset_type", [PandasDataset, DaskDataset]) def test_auto_precision_operators( From e8a54ffb8bd9771c58ec1995159f44cdf0b8baf4 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Thu, 13 Nov 2025 17:25:42 -0500 Subject: [PATCH 10/19] Refactor date precision handling: use DatePrecision enum, simplify parsing with regex groups, and fix test cases --- cdisc_rules_engine/check_operators/helpers.py | 524 +++++++++++------- .../test_date_comparison_checks.py | 39 +- 2 files changed, 359 insertions(+), 204 deletions(-) diff --git 
a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py
index 126b39bd2..1084d37b8 100644
--- a/cdisc_rules_engine/check_operators/helpers.py
+++ b/cdisc_rules_engine/check_operators/helpers.py
@@ -12,17 +12,22 @@
 # Date regex pattern for validation
 date_regex = re.compile(
 r"^("
- r"(-?[0-9]{4}|-)(-{1,2}(1[0-2]|0[1-9]|-))?(-{1,2}(3[01]|0[1-9]|[12][0-9]|-))?"
- r"(T(2[0-3]|[01][0-9]|-)(:(([0-5][0-9]|-))(:(([0-5][0-9]|-))?(\.[0-9]+)?)?)?"
- r"(Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?)?"
+ r"(?P<year>-?[0-9]{4}|-)(-{1,2}(?P<month>1[0-2]|0[1-9]|-))?"
+ r"(-{1,2}(?P<day>3[01]|0[1-9]|[12][0-9]|-))?"
+ r"(T(?P<hour>2[0-3]|[01][0-9]|-)(:((?P<minute>[0-5][0-9]|-))"
+ r"(:((?P<second>[0-5][0-9]|-))?(\.(?P<microsecond>[0-9]+))?)?)?"
+ r"(?P<tz>Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?)?"
 r"(\/"
- r"(-?[0-9]{4}|-)(-{1,2}(1[0-2]|0[1-9]|-))?(-{1,2}(3[01]|0[1-9]|[12][0-9]|-))?"
- r"(T(2[0-3]|[01][0-9]|-)(:(([0-5][0-9]|-))(:(([0-5][0-9]|-))?(\.[0-9]+)?)?)?"
- r"(Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?)?"
+ r"(?P<interval_year>-?[0-9]{4}|-)(-{1,2}(?P<interval_month>1[0-2]|0[1-9]|-))?"
+ r"(-{1,2}(?P<interval_day>3[01]|0[1-9]|[12][0-9]|-))?"
+ r"(T(?P<interval_hour>2[0-3]|[01][0-9]|-)(:((?P<interval_minute>[0-5][0-9]|-))"
+ r"(:((?P<interval_second>[0-5][0-9]|-))?(\.(?P<interval_microsecond>[0-9]+))?)?)?"
+ r"(?P<interval_tz>Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?)?"
 r")?"
 r"|"
- r"-{4,8}T(2[0-3]|[01][0-9]|-)(:(([0-5][0-9]|-))(:(([0-5][0-9]|-))?(\.[0-9]+)?)?)?"
- r"(Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?"
+ r"-{4,8}T(?P<timeonly_hour>2[0-3]|[01][0-9]|-)(:((?P<timeonly_minute>[0-5][0-9]|-))"
+ r"(:((?P<timeonly_second>[0-5][0-9]|-))?(\.(?P<timeonly_microsecond>[0-9]+))?)?)?"
+ r"(?P<timeonly_tz>Z|[+-](2[0-3]|[01][0-9]):[0-5][0-9])?"
r")$" ) @@ -150,179 +155,212 @@ def _empty_datetime_components() -> dict: } -def _assign_date_components(components: dict, date_parts: list) -> None: - if not date_parts: - return - keys = ("year", "month", "day") - for index, key in enumerate(keys): - if len(date_parts) > index and date_parts[index] not in (None, "-"): - components[key] = date_parts[index] - - -def _assign_time_components(components: dict, time_part: str) -> None: - if not time_part: - return - tokens = time_part.split(":", 2) - if tokens and tokens[0] not in ("", "-"): - components["hour"] = tokens[0] - if len(tokens) > 1 and tokens[1] not in ("", "-"): - components["minute"] = tokens[1] - if len(tokens) == 3 and tokens[2]: - second_part = tokens[2] - if "." in second_part: - second_val, micro_val = second_part.split(".", 1) - if second_val not in ("", "-"): - components["second"] = second_val - if micro_val: - components["microsecond"] = micro_val - elif second_part not in ("", "-"): - components["second"] = second_part - - def _extract_datetime_components(date_str: str) -> dict: """Extract datetime components using regex pattern matching.""" components = _empty_datetime_components() if not date_str or not isinstance(date_str, str): return components - if not date_regex.match(date_str): + + match = date_regex.match(date_str) + if not match: return components - date_parts, time_part, has_time = _parse_datetime_string(date_str) - _assign_date_components(components, date_parts) - if has_time: - _assign_time_components(components, time_part) + + year = match.group("year") or match.group("interval_year") + month = match.group("month") or match.group("interval_month") + day = match.group("day") or match.group("interval_day") + hour = ( + match.group("hour") + or match.group("interval_hour") + or match.group("timeonly_hour") + ) + minute = ( + match.group("minute") + or match.group("interval_minute") + or match.group("timeonly_minute") + ) + second = ( + match.group("second") + or 
match.group("interval_second") + or match.group("timeonly_second") + ) + microsecond = ( + match.group("microsecond") + or match.group("interval_microsecond") + or match.group("timeonly_microsecond") + ) + + if year and year != "-": + components["year"] = year + if month and month != "-": + components["month"] = month + if day and day != "-": + components["day"] = day + if hour and hour != "-": + components["hour"] = hour + if minute and minute != "-": + components["minute"] = minute + if second and second != "-": + components["second"] = second + if microsecond: + components["microsecond"] = microsecond + return components def _parse_datetime_string(date_str: str): if not date_str or not isinstance(date_str, str): - return [], "", False + return [], (None, None, None, None), False - has_time = "T" in date_str + match = date_regex.match(date_str) + if not match: + return [], (None, None, None, None), False + + year = match.group("year") or match.group("interval_year") + month = match.group("month") or match.group("interval_month") + day = match.group("day") or match.group("interval_day") + hour = ( + match.group("hour") + or match.group("interval_hour") + or match.group("timeonly_hour") + ) + minute = ( + match.group("minute") + or match.group("interval_minute") + or match.group("timeonly_minute") + ) + second = ( + match.group("second") + or match.group("interval_second") + or match.group("timeonly_second") + ) + microsecond = ( + match.group("microsecond") + or match.group("interval_microsecond") + or match.group("timeonly_microsecond") + ) - if has_time: - date_part, time_part = date_str.split("T", 1) - time_part = re.sub(r"(Z|[+\-]\d{2}:\d{2})$", "", time_part) - else: - date_part = date_str - time_part = "" + date_components = [ + year if year and year != "-" else "-", + month if month and month != "-" else "-", + day if day and day != "-" else "-", + ] - if not date_part or all(c == "-" for c in date_part): - return ["-", "-", "-"], time_part, has_time + 
has_time = ( + hour is not None + or minute is not None + or second is not None + or microsecond is not None + ) - segments = date_part.split("-") - components = [] - i = 0 + # Return time components directly instead of rebuilding string + time_components = ( + hour if hour and hour != "-" else None, + minute if minute and minute != "-" else None, + second if second and second != "-" else None, + microsecond if microsecond else None, + ) - while i < len(segments) and len(components) < 3: - segment = segments[i] + return date_components, time_components, has_time - if segment: - components.append(segment) - i += 1 - else: - empty_start = i - while i < len(segments) and not segments[i]: - i += 1 - empty_count = i - empty_start - if empty_count >= 2: - components.append("-") +def _normalize_precision(precision_name): + """Convert string to DatePrecision enum if needed.""" + if isinstance(precision_name, str): + try: + return DatePrecision[precision_name] + except (KeyError, ValueError): + return None + return precision_name + - while len(components) < 3: - components.append("-") +def _has_later_date_component(date_components, index): + """Check if there are valid components after the given index.""" + for later_idx in range(index + 1, min(3, len(date_components))): + if later_idx < len(date_components): + later_comp = date_components[later_idx] + if later_comp and later_comp != "-": + return True + return False - if len(components) > 3: - components = components[:3] - return components, time_part, has_time +def _get_precision_before_missing(precision): + """Get precision before the missing component.""" + if precision == DatePrecision.year: + return None + prev_index = precision.value - 1 + return DatePrecision(prev_index) if prev_index >= 0 else None def _check_date_component(date_components, index, precision_name): + precision = _normalize_precision(precision_name) + if precision is None: + return None + if len(date_components) <= index: - if precision_name == 
"year": - return None - precision_names = DatePrecision.names() - prev_index = precision_names.index(precision_name) - 1 - return precision_names[prev_index] if prev_index >= 0 else None + return _get_precision_before_missing(precision) component = date_components[index] - if not component or component == "-": - has_later_component = False - for later_idx in range(index + 1, min(3, len(date_components))): - if later_idx < len(date_components): - later_comp = date_components[later_idx] - if later_comp and later_comp != "-": - has_later_component = True - break - - if has_later_component: - return None - - if precision_name == "year": + if _has_later_date_component(date_components, index): return None - precision_names = DatePrecision.names() - prev_index = precision_names.index(precision_name) - 1 - return precision_names[prev_index] if prev_index >= 0 else None + return _get_precision_before_missing(precision) return None -def _check_time_component(time_part, has_time, component_index): - if not has_time or not time_part: - return "day" - time_components = time_part.split(":") - if len(time_components) <= component_index: - precision_names = DatePrecision.names() - prev_index = precision_names.index("hour") + component_index - 1 - return precision_names[prev_index] if prev_index >= 0 else "day" - component = time_components[component_index] +def _check_time_component(time_components, has_time, component_index): + if not has_time or not time_components: + return DatePrecision.day + hour, minute, second, microsecond = time_components + components = [hour, minute, second] + if len(components) <= component_index or components[component_index] is None: + prev_index = DatePrecision.hour.value + component_index - 1 + return DatePrecision(prev_index) if prev_index >= 0 else DatePrecision.day + component = components[component_index] if not component or component == "-": - precision_names = DatePrecision.names() - prev_index = precision_names.index("hour") + component_index 
- 1 - return precision_names[prev_index] if prev_index >= 0 else "day" + prev_index = DatePrecision.hour.value + component_index - 1 + return DatePrecision(prev_index) if prev_index >= 0 else DatePrecision.day return None -def _check_second_component(time_part, has_time): - if not has_time or not time_part: - return "day" - time_components = time_part.split(":") - if len(time_components) <= 2: - return "minute" - second_part = time_components[2] - second = second_part.split(".", 1)[0] if "." in second_part else second_part +def _check_second_component(time_components, has_time): + if not has_time or not time_components: + return DatePrecision.day + hour, minute, second, microsecond = time_components + if second is None: + return DatePrecision.minute if not second or second == "-": - return "minute" + return DatePrecision.minute return None -def _check_microsecond_component(time_part, has_time): - if not has_time or not time_part: - return "day" - time_components = time_part.split(":") - if len(time_components) <= 2: - return "minute" - second_part = time_components[2] - if "." 
not in second_part: - return "second" - microsecond_part = second_part.split(".", 1)[1] - if not microsecond_part: - return "second" +def _check_microsecond_component(time_components, has_time): + if not has_time or not time_components: + return DatePrecision.day + hour, minute, second, microsecond = time_components + if second is None: + return DatePrecision.minute + if microsecond is None: + return DatePrecision.second + if not microsecond: + return DatePrecision.second return None @lru_cache(maxsize=1000) -def detect_datetime_precision(date_str: str) -> str: +def detect_datetime_precision(date_str: str) -> DatePrecision | None: if not _datestring_is_valid(date_str): return None - date_components, time_part, has_time = _parse_datetime_string(date_str) + date_components, time_components, has_time = _parse_datetime_string(date_str) - if _is_time_only_precision(date_components, has_time): - return _time_only_precision(time_part) + if all( + component == "-" or component is None or component == "" + for component in date_components + ): + return None - return _date_and_time_precision(date_components, time_part, has_time) + return _date_and_time_precision(date_components, time_components, has_time) def _datestring_is_valid(date_str: str) -> bool: @@ -333,92 +371,184 @@ def _is_time_only_precision(date_components: list, has_time: bool) -> bool: return has_time and all(component == "-" for component in date_components) -def _time_only_precision(time_part: str) -> str: - if not time_part: +def _time_only_precision(time_components) -> DatePrecision | None: + if not time_components: return None - - time_components = time_part.split(":") - if not time_components or not time_components[0] or time_components[0] == "-": + hour, minute, second, microsecond = time_components + if not hour or hour == "-": return None - if len(time_components) <= 1 or not time_components[1] or time_components[1] == "-": - return "hour" - if len(time_components) <= 2: - return "minute" - - return 
_precision_from_second_component(time_components[2]) + if not minute or minute == "-": + return DatePrecision.hour + if second is None: + return DatePrecision.minute + second_part = second + if microsecond: + second_part = f"{second}.{microsecond}" + return _precision_from_second_component(second_part) -def _precision_from_second_component(second_part: str) -> str: +def _precision_from_second_component(second_part: str) -> DatePrecision: if "." in second_part: second, microsecond = second_part.split(".", 1) if not second or second == "-": - return "minute" - return "second" if not microsecond else "microsecond" + return DatePrecision.minute + return DatePrecision.second if not microsecond else DatePrecision.microsecond if not second_part or second_part == "-": - return "minute" - return "second" + return DatePrecision.minute + return DatePrecision.second -def _date_and_time_precision(date_components, time_part, has_time) -> str: - precision_checks = _precision_check_functions(date_components, time_part, has_time) +def _check_date_component_missing(component) -> bool: + return component is None or component == "-" or component == "" - for index, precision_name in enumerate(DatePrecision.names()): - result = precision_checks[precision_name](index, precision_name) - if result is not None: - return result - return "microsecond" +def _get_precision_before(precision: DatePrecision) -> DatePrecision | None: + if precision == DatePrecision.year: + return None + prev_index = precision.value - 1 + if prev_index >= 0: + return DatePrecision(prev_index) + return None -def _precision_check_functions(date_components, time_part, has_time): +def _check_date_precision(date_components) -> tuple: + date_component_map = { + DatePrecision.year: 0, + DatePrecision.month: 1, + DatePrecision.day: 2, + } + for precision in [DatePrecision.year, DatePrecision.month, DatePrecision.day]: + index = date_component_map[precision] + component = date_components[index] if index < 
len(date_components) else None + if _check_date_component_missing(component): + result = _get_precision_before(precision) + return (True, result) + return (False, None) + + +def _check_time_precision(time_components) -> DatePrecision: + if not time_components: + return DatePrecision.day + hour, minute, second, microsecond = time_components + if _check_date_component_missing(hour): + return DatePrecision.day + if _check_date_component_missing(minute): + return DatePrecision.hour + if second is None: + return DatePrecision.minute + if _check_date_component_missing(second): + return DatePrecision.minute + if microsecond is None: + return DatePrecision.second + if not microsecond or microsecond == "": + return DatePrecision.second + return DatePrecision.microsecond + + +def _date_and_time_precision( + date_components, time_components, has_time +) -> DatePrecision | None: + found_missing, date_result = _check_date_precision(date_components) + if found_missing: + return date_result + if not has_time or not time_components: + return DatePrecision.day + return _check_time_precision(time_components) + + +def _precision_check_functions(date_components, time_components, has_time): return { - "year": lambda i, name: _check_date_component(date_components, i, name), - "month": lambda i, name: _check_date_component(date_components, i, name), - "day": lambda i, name: _check_date_component(date_components, i, name), - "hour": lambda i, name: _check_time_component(time_part, has_time, 0), - "minute": lambda i, name: _check_time_component(time_part, has_time, 1), - "second": lambda i, name: _check_second_component(time_part, has_time), - "microsecond": lambda i, name: _check_microsecond_component( - time_part, has_time + DatePrecision.year: lambda i, name: _check_date_component( + date_components, i, name + ), + DatePrecision.month: lambda i, name: _check_date_component( + date_components, i, name + ), + DatePrecision.day: lambda i, name: _check_date_component( + date_components, i, 
name + ), + DatePrecision.hour: lambda i, name: _check_time_component( + time_components, has_time, 0 + ), + DatePrecision.minute: lambda i, name: _check_time_component( + time_components, has_time, 1 + ), + DatePrecision.second: lambda i, name: _check_second_component( + time_components, has_time + ), + DatePrecision.microsecond: lambda i, name: _check_microsecond_component( + time_components, has_time ), } -def get_common_precision(dt1: str, dt2: str) -> str: +def get_common_precision(dt1: str, dt2: str) -> DatePrecision | None: p1 = detect_datetime_precision(dt1) p2 = detect_datetime_precision(dt2) - if not p1 or not p2: + if p1 is None or p2 is None: return None - min_idx = min(DatePrecision[p1].value, DatePrecision[p2].value) - return DatePrecision.get_name_by_index(min_idx) + min_idx = min(p1.value, p2.value) + return DatePrecision(min_idx) def get_date_component(component: str, date_string: str): + # Convert string to DatePrecision enum for internal use + try: + precision = DatePrecision[component] + except (KeyError, ValueError): + return get_date(date_string) + component_func_map = { - "year": get_year, - "month": get_month, - "day": get_day, - "hour": get_hour, - "minute": get_minute, - "microsecond": get_microsecond, - "second": get_second, + DatePrecision.year: get_year, + DatePrecision.month: get_month, + DatePrecision.day: get_day, + DatePrecision.hour: get_hour, + DatePrecision.minute: get_minute, + DatePrecision.microsecond: get_microsecond, + DatePrecision.second: get_second, } - component_function = component_func_map.get(component) + component_function = component_func_map.get(precision) if component_function: return component_function(date_string) else: return get_date(date_string) +def _parse_uncertain_date(date_string: str) -> datetime | None: + """Parse uncertain dates with missing components using regex groups.""" + components = _extract_datetime_components(date_string) + + year = int(components.get("year") or 1970) + month = 
int(components.get("month") or 1) + day = int(components.get("day") or 1) + hour = int(components.get("hour") or 0) + minute = int(components.get("minute") or 0) + second = int(components.get("second") or 0) + microsecond = int(components.get("microsecond") or 0) + + try: + return datetime(year, month, day, hour, minute, second, microsecond) + except (ValueError, TypeError): + return None + + def get_date(date_string: str): """ Returns a utc timestamp for comparison """ + uncertainty_substrings = ["/", "--", "-:"] + has_uncertainty = any([substr in date_string for substr in uncertainty_substrings]) + + if has_uncertainty: + uncertain_date = _parse_uncertain_date(date_string) + if uncertain_date is not None: + utc = pytz.UTC + return utc.localize(uncertain_date) + date = parse(date_string, default=datetime(1970, 1, 1)) utc = pytz.UTC if date.tzinfo is not None and date.tzinfo.utcoffset(date) is not None: - # timezone aware return date.astimezone(utc) else: return utc.localize(date) @@ -463,21 +593,25 @@ def case_insensitive_is_in(value, values): return str(value).lower() in str(values).lower() -def truncate_datetime_to_precision(date_string: str, precision: str): +def truncate_datetime_to_precision(date_string: str, precision: DatePrecision): dt = get_date(date_string) - if precision == "year": - return dt.replace(month=1, day=1, hour=0, minute=0, second=0, microsecond=0) - if precision == "month": - return dt.replace(day=1, hour=0, minute=0, second=0, microsecond=0) - if precision == "day": - return dt.replace(hour=0, minute=0, second=0, microsecond=0) - if precision == "hour": - return dt.replace(minute=0, second=0, microsecond=0) - if precision == "minute": - return dt.replace(second=0, microsecond=0) - if precision == "second": - return dt.replace(microsecond=0) - return dt + match precision: + case DatePrecision.year: + return dt.replace(month=1, day=1, hour=0, minute=0, second=0, microsecond=0) + case DatePrecision.month: + return dt.replace(day=1, hour=0, 
minute=0, second=0, microsecond=0) + case DatePrecision.day: + return dt.replace(hour=0, minute=0, second=0, microsecond=0) + case DatePrecision.hour: + return dt.replace(minute=0, second=0, microsecond=0) + case DatePrecision.minute: + return dt.replace(second=0, microsecond=0) + case DatePrecision.second: + return dt.replace(microsecond=0) + case DatePrecision.microsecond: + return dt + case _: + return dt def _dates_are_comparable(target: str, comparator: str) -> bool: @@ -505,7 +639,11 @@ def _build_precision_context(target: str, comparator: str) -> dict: } -def _truncate_by_precision(target: str, comparator: str, precision: str) -> tuple: +def _truncate_by_precision( + target: str, comparator: str, precision: DatePrecision | None +) -> tuple: + if precision is None: + return get_date(target), get_date(comparator) return ( truncate_datetime_to_precision(target, precision), truncate_datetime_to_precision(comparator, precision), @@ -537,9 +675,7 @@ def _compare_with_inferred_precision( if truncated_target == truncated_comparator: if target_precision and comparator_precision: - target_value = DatePrecision[target_precision].value - comparator_value = DatePrecision[comparator_precision].value - if target_value > comparator_value: + if target_precision.value > comparator_precision.value: return operator_func(get_date(target), get_date(comparator)) return result diff --git a/tests/unit/test_check_operators/test_date_comparison_checks.py b/tests/unit/test_check_operators/test_date_comparison_checks.py index 7470948ee..e2e4e2a16 100644 --- a/tests/unit/test_check_operators/test_date_comparison_checks.py +++ b/tests/unit/test_check_operators/test_date_comparison_checks.py @@ -1,5 +1,6 @@ from cdisc_rules_engine.check_operators.dataframe_operators import DataframeType from cdisc_rules_engine.check_operators.helpers import ( + DatePrecision, detect_datetime_precision, is_valid_date, ) @@ -11,13 +12,13 @@ @pytest.mark.parametrize( "value,expected_precision", [ - 
("2003-12-15T13:14:17.123", "microsecond"), - ("2003-12-15T13:14:17", "second"), - ("2003-12-15T13:14", "minute"), - ("2003-12-15T13", "hour"), - ("2003-12-15", "day"), - ("2003-12", "month"), - ("2003", "year"), + ("2003-12-15T13:14:17.123", DatePrecision.microsecond), + ("2003-12-15T13:14:17", DatePrecision.second), + ("2003-12-15T13:14", DatePrecision.minute), + ("2003-12-15T13", DatePrecision.hour), + ("2003-12-15", DatePrecision.day), + ("2003-12", DatePrecision.month), + ("2003", DatePrecision.year), ], ) def test_detect_datetime_precision_with_truncated_values(value, expected_precision): @@ -74,9 +75,19 @@ def test_invalid_date(data, dataset_type, expected_result): @pytest.mark.parametrize( "value,expected_precision", [ - ("2003---15", "day"), - ("--12-15", "day"), - ("-----T07:15", "minute"), + ("2003-12-15T13:15:17", DatePrecision.second), + ("2003-12-15T13:15", DatePrecision.minute), + ("2003-12-15T-:15", DatePrecision.day), + ("2003-12-15T13:-:17", DatePrecision.hour), + ("2003---15", DatePrecision.year), + ("--12-15", None), + ("-----T07:15", None), + ("-----T07:15:30", None), + ("-----T-:15", None), + ("-----T07:-:30", None), + ("2003-12-15T-:-:17", DatePrecision.day), + ("2003-12--", DatePrecision.month), + ("2003--", DatePrecision.year), ], ) def test_detect_datetime_precision_with_uncertain_components(value, expected_precision): @@ -802,6 +813,14 @@ def test_is_incomplete_date(target, dataset_type, expected_result): ("2025-06-25T17:21", "2025-06-25T17:22:30", "auto", False), ("2025-06-25T", "2025-06-25", "auto", False), ("2025-06-24T", "2025-06-25", "auto", False), + ("2003---15", "2003-12-15", "auto", True), + ("2003---15", "2003-11-15", "auto", True), + ("2003---15", "2004-12-15", "auto", False), + ("2003-12-15T-:15", "2003-12-15T13:15", "auto", True), + ("2003-12-15T-:15", "2003-12-15T14:15", "auto", True), + ("2003-12-15T-:15", "2003-12-16T13:15", "auto", False), + ("2003-12-15T13:-:17", "2003-12-15T13:30:17", "auto", True), + 
("2003-12-15T13:-:17", "2003-12-15T14:30:17", "auto", False), ], "date_greater_than": [ ("2025-06-26", "2025-06-25T17:22", None, True), From 3b258a2d556a7e7ccabe0e6651656d4729aca685 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Thu, 13 Nov 2025 17:33:20 -0500 Subject: [PATCH 11/19] Cleanup on refactor --- cdisc_rules_engine/check_operators/helpers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 1084d37b8..d6895dbf1 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -252,7 +252,6 @@ def _parse_datetime_string(date_str: str): or microsecond is not None ) - # Return time components directly instead of rebuilding string time_components = ( hour if hour and hour != "-" else None, minute if minute and minute != "-" else None, From d9e80bf06107d924395fc4a3f6fad28d859f330e Mon Sep 17 00:00:00 2001 From: Rakesh Date: Thu, 13 Nov 2025 19:30:37 -0500 Subject: [PATCH 12/19] Remove unused date precision helper functions and DatePrecision methods left over from refactoring --- README.md | 260 +++++++++--------- cdisc_rules_engine/check_operators/helpers.py | 151 ---------- resources/schema/Operator.json | 11 +- resources/schema/Operator.md | 69 +---- 4 files changed, 140 insertions(+), 351 deletions(-) diff --git a/README.md b/README.md index 0238a505a..b2070e719 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,27 @@ -[![](https://www.cdisc.org/themes/custom/cdiscd8/logo.svg)](https://www.cdisc.org) +### Supported python versions -[![](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3120) [![](https://img.shields.io/pypi/v/cdisc-rules-engine.svg)](https://pypi.org/project/cdisc-rules-engine) [![](https://img.shields.io/docker/v/cdiscdocker/cdisc-rules-engine?label=docker)](https://hub.docker.com/r/cdiscdocker/cdisc-rules-engine) +[![Python 
3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3120) + +### Windows Command Compatibility + +Note: The Windows commands provided in this README are written for PowerShell. While most commands are compatible with both PowerShell and Command Prompt, some adjustments may be necessary when using Command Prompt. If you encounter any issues running these commands in Command Prompt, try using PowerShell or consult the Command Prompt documentation for equivalent commands. # cdisc-rules-engine Open source offering of the CDISC Rules Engine, a tool designed for validating clinical trial data against data standards. +To learn more, visit our official CDISC website or for other implementation options, see our DockerHub repository: +
+[CDISC Website](https://www.cdisc.org/) +
+[CDISC Rules Engine on DockerHub](https://hub.docker.com/repository/docker/cdiscdocker/cdisc-rules-engine/general) -## Quick start +### **Quick start** -Note: The Windows commands provided in this README are written for PowerShell. While most commands are compatible with both PowerShell and Command Prompt, some adjustments may be necessary when using Command Prompt. If you encounter any issues running these commands in Command Prompt, try using PowerShell or consult the Command Prompt documentation for equivalent commands. - -To quickly get up and running with CORE, users can download the latest executable version of the engine for their operating system from the [Releases](https://github.com/cdisc-org/cdisc-rules-engine/releases) +To quickly get up and running with CORE, users can download the latest executable version of the engine for their operating system from here: Once downloaded, simply unzip the file and run the following command based on your Operating System: -### Windows: +Windows: ``` .\core.exe validate -s -v -d path/to/datasets @@ -22,7 +29,7 @@ Once downloaded, simply unzip the file and run the following command based on yo # ex: .\core.exe validate -s sdtmig -v 3-4 -d .\xpt\ ``` -### Linux/Mac: +Linux/Mac: ``` ./core validate -s -v -d path/to/datasets @@ -43,11 +50,68 @@ Once downloaded, simply unzip the file and run the following command based on yo > chmod +x ./core > ``` -## Command-line Interface +### **Command-line Interface** + +### **Getting Started** + +In the terminal, navigate to the directory you intend to install CORE rules engine in + +1. Clone the repository: + + ``` + git clone https://github.com/cdisc-org/cdisc-rules-engine + ``` + +2. Ensure you have Python 3.12 installed: + You can check your Python version with: + ``` + python --version + ``` + If you don't have Python 3.12, please download and install it from [python.org](https://www.python.org/downloads/) or using your system's package manager. 
+ +### **Code formatter** + +This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD. +It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit. +Both dependencies are added to _requirements-dev.txt_. + +**Required** + +Setting up `pre-commit` requires one extra step. After installing it you have to run + +`pre-commit install` + +This installs `pre-commit` in your `.git/hooks` directory. + +### **Installing dependencies** + +These steps should be run before running any tests or core commands using the non compiled version. + +- Create a virtual environment: + + `python -m venv ` + +NOTE: if you have multiple versions of python on your machine, you can call python 3.12 for the virtual environment's creation instead of the above command: +`python3.12 -m venv ` + +- Activate the virtual environment: + +`.//bin/activate` -- on linux/mac
+`.\\Scripts\Activate` -- on windows + +- Install the requirements. + +`python -m pip install -r requirements-dev.txt` # From the root directory + +### **Running The Tests** + +From the root of the project run the following command (this will run both the unit and regression tests): + +`python -m pytest tests` -**Note**: the following examples are applicable to the source code and have references to "`python core.py`". When using the executable version as described in the [Quick Start](#quick-start) above, instances of "`python cored.py`" should be replaced with "`.\core.exe`" (Windows) or "`./core`" (Linux/Mac). You can also run directly on the source code by following the [Cloning](#cloning) instructions. +### **Running a validation** -### Running a validation (`validate`) +#### From the command line Clone the repository and run `python core.py --help` to see the full list of commands. @@ -149,7 +213,7 @@ Run `python core.py validate --help` to see the list of validation options. --help Show this message and exit. ``` -#### Available log levels +##### Available log levels - `debug` - Display all logs - `info` - Display info, warnings, and error logs @@ -157,7 +221,7 @@ Run `python core.py validate --help` to see the list of validation options. - `error` - Display only error logs - `critical` - Display critical logs -#### Validate folder +##### **Validate folder** To validate a folder using rules for SDTM-IG version 3.4 use the following command: @@ -179,10 +243,9 @@ CORE supports the following dataset file formats for validation: - Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset directory (`-d` or `-dp`). - If you point to a folder containing unsupported file formats, CORE will display an error message indicating which formats are supported. 
-#### Validate single rule +##### **Validate single rule** `python core.py validate -s sdtmig -v 3-4 -dp -lr --meddra ./meddra/ --whodrug ./whodrug/` - Note: JSON dataset should match the format provided by the rule editor: ```json @@ -208,7 +271,7 @@ Note: JSON dataset should match the format provided by the rule editor: } ``` -#### **Understanding the Rules Report** +##### **Understanding the Rules Report** The rules report tab displays the run status of each rule selected for validation @@ -217,45 +280,9 @@ The possible rule run statuses are: - `SUCCESS` - The rule ran and data was validated against the rule. May or may not produce results - `SKIPPED` - The rule was unable to be run. Usually due to missing required data, but could also be cause by rule execution errors. -#### Setting DATASET_SIZE_THRESHOLD for Large Datasets - -The CDISC Rules Engine respects the `DATASET_SIZE_THRESHOLD` environment variable to determine when to use Dask for large dataset processing. Setting this to 0 coerces Dask usage over Pandas. A .env in the root directory with this variable set will cause this implementation coercion for the CLI. 
This can also be done with the executable releases via multiple methods: - -##### Windows (Command Prompt) - -```cmd -set DATASET_SIZE_THRESHOLD=0 && core.exe validate -rest -of -config -commands -``` - -##### Windows (PowerShell) - -```powershell -$env:DATASET_SIZE_THRESHOLD=0; core.exe validate -rest -of -config -commands -``` - -##### Linux/Mac (Bash) - -```bash -DATASET_SIZE_THRESHOLD=0 ./core -rest -of -config -commands -``` - -##### .env File (Alternative) - -Create a `.env` file in the root directory of the release containing: - -``` -DATASET_SIZE_THRESHOLD=0 -``` - -Then run normally: `core.exe validate -rest -of -config -commands - ---- - -**Note:** Setting `DATASET_SIZE_THRESHOLD=0` tells the engine to use Dask processing for all datasets regardless of size, size threshold defaults to 1/4 of available RAM so datasets larger than this will use Dask. See env.example to see what the CLI .env file should look like - -### Updating the Cache (`update-cache`) +# Additional Core Commands -Update locally stored cache data (Requires an environment variable - `CDISC_LIBRARY_API_KEY`) This is stored in the .env folder in the root directory, the API key does not need quotations around it. +**- update-cache** - update locally stored cache data (Requires an environment variable - `CDISC_LIBRARY_API_KEY`) This is stored in the .env folder in the root directory, the API key does not need quotations around it. ```bash python core.py update-cache @@ -265,14 +292,14 @@ Update locally stored cache data (Requires an environment variable - `CDISC_LIBR To obtain an api key, please follow the instructions found here: . Please note it can take up to an hour after sign up to have an api key issued -##### Custom Standards and Rules +# Custom Standards and Rules -###### Custom Rules Management +## Custom Rules Management - **Custom rules** are stored in a flat file in the cache, indexed by their core ID (e.g., 'COMPANY-000123' or 'CUSTOM-000123'). 
- Each rule is stored independently in this file, allowing for efficient lookup and management. -###### Custom Standards Management +## Custom Standards Management - **Custom standards** act as a lookup mechanism that maps a standard identifier to a list of applicable rule IDs. - When adding a custom standard, you need to provide a JSON file with the following structure: @@ -318,13 +345,13 @@ To obtain an api key, please follow the instructions found here: ` - -NOTE: if you have multiple versions of python on your machine, you can call python 3.12 for the virtual environment's creation instead of the above command: -`python3.12 -m venv ` - -- Activate the virtual environment: - -`.//bin/activate` -- on linux/mac
-`.\\Scripts\Activate` -- on windows - -- Install the requirements. - -`python -m pip install -r requirements-dev.txt` # From the root directory - -### Creating an executable version +### **Creating an executable version** **Note:** Further directions to create your own executable are contained in [README_Build_Executable.md](README_Build_Executable.md) if you wish to build an unofficial release executable for your own use. @@ -483,7 +467,7 @@ _Note .venv should be replaced with path to python installation or virtual envir This will create an executable version in the `dist` folder. The version does not require having Python installed and can be launched by running `core` script with all necessary CLI arguments. -### Creating .whl file +### **Creating .whl file** All non-python files should be listed in `MANIFEST.in` to be included in the distribution. Files must be in python package. @@ -514,40 +498,58 @@ To upload built distributive to pypi `py -m pip install --upgrade twine` `py -m twine upload --repository {repository_name} dist/*` -## Contributing +## Submit an Issue -### Code formatter +If you encounter any bugs, have feature requests, or need assistance, please submit an issue on our GitHub repository: -This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD. -It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit. -Both dependencies are added to _requirements-dev.txt_. +[https://github.com/cdisc-org/cdisc-rules-engine/issues](https://github.com/cdisc-org/cdisc-rules-engine/issues) -Setting up `pre-commit` requires one extra step. After installing it you have to run +When submitting an issue, please include: -`pre-commit install` +- A clear description of the problem or request +- Steps to reproduce the issue (for bugs) +- Your operating system and environment details +- Any relevant logs or error messages -This installs `pre-commit` in your `.git/hooks` directory. 
+# Setting DATASET_SIZE_THRESHOLD for Large Datasets -### Running The Tests +The CDISC Rules Engine respects the `DATASET_SIZE_THRESHOLD` environment variable to determine when to use Dask for large dataset processing. Setting this to 0 coerces Dask usage over Pandas. A .env in the root directory with this variable set will cause this implementation coercion for the CLI. This can also be done with the executable releases via multiple methods: -From the root of the project run the following command (this will run both the unit and regression tests): +## Quick Commands -`python -m pytest tests` +### Windows (Command Prompt) -### Submit an Issue +```cmd +set DATASET_SIZE_THRESHOLD=0 && core.exe validate -rest -of -config -commands +``` -If you encounter any bugs, have feature requests, or need assistance, please submit an issue on our GitHub repository: +### Windows (PowerShell) -[https://github.com/cdisc-org/cdisc-rules-engine/issues](https://github.com/cdisc-org/cdisc-rules-engine/issues) +```powershell +$env:DATASET_SIZE_THRESHOLD=0; core.exe validate -rest -of -config -commands +``` -When submitting an issue, please include: +### Linux/Mac (Bash) -- A clear description of the problem or request -- Steps to reproduce the issue (for bugs) -- Your operating system and environment details -- Any relevant logs or error messages +```bash +DATASET_SIZE_THRESHOLD=0 ./core -rest -of -config -commands +``` + +## .env File (Alternative) + +Create a `.env` file in the root directory of the release containing: + +``` +DATASET_SIZE_THRESHOLD=0 +``` + +Then run normally: `core.exe validate -rest -of -config -commands + +--- + +**Note:** Setting `DATASET_SIZE_THRESHOLD=0` tells the engine to use Dask processing for all datasets regardless of size, size threshold defaults to 1/4 of available RAM so datasets larger than this will use Dask. 
See env.example to see what the CLI .env file should look like -### Updating USDM JSON Schema +## Updating USDM JSON Schema Currently, the engine supports USDM JSON Schema validation against versions 3.0 and 4.0. The schema definition files are located at: diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index d6895dbf1..2db9752c1 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -41,14 +41,6 @@ class DatePrecision(IntEnum): second = 5 microsecond = 6 - @classmethod - def get_name_by_index(cls, index: int) -> str: - return list(cls.__members__.keys())[index] - - @classmethod - def names(cls) -> list: - return list(cls.__members__.keys()) - def is_valid_date(date_string: str) -> bool: if date_string is None: @@ -262,90 +254,6 @@ def _parse_datetime_string(date_str: str): return date_components, time_components, has_time -def _normalize_precision(precision_name): - """Convert string to DatePrecision enum if needed.""" - if isinstance(precision_name, str): - try: - return DatePrecision[precision_name] - except (KeyError, ValueError): - return None - return precision_name - - -def _has_later_date_component(date_components, index): - """Check if there are valid components after the given index.""" - for later_idx in range(index + 1, min(3, len(date_components))): - if later_idx < len(date_components): - later_comp = date_components[later_idx] - if later_comp and later_comp != "-": - return True - return False - - -def _get_precision_before_missing(precision): - """Get precision before the missing component.""" - if precision == DatePrecision.year: - return None - prev_index = precision.value - 1 - return DatePrecision(prev_index) if prev_index >= 0 else None - - -def _check_date_component(date_components, index, precision_name): - precision = _normalize_precision(precision_name) - if precision is None: - return None - - if len(date_components) <= 
index: - return _get_precision_before_missing(precision) - - component = date_components[index] - if not component or component == "-": - if _has_later_date_component(date_components, index): - return None - return _get_precision_before_missing(precision) - - return None - - -def _check_time_component(time_components, has_time, component_index): - if not has_time or not time_components: - return DatePrecision.day - hour, minute, second, microsecond = time_components - components = [hour, minute, second] - if len(components) <= component_index or components[component_index] is None: - prev_index = DatePrecision.hour.value + component_index - 1 - return DatePrecision(prev_index) if prev_index >= 0 else DatePrecision.day - component = components[component_index] - if not component or component == "-": - prev_index = DatePrecision.hour.value + component_index - 1 - return DatePrecision(prev_index) if prev_index >= 0 else DatePrecision.day - return None - - -def _check_second_component(time_components, has_time): - if not has_time or not time_components: - return DatePrecision.day - hour, minute, second, microsecond = time_components - if second is None: - return DatePrecision.minute - if not second or second == "-": - return DatePrecision.minute - return None - - -def _check_microsecond_component(time_components, has_time): - if not has_time or not time_components: - return DatePrecision.day - hour, minute, second, microsecond = time_components - if second is None: - return DatePrecision.minute - if microsecond is None: - return DatePrecision.second - if not microsecond: - return DatePrecision.second - return None - - @lru_cache(maxsize=1000) def detect_datetime_precision(date_str: str) -> DatePrecision | None: if not _datestring_is_valid(date_str): @@ -366,38 +274,6 @@ def _datestring_is_valid(date_str: str) -> bool: return bool(date_str and isinstance(date_str, str) and date_regex.match(date_str)) -def _is_time_only_precision(date_components: list, has_time: bool) -> 
bool: - return has_time and all(component == "-" for component in date_components) - - -def _time_only_precision(time_components) -> DatePrecision | None: - if not time_components: - return None - hour, minute, second, microsecond = time_components - if not hour or hour == "-": - return None - if not minute or minute == "-": - return DatePrecision.hour - if second is None: - return DatePrecision.minute - second_part = second - if microsecond: - second_part = f"{second}.{microsecond}" - return _precision_from_second_component(second_part) - - -def _precision_from_second_component(second_part: str) -> DatePrecision: - if "." in second_part: - second, microsecond = second_part.split(".", 1) - if not second or second == "-": - return DatePrecision.minute - return DatePrecision.second if not microsecond else DatePrecision.microsecond - - if not second_part or second_part == "-": - return DatePrecision.minute - return DatePrecision.second - - def _check_date_component_missing(component) -> bool: return component is None or component == "-" or component == "" @@ -456,32 +332,6 @@ def _date_and_time_precision( return _check_time_precision(time_components) -def _precision_check_functions(date_components, time_components, has_time): - return { - DatePrecision.year: lambda i, name: _check_date_component( - date_components, i, name - ), - DatePrecision.month: lambda i, name: _check_date_component( - date_components, i, name - ), - DatePrecision.day: lambda i, name: _check_date_component( - date_components, i, name - ), - DatePrecision.hour: lambda i, name: _check_time_component( - time_components, has_time, 0 - ), - DatePrecision.minute: lambda i, name: _check_time_component( - time_components, has_time, 1 - ), - DatePrecision.second: lambda i, name: _check_second_component( - time_components, has_time - ), - DatePrecision.microsecond: lambda i, name: _check_microsecond_component( - time_components, has_time - ), - } - - def get_common_precision(dt1: str, dt2: str) -> 
DatePrecision | None: p1 = detect_datetime_precision(dt1) p2 = detect_datetime_precision(dt2) @@ -492,7 +342,6 @@ def get_common_precision(dt1: str, dt2: str) -> DatePrecision | None: def get_date_component(component: str, date_string: str): - # Convert string to DatePrecision enum for internal use try: precision = DatePrecision[component] except (KeyError, ValueError): diff --git a/resources/schema/Operator.json b/resources/schema/Operator.json index 459a19607..8ca2cc330 100644 --- a/resources/schema/Operator.json +++ b/resources/schema/Operator.json @@ -592,16 +592,7 @@ "value_is_reference": { "type": "boolean" }, "type_insensitive": { "type": "boolean" }, "round_values": { "type": "boolean" }, - "within": { - "oneOf": [ - { "$ref": "CORE-base.json#/$defs/VariableName" }, - { - "items": { "$ref": "CORE-base.json#/$defs/VariableName" }, - "minItems": 1, - "type": "array" - } - ] - }, + "within": { "$ref": "CORE-base.json#/$defs/VariableName" }, "regex": { "type": "string" } }, "required": ["operator"], diff --git a/resources/schema/Operator.md b/resources/schema/Operator.md index b84d0e668..b49068ad2 100644 --- a/resources/schema/Operator.md +++ b/resources/schema/Operator.md @@ -1,7 +1,5 @@ # Check Operator -NOTE: Complementary operators have access to the same paremeter arguments unless otherwise stated. - ## Relational Basic value comparisons and presence checks for evaluating equality, inequality, ranges, and whether values exist or are empty. @@ -759,22 +757,6 @@ True if all values in `value` are contained within the variable `name`. 
- "Unplanned Treatment" ``` -The operator also supports lists: - -```yaml -- name: "$spec_codelist" - operator: "contains_all" - value: "$ppspec_value" -``` - -Where: - -| $spec_codelist | $ppspec_value | -| :-------------------------- | :----------------: | -| ["CODE1", "CODE2", "CODE3"] | ["CODE1", "CODE2"] | -| ["CODE1", "CODE2", "CODE3"] | ["CODE2", "CODE3"] | -| ["CODE1", "CODE2", "CODE3"] | ["CODE1"] | - ### not_contains_all Complement of `contains_all` @@ -791,22 +773,6 @@ Complement of `contains_all` - "Unplanned Treatment" ``` -The operator also supports lists: - -```yaml -- name: "$spec_codelist" - operator: "not_contains_all" - value: "$ppspec_value" -``` - -Where: - -| $spec_codelist | $ppspec_value | -| :-------------------------- | :----------------: | -| ["CODE1", "CODE2", "CODE3"] | ["CODE1", "CODE2"] | -| ["CODE1", "CODE2", "CODE3"] | ["CODE2", "CODE3"] | -| ["CODE1", "CODE2", "CODE3"] | ["CODE1"] | - ### shares_at_least_one_element_with Will raise an issue if at least one of the values in `name` is the same as one of the values in `value`. See [shares_no_elements_with](#shares_no_elements_with). @@ -867,34 +833,17 @@ Relationship Integrity Check > `name` can be a variable containing a list of columns and `value` does not need to be present -> The `regex` parameter allows you to extract portions of values using a regex pattern before checking uniqueness. - -> Compare date only (YYYY-MM-DD) for uniqueness - -```yaml -- name: "--REPNUM" - operator: is_not_unique_set - value: - - "USUBJID" - - "--TESTCD" - - "$TIMING_VARIABLES" - regex: '^\d{4}-\d{2}-\d{2}' -``` - -> Compare by first N characters of a string - ```yaml -- name: "ITEM_ID" - operator: is_not_unique_set - value: - - "USUBJID" - - "CATEGORY" - regex: "^.{2}" +Rule Type: Dataset Contents Check against Define XML +Check: + all: + - name: define_dataset_key_sequence # contains list of dataset key columns + operator: is_unique_set ``` ### is_not_unique_set -Complement of `is_unique_set`. 
+Complement of `is_unique_set` > --SEQ is not unique within DOMAIN, USUBJID, and --TESTCD @@ -1079,15 +1028,13 @@ Complement of `is_ordered_by` ### target_is_sorted_by -True if the values in `name` are ordered according to the values specified by `value` grouped by the values in `within`. Each `value` requires a variable `name`, ordering specified by `order`, and the null position specified by `null_position`. `within` accepts either a single column or an ordered list of columns. +True if the values in `name` are ordered according to the values specified by `value` grouped by the values in `within`. Each `value` requires a variable `name`, ordering specified by `order`, and the null position specified by `null_position`. ```yaml Check: all: - name: --SEQ - within: - - USUBJID - - MIDSTYPE + within: USUBJID operator: target_is_sorted_by value: - name: --STDTC From c99e9a2e5c6b9ecf665079a818aab645de643c30 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Thu, 13 Nov 2025 19:34:33 -0500 Subject: [PATCH 13/19] more cleanup --- README.md | 260 +++++++++++++++++++++++++++--------------------------- 1 file changed, 129 insertions(+), 131 deletions(-) diff --git a/README.md b/README.md index b2070e719..0238a505a 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,20 @@ -### Supported python versions +[![](https://www.cdisc.org/themes/custom/cdiscd8/logo.svg)](https://www.cdisc.org) -[![Python 3.12](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3120) - -### Windows Command Compatibility - -Note: The Windows commands provided in this README are written for PowerShell. While most commands are compatible with both PowerShell and Command Prompt, some adjustments may be necessary when using Command Prompt. If you encounter any issues running these commands in Command Prompt, try using PowerShell or consult the Command Prompt documentation for equivalent commands. 
+[![](https://img.shields.io/badge/python-3.12-blue.svg)](https://www.python.org/downloads/release/python-3120) [![](https://img.shields.io/pypi/v/cdisc-rules-engine.svg)](https://pypi.org/project/cdisc-rules-engine) [![](https://img.shields.io/docker/v/cdiscdocker/cdisc-rules-engine?label=docker)](https://hub.docker.com/r/cdiscdocker/cdisc-rules-engine) # cdisc-rules-engine Open source offering of the CDISC Rules Engine, a tool designed for validating clinical trial data against data standards. -To learn more, visit our official CDISC website or for other implementation options, see our DockerHub repository: -
-[CDISC Website](https://www.cdisc.org/) -
-[CDISC Rules Engine on DockerHub](https://hub.docker.com/repository/docker/cdiscdocker/cdisc-rules-engine/general) -### **Quick start** +## Quick start -To quickly get up and running with CORE, users can download the latest executable version of the engine for their operating system from here: +Note: The Windows commands provided in this README are written for PowerShell. While most commands are compatible with both PowerShell and Command Prompt, some adjustments may be necessary when using Command Prompt. If you encounter any issues running these commands in Command Prompt, try using PowerShell or consult the Command Prompt documentation for equivalent commands. + +To quickly get up and running with CORE, users can download the latest executable version of the engine for their operating system from the [Releases](https://github.com/cdisc-org/cdisc-rules-engine/releases) Once downloaded, simply unzip the file and run the following command based on your Operating System: -Windows: +### Windows: ``` .\core.exe validate -s -v -d path/to/datasets @@ -29,7 +22,7 @@ Windows: # ex: .\core.exe validate -s sdtmig -v 3-4 -d .\xpt\ ``` -Linux/Mac: +### Linux/Mac: ``` ./core validate -s -v -d path/to/datasets @@ -50,68 +43,11 @@ Linux/Mac: > chmod +x ./core > ``` -### **Command-line Interface** - -### **Getting Started** - -In the terminal, navigate to the directory you intend to install CORE rules engine in - -1. Clone the repository: - - ``` - git clone https://github.com/cdisc-org/cdisc-rules-engine - ``` - -2. Ensure you have Python 3.12 installed: - You can check your Python version with: - ``` - python --version - ``` - If you don't have Python 3.12, please download and install it from [python.org](https://www.python.org/downloads/) or using your system's package manager. - -### **Code formatter** - -This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD. 
-It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit. -Both dependencies are added to _requirements-dev.txt_. - -**Required** - -Setting up `pre-commit` requires one extra step. After installing it you have to run - -`pre-commit install` - -This installs `pre-commit` in your `.git/hooks` directory. - -### **Installing dependencies** - -These steps should be run before running any tests or core commands using the non compiled version. - -- Create a virtual environment: - - `python -m venv ` - -NOTE: if you have multiple versions of python on your machine, you can call python 3.12 for the virtual environment's creation instead of the above command: -`python3.12 -m venv ` - -- Activate the virtual environment: - -`.//bin/activate` -- on linux/mac
-`.\\Scripts\Activate` -- on windows - -- Install the requirements. - -`python -m pip install -r requirements-dev.txt` # From the root directory - -### **Running The Tests** - -From the root of the project run the following command (this will run both the unit and regression tests): - -`python -m pytest tests` +## Command-line Interface -### **Running a validation** +**Note**: the following examples are applicable to the source code and have references to "`python core.py`". When using the executable version as described in the [Quick Start](#quick-start) above, instances of "`python cored.py`" should be replaced with "`.\core.exe`" (Windows) or "`./core`" (Linux/Mac). You can also run directly on the source code by following the [Cloning](#cloning) instructions. -#### From the command line +### Running a validation (`validate`) Clone the repository and run `python core.py --help` to see the full list of commands. @@ -213,7 +149,7 @@ Run `python core.py validate --help` to see the list of validation options. --help Show this message and exit. ``` -##### Available log levels +#### Available log levels - `debug` - Display all logs - `info` - Display info, warnings, and error logs @@ -221,7 +157,7 @@ Run `python core.py validate --help` to see the list of validation options. - `error` - Display only error logs - `critical` - Display critical logs -##### **Validate folder** +#### Validate folder To validate a folder using rules for SDTM-IG version 3.4 use the following command: @@ -243,9 +179,10 @@ CORE supports the following dataset file formats for validation: - Define-XML files should be provided via the `--define-xml-path` (or `-dxp`) option, not through the dataset directory (`-d` or `-dp`). - If you point to a folder containing unsupported file formats, CORE will display an error message indicating which formats are supported. 
-##### **Validate single rule** +#### Validate single rule `python core.py validate -s sdtmig -v 3-4 -dp -lr --meddra ./meddra/ --whodrug ./whodrug/` + Note: JSON dataset should match the format provided by the rule editor: ```json @@ -271,7 +208,7 @@ Note: JSON dataset should match the format provided by the rule editor: } ``` -##### **Understanding the Rules Report** +#### **Understanding the Rules Report** The rules report tab displays the run status of each rule selected for validation @@ -280,9 +217,45 @@ The possible rule run statuses are: - `SUCCESS` - The rule ran and data was validated against the rule. May or may not produce results - `SKIPPED` - The rule was unable to be run. Usually due to missing required data, but could also be cause by rule execution errors. -# Additional Core Commands +#### Setting DATASET_SIZE_THRESHOLD for Large Datasets + +The CDISC Rules Engine respects the `DATASET_SIZE_THRESHOLD` environment variable to determine when to use Dask for large dataset processing. Setting this to 0 coerces Dask usage over Pandas. A .env in the root directory with this variable set will cause this implementation coercion for the CLI. 
This can also be done with the executable releases via multiple methods: + +##### Windows (Command Prompt) + +```cmd +set DATASET_SIZE_THRESHOLD=0 && core.exe validate -rest -of -config -commands +``` + +##### Windows (PowerShell) + +```powershell +$env:DATASET_SIZE_THRESHOLD=0; core.exe validate -rest -of -config -commands +``` + +##### Linux/Mac (Bash) + +```bash +DATASET_SIZE_THRESHOLD=0 ./core -rest -of -config -commands +``` + +##### .env File (Alternative) + +Create a `.env` file in the root directory of the release containing: + +``` +DATASET_SIZE_THRESHOLD=0 +``` + +Then run normally: `core.exe validate -rest -of -config -commands + +--- + +**Note:** Setting `DATASET_SIZE_THRESHOLD=0` tells the engine to use Dask processing for all datasets regardless of size, size threshold defaults to 1/4 of available RAM so datasets larger than this will use Dask. See env.example to see what the CLI .env file should look like + +### Updating the Cache (`update-cache`) -**- update-cache** - update locally stored cache data (Requires an environment variable - `CDISC_LIBRARY_API_KEY`) This is stored in the .env folder in the root directory, the API key does not need quotations around it. +Update locally stored cache data (Requires an environment variable - `CDISC_LIBRARY_API_KEY`) This is stored in the .env folder in the root directory, the API key does not need quotations around it. ```bash python core.py update-cache @@ -292,14 +265,14 @@ The possible rule run statuses are: To obtain an api key, please follow the instructions found here: . Please note it can take up to an hour after sign up to have an api key issued -# Custom Standards and Rules +##### Custom Standards and Rules -## Custom Rules Management +###### Custom Rules Management - **Custom rules** are stored in a flat file in the cache, indexed by their core ID (e.g., 'COMPANY-000123' or 'CUSTOM-000123'). - Each rule is stored independently in this file, allowing for efficient lookup and management. 
-## Custom Standards Management +###### Custom Standards Management - **Custom standards** act as a lookup mechanism that maps a standard identifier to a list of applicable rule IDs. - When adding a custom standard, you need to provide a JSON file with the following structure: @@ -345,13 +318,13 @@ To obtain an api key, please follow the instructions found here: ` + +NOTE: if you have multiple versions of python on your machine, you can call python 3.12 for the virtual environment's creation instead of the above command: +`python3.12 -m venv ` + +- Activate the virtual environment: + +`.//bin/activate` -- on linux/mac
+`.\\Scripts\Activate` -- on windows + +- Install the requirements. + +`python -m pip install -r requirements-dev.txt` # From the root directory + +### Creating an executable version **Note:** Further directions to create your own executable are contained in [README_Build_Executable.md](README_Build_Executable.md) if you wish to build an unofficial release executable for your own use. @@ -467,7 +483,7 @@ _Note .venv should be replaced with path to python installation or virtual envir This will create an executable version in the `dist` folder. The version does not require having Python installed and can be launched by running `core` script with all necessary CLI arguments. -### **Creating .whl file** +### Creating .whl file All non-python files should be listed in `MANIFEST.in` to be included in the distribution. Files must be in python package. @@ -498,58 +514,40 @@ To upload built distributive to pypi `py -m pip install --upgrade twine` `py -m twine upload --repository {repository_name} dist/*` -## Submit an Issue - -If you encounter any bugs, have feature requests, or need assistance, please submit an issue on our GitHub repository: - -[https://github.com/cdisc-org/cdisc-rules-engine/issues](https://github.com/cdisc-org/cdisc-rules-engine/issues) - -When submitting an issue, please include: - -- A clear description of the problem or request -- Steps to reproduce the issue (for bugs) -- Your operating system and environment details -- Any relevant logs or error messages - -# Setting DATASET_SIZE_THRESHOLD for Large Datasets - -The CDISC Rules Engine respects the `DATASET_SIZE_THRESHOLD` environment variable to determine when to use Dask for large dataset processing. Setting this to 0 coerces Dask usage over Pandas. A .env in the root directory with this variable set will cause this implementation coercion for the CLI. 
This can also be done with the executable releases via multiple methods: +## Contributing -## Quick Commands +### Code formatter -### Windows (Command Prompt) +This project uses the `black` code formatter, `flake8` linter for python and `prettier` for JSON, YAML and MD. +It also uses `pre-commit` to run `black`, `flake8` and `prettier` when you commit. +Both dependencies are added to _requirements-dev.txt_. -```cmd -set DATASET_SIZE_THRESHOLD=0 && core.exe validate -rest -of -config -commands -``` +Setting up `pre-commit` requires one extra step. After installing it you have to run -### Windows (PowerShell) +`pre-commit install` -```powershell -$env:DATASET_SIZE_THRESHOLD=0; core.exe validate -rest -of -config -commands -``` +This installs `pre-commit` in your `.git/hooks` directory. -### Linux/Mac (Bash) +### Running The Tests -```bash -DATASET_SIZE_THRESHOLD=0 ./core -rest -of -config -commands -``` +From the root of the project run the following command (this will run both the unit and regression tests): -## .env File (Alternative) +`python -m pytest tests` -Create a `.env` file in the root directory of the release containing: +### Submit an Issue -``` -DATASET_SIZE_THRESHOLD=0 -``` +If you encounter any bugs, have feature requests, or need assistance, please submit an issue on our GitHub repository: -Then run normally: `core.exe validate -rest -of -config -commands +[https://github.com/cdisc-org/cdisc-rules-engine/issues](https://github.com/cdisc-org/cdisc-rules-engine/issues) ---- +When submitting an issue, please include: -**Note:** Setting `DATASET_SIZE_THRESHOLD=0` tells the engine to use Dask processing for all datasets regardless of size, size threshold defaults to 1/4 of available RAM so datasets larger than this will use Dask. 
+NOTE: Complementary operators have access to the same parameter arguments unless otherwise stated.
- "Unplanned Treatment" ``` +The operator also supports lists: + +```yaml +- name: "$spec_codelist" + operator: "contains_all" + value: "$ppspec_value" +``` + +Where: + +| $spec_codelist | $ppspec_value | +| :-------------------------- | :----------------: | +| ["CODE1", "CODE2", "CODE3"] | ["CODE1", "CODE2"] | +| ["CODE1", "CODE2", "CODE3"] | ["CODE2", "CODE3"] | +| ["CODE1", "CODE2", "CODE3"] | ["CODE1"] | + ### not_contains_all Complement of `contains_all` @@ -773,6 +791,22 @@ Complement of `contains_all` - "Unplanned Treatment" ``` +The operator also supports lists: + +```yaml +- name: "$spec_codelist" + operator: "not_contains_all" + value: "$ppspec_value" +``` + +Where: + +| $spec_codelist | $ppspec_value | +| :-------------------------- | :----------------: | +| ["CODE1", "CODE2", "CODE3"] | ["CODE1", "CODE2"] | +| ["CODE1", "CODE2", "CODE3"] | ["CODE2", "CODE3"] | +| ["CODE1", "CODE2", "CODE3"] | ["CODE1"] | + ### shares_at_least_one_element_with Will raise an issue if at least one of the values in `name` is the same as one of the values in `value`. See [shares_no_elements_with](#shares_no_elements_with). @@ -833,17 +867,34 @@ Relationship Integrity Check > `name` can be a variable containing a list of columns and `value` does not need to be present +> The `regex` parameter allows you to extract portions of values using a regex pattern before checking uniqueness. 
+ +> Compare date only (YYYY-MM-DD) for uniqueness + ```yaml -Rule Type: Dataset Contents Check against Define XML -Check: - all: - - name: define_dataset_key_sequence # contains list of dataset key columns - operator: is_unique_set +- name: "--REPNUM" + operator: is_not_unique_set + value: + - "USUBJID" + - "--TESTCD" + - "$TIMING_VARIABLES" + regex: '^\d{4}-\d{2}-\d{2}' +``` + +> Compare by first N characters of a string + +```yaml +- name: "ITEM_ID" + operator: is_not_unique_set + value: + - "USUBJID" + - "CATEGORY" + regex: "^.{2}" ``` ### is_not_unique_set -Complement of `is_unique_set` +Complement of `is_unique_set`. > --SEQ is not unique within DOMAIN, USUBJID, and --TESTCD @@ -1028,13 +1079,15 @@ Complement of `is_ordered_by` ### target_is_sorted_by -True if the values in `name` are ordered according to the values specified by `value` grouped by the values in `within`. Each `value` requires a variable `name`, ordering specified by `order`, and the null position specified by `null_position`. +True if the values in `name` are ordered according to the values specified by `value` grouped by the values in `within`. Each `value` requires a variable `name`, ordering specified by `order`, and the null position specified by `null_position`. `within` accepts either a single column or an ordered list of columns. 
```yaml Check: all: - name: --SEQ - within: USUBJID + within: + - USUBJID + - MIDSTYPE operator: target_is_sorted_by value: - name: --STDTC From 0c746f81505e9b4adf18e1160f262cc186c65b72 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Tue, 18 Nov 2025 15:05:07 -0500 Subject: [PATCH 15/19] Refactored to remove redundant code and use enums --- cdisc_rules_engine/check_operators/helpers.py | 282 ++++++------------ 1 file changed, 86 insertions(+), 196 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 2db9752c1..5a652f8c7 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -41,6 +41,19 @@ class DatePrecision(IntEnum): second = 5 microsecond = 6 + @property + def default_value(self): + default_values = { + DatePrecision.year: 1970, + DatePrecision.month: 1, + DatePrecision.day: 1, + DatePrecision.hour: 0, + DatePrecision.minute: 0, + DatePrecision.second: 0, + DatePrecision.microsecond: 0, + } + return default_values[self] + def is_valid_date(date_string: str) -> bool: if date_string is None: @@ -100,157 +113,61 @@ def is_valid_duration(duration: str, negative) -> bool: return True -def get_year(date_string: str): - timestamp = get_date(date_string) - return timestamp.year - - -def get_month(date_string: str): - timestamp = get_date(date_string) - return timestamp.month - - -def get_day(date_string: str): - timestamp = get_date(date_string) - return timestamp.day - - -def get_hour(date_string: str): - timestamp = get_date(date_string) - return timestamp.hour - - -def get_minute(date_string: str): - timestamp = get_date(date_string) - return timestamp.minute - - -def get_second(date_string: str): - timestamp = get_date(date_string) - return timestamp.second - - -def get_microsecond(date_string: str): - timestamp = get_date(date_string) - return timestamp.microsecond - - -def _empty_datetime_components() -> dict: - return { - "year": 
None, - "month": None, - "day": None, - "hour": None, - "minute": None, - "second": None, - "microsecond": None, - } +def _empty_datetime_components(): + return {precision: None for precision in DatePrecision} def _extract_datetime_components(date_str: str) -> dict: """Extract datetime components using regex pattern matching.""" - components = _empty_datetime_components() if not date_str or not isinstance(date_str, str): - return components - + return _empty_datetime_components() match = date_regex.match(date_str) if not match: - return components - - year = match.group("year") or match.group("interval_year") - month = match.group("month") or match.group("interval_month") - day = match.group("day") or match.group("interval_day") - hour = ( - match.group("hour") - or match.group("interval_hour") - or match.group("timeonly_hour") - ) - minute = ( - match.group("minute") - or match.group("interval_minute") - or match.group("timeonly_minute") - ) - second = ( - match.group("second") - or match.group("interval_second") - or match.group("timeonly_second") - ) - microsecond = ( - match.group("microsecond") - or match.group("interval_microsecond") - or match.group("timeonly_microsecond") - ) - - if year and year != "-": - components["year"] = year - if month and month != "-": - components["month"] = month - if day and day != "-": - components["day"] = day - if hour and hour != "-": - components["hour"] = hour - if minute and minute != "-": - components["minute"] = minute - if second and second != "-": - components["second"] = second - if microsecond: - components["microsecond"] = microsecond - + return _empty_datetime_components() + + matches = { + DatePrecision.year: match.group("year") or match.group("interval_year"), + DatePrecision.month: match.group("month") or match.group("interval_month"), + DatePrecision.day: match.group("day") or match.group("interval_day"), + DatePrecision.hour: ( + match.group("hour") + or match.group("interval_hour") + or 
match.group("timeonly_hour") + ), + DatePrecision.minute: ( + match.group("minute") + or match.group("interval_minute") + or match.group("timeonly_minute") + ), + DatePrecision.second: ( + match.group("second") + or match.group("interval_second") + or match.group("timeonly_second") + ), + DatePrecision.microsecond: ( + match.group("microsecond") + or match.group("interval_microsecond") + or match.group("timeonly_microsecond") + ), + } + components = { + precision: None if _check_date_component_missing(component) else component + for precision, component in matches.items() + } return components def _parse_datetime_string(date_str: str): if not date_str or not isinstance(date_str, str): return [], (None, None, None, None), False - - match = date_regex.match(date_str) - if not match: - return [], (None, None, None, None), False - - year = match.group("year") or match.group("interval_year") - month = match.group("month") or match.group("interval_month") - day = match.group("day") or match.group("interval_day") - hour = ( - match.group("hour") - or match.group("interval_hour") - or match.group("timeonly_hour") - ) - minute = ( - match.group("minute") - or match.group("interval_minute") - or match.group("timeonly_minute") - ) - second = ( - match.group("second") - or match.group("interval_second") - or match.group("timeonly_second") - ) - microsecond = ( - match.group("microsecond") - or match.group("interval_microsecond") - or match.group("timeonly_microsecond") - ) - - date_components = [ - year if year and year != "-" else "-", - month if month and month != "-" else "-", - day if day and day != "-" else "-", + components = _extract_datetime_components(date_str) + date_components = list(components.values())[ + DatePrecision.year : DatePrecision.day + 1 ] - - has_time = ( - hour is not None - or minute is not None - or second is not None - or microsecond is not None - ) - - time_components = ( - hour if hour and hour != "-" else None, - minute if minute and minute != 
"-" else None, - second if second and second != "-" else None, - microsecond if microsecond else None, - ) - + time_components = list(components.values())[ + DatePrecision.hour : DatePrecision.microsecond + 1 + ] + has_time = any(component is not None for component in time_components) return date_components, time_components, has_time @@ -258,15 +175,9 @@ def _parse_datetime_string(date_str: str): def detect_datetime_precision(date_str: str) -> DatePrecision | None: if not _datestring_is_valid(date_str): return None - date_components, time_components, has_time = _parse_datetime_string(date_str) - - if all( - component == "-" or component is None or component == "" - for component in date_components - ): + if all(_check_date_component_missing(component) for component in date_components): return None - return _date_and_time_precision(date_components, time_components, has_time) @@ -288,14 +199,12 @@ def _get_precision_before(precision: DatePrecision) -> DatePrecision | None: def _check_date_precision(date_components) -> tuple: - date_component_map = { - DatePrecision.year: 0, - DatePrecision.month: 1, - DatePrecision.day: 2, - } for precision in [DatePrecision.year, DatePrecision.month, DatePrecision.day]: - index = date_component_map[precision] - component = date_components[index] if index < len(date_components) else None + component = ( + date_components[precision.value] + if precision.value < len(date_components) + else None + ) if _check_date_component_missing(component): result = _get_precision_before(precision) return (True, result) @@ -342,41 +251,22 @@ def get_common_precision(dt1: str, dt2: str) -> DatePrecision | None: def get_date_component(component: str, date_string: str): + date = get_date(date_string) try: - precision = DatePrecision[component] + return getattr(date, DatePrecision[component].name) except (KeyError, ValueError): - return get_date(date_string) - - component_func_map = { - DatePrecision.year: get_year, - DatePrecision.month: get_month, - 
DatePrecision.day: get_day, - DatePrecision.hour: get_hour, - DatePrecision.minute: get_minute, - DatePrecision.microsecond: get_microsecond, - DatePrecision.second: get_second, - } - component_function = component_func_map.get(precision) - if component_function: - return component_function(date_string) - else: - return get_date(date_string) + return date def _parse_uncertain_date(date_string: str) -> datetime | None: """Parse uncertain dates with missing components using regex groups.""" components = _extract_datetime_components(date_string) - - year = int(components.get("year") or 1970) - month = int(components.get("month") or 1) - day = int(components.get("day") or 1) - hour = int(components.get("hour") or 0) - minute = int(components.get("minute") or 0) - second = int(components.get("second") or 0) - microsecond = int(components.get("microsecond") or 0) - + component_ints = [ + int(components.get(precision) or precision.default_value) + for precision in DatePrecision + ] try: - return datetime(year, month, day, hour, minute, second, microsecond) + return datetime(*component_ints) except (ValueError, TypeError): return None @@ -394,7 +284,17 @@ def get_date(date_string: str): utc = pytz.UTC return utc.localize(uncertain_date) - date = parse(date_string, default=datetime(1970, 1, 1)) + date = parse( + date_string, + default=datetime( + *[ + precision.default_value + for precision in list(DatePrecision)[ + DatePrecision.year : DatePrecision.day + 1 + ] + ] + ), + ) utc = pytz.UTC if date.tzinfo is not None and date.tzinfo.utcoffset(date) is not None: return date.astimezone(utc) @@ -443,23 +343,13 @@ def case_insensitive_is_in(value, values): def truncate_datetime_to_precision(date_string: str, precision: DatePrecision): dt = get_date(date_string) - match precision: - case DatePrecision.year: - return dt.replace(month=1, day=1, hour=0, minute=0, second=0, microsecond=0) - case DatePrecision.month: - return dt.replace(day=1, hour=0, minute=0, second=0, 
microsecond=0) - case DatePrecision.day: - return dt.replace(hour=0, minute=0, second=0, microsecond=0) - case DatePrecision.hour: - return dt.replace(minute=0, second=0, microsecond=0) - case DatePrecision.minute: - return dt.replace(second=0, microsecond=0) - case DatePrecision.second: - return dt.replace(microsecond=0) - case DatePrecision.microsecond: - return dt - case _: - return dt + if precision is None: + return dt + replacements = { + precision.name: precision.default_value + for precision in list(DatePrecision)[precision.value + 1 :] + } + return dt.replace(**replacements) def _dates_are_comparable(target: str, comparator: str) -> bool: From 83aa3cfbfb3c19f26383b8a1b60b6e75175e0aa0 Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 18 Nov 2025 23:03:08 -0500 Subject: [PATCH 16/19] combine redundant/duplicate functions into one and use enum iteration --- cdisc_rules_engine/check_operators/helpers.py | 49 +++++++------------ 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 5a652f8c7..86092c4f4 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -198,47 +198,34 @@ def _get_precision_before(precision: DatePrecision) -> DatePrecision | None: return None -def _check_date_precision(date_components) -> tuple: - for precision in [DatePrecision.year, DatePrecision.month, DatePrecision.day]: +def _date_and_time_precision( + date_components, time_components, has_time +) -> DatePrecision | None: + date_precisions = list(DatePrecision)[DatePrecision.year : DatePrecision.day + 1] + + for precision in date_precisions: component = ( date_components[precision.value] if precision.value < len(date_components) else None ) if _check_date_component_missing(component): - result = _get_precision_before(precision) - return (True, result) - return (False, None) - + return _get_precision_before(precision) -def 
_check_time_precision(time_components) -> DatePrecision: - if not time_components: - return DatePrecision.day - hour, minute, second, microsecond = time_components - if _check_date_component_missing(hour): + if not has_time or not time_components: return DatePrecision.day - if _check_date_component_missing(minute): - return DatePrecision.hour - if second is None: - return DatePrecision.minute - if _check_date_component_missing(second): - return DatePrecision.minute - if microsecond is None: - return DatePrecision.second - if not microsecond or microsecond == "": - return DatePrecision.second - return DatePrecision.microsecond + time_precisions = list(DatePrecision)[ + DatePrecision.hour : DatePrecision.microsecond + 1 + ] + for i, precision in enumerate(time_precisions): + component = time_components[i] if i < len(time_components) else None + if component is None: + return _get_precision_before(precision) + if _check_date_component_missing(component): + return _get_precision_before(precision) -def _date_and_time_precision( - date_components, time_components, has_time -) -> DatePrecision | None: - found_missing, date_result = _check_date_precision(date_components) - if found_missing: - return date_result - if not has_time or not time_components: - return DatePrecision.day - return _check_time_precision(time_components) + return DatePrecision.microsecond def get_common_precision(dt1: str, dt2: str) -> DatePrecision | None: From e628106294a2144764f698f189d1cd1c056c0ecf Mon Sep 17 00:00:00 2001 From: Rakesh Date: Tue, 18 Nov 2025 23:28:20 -0500 Subject: [PATCH 17/19] redundant check removed --- cdisc_rules_engine/check_operators/helpers.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 86092c4f4..d0759169b 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -220,8 +220,6 @@ def _date_and_time_precision( ] 
for i, precision in enumerate(time_precisions): component = time_components[i] if i < len(time_components) else None - if component is None: - return _get_precision_before(precision) if _check_date_component_missing(component): return _get_precision_before(precision) From d15257a46c7fc99ddbf2b4dc27819e6509d0726a Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Fri, 21 Nov 2025 09:04:18 -0500 Subject: [PATCH 18/19] remove has_time logic --- cdisc_rules_engine/check_operators/helpers.py | 49 +++---------------- 1 file changed, 7 insertions(+), 42 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index d0759169b..485d8864d 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -157,28 +157,14 @@ def _extract_datetime_components(date_str: str) -> dict: return components -def _parse_datetime_string(date_str: str): - if not date_str or not isinstance(date_str, str): - return [], (None, None, None, None), False - components = _extract_datetime_components(date_str) - date_components = list(components.values())[ - DatePrecision.year : DatePrecision.day + 1 - ] - time_components = list(components.values())[ - DatePrecision.hour : DatePrecision.microsecond + 1 - ] - has_time = any(component is not None for component in time_components) - return date_components, time_components, has_time - - @lru_cache(maxsize=1000) def detect_datetime_precision(date_str: str) -> DatePrecision | None: if not _datestring_is_valid(date_str): return None - date_components, time_components, has_time = _parse_datetime_string(date_str) - if all(_check_date_component_missing(component) for component in date_components): + components = _extract_datetime_components(date_str) + if all(_check_date_component_missing(component) for component in components): return None - return _date_and_time_precision(date_components, time_components, has_time) + return 
_date_and_time_precision(components) def _datestring_is_valid(date_str: str) -> bool: @@ -190,36 +176,15 @@ def _check_date_component_missing(component) -> bool: def _get_precision_before(precision: DatePrecision) -> DatePrecision | None: - if precision == DatePrecision.year: - return None prev_index = precision.value - 1 - if prev_index >= 0: - return DatePrecision(prev_index) - return None + return DatePrecision(prev_index) if prev_index >= 0 else None def _date_and_time_precision( - date_components, time_components, has_time + components: dict, ) -> DatePrecision | None: - date_precisions = list(DatePrecision)[DatePrecision.year : DatePrecision.day + 1] - - for precision in date_precisions: - component = ( - date_components[precision.value] - if precision.value < len(date_components) - else None - ) - if _check_date_component_missing(component): - return _get_precision_before(precision) - - if not has_time or not time_components: - return DatePrecision.day - - time_precisions = list(DatePrecision)[ - DatePrecision.hour : DatePrecision.microsecond + 1 - ] - for i, precision in enumerate(time_precisions): - component = time_components[i] if i < len(time_components) else None + for precision in DatePrecision: + component = components[precision] if precision in components else None if _check_date_component_missing(component): return _get_precision_before(precision) From 4eeb473364972ea7ceed6a75092e2eaf6df4ae59 Mon Sep 17 00:00:00 2001 From: Gerry Campion Date: Fri, 21 Nov 2025 09:11:03 -0500 Subject: [PATCH 19/19] fixed overwriting var in scope --- cdisc_rules_engine/check_operators/helpers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cdisc_rules_engine/check_operators/helpers.py b/cdisc_rules_engine/check_operators/helpers.py index 485d8864d..e6bc63ce8 100644 --- a/cdisc_rules_engine/check_operators/helpers.py +++ b/cdisc_rules_engine/check_operators/helpers.py @@ -296,8 +296,8 @@ def truncate_datetime_to_precision(date_string: str, 
precision: DatePrecision): if precision is None: return dt replacements = { - precision.name: precision.default_value - for precision in list(DatePrecision)[precision.value + 1 :] + precision_component.name: precision_component.default_value + for precision_component in list(DatePrecision)[precision.value + 1 :] } return dt.replace(**replacements)