From 4428036dc43b49f65952bc36c1baf48e7dea881e Mon Sep 17 00:00:00 2001
From: arik
Date: Mon, 26 Jan 2026 14:24:27 +0200
Subject: [PATCH] fix: auto-adjust step size to prevent exceeding Prometheus
 max resolution

When querying Prometheus with long history durations and small step
sizes, the number of data points can exceed Prometheus's maximum
resolution of 11,000 points per timeseries.

This fix automatically increases the step size when the calculated
number of points would exceed 10,000 (using 10,000 as a safety margin
below the 11,000 hard limit).

Fixes #490
---
 .../integrations/prometheus/metrics/base.py | 47 +++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/robusta_krr/core/integrations/prometheus/metrics/base.py b/robusta_krr/core/integrations/prometheus/metrics/base.py
index 347e6b93..8a94c9e9 100644
--- a/robusta_krr/core/integrations/prometheus/metrics/base.py
+++ b/robusta_krr/core/integrations/prometheus/metrics/base.py
@@ -4,6 +4,7 @@
 import asyncio
 import datetime
 import enum
+import logging
 from concurrent.futures import ThreadPoolExecutor
 from functools import reduce
 from typing import Any, Optional, TypedDict
@@ -18,6 +19,12 @@
 from robusta_krr.core.models.config import settings
 from robusta_krr.core.models.objects import K8sObjectData
 
+logger = logging.getLogger("krr")
+
+# Maximum number of data points to request from Prometheus
+# Using 10,000 as a safety margin below the typical 11,000 hard limit
+MAX_PROMETHEUS_POINTS = 10_000
+
 
 class PrometheusSeries(TypedDict):
     metric: dict[str, Any]
@@ -117,6 +124,42 @@ def _step_to_string(self, step: datetime.timedelta) -> str:
             return f"{int(step.total_seconds()) // (60 * 60 * 24)}d"
         return f"{int(step.total_seconds()) // 60}m"
 
+    def _calculate_safe_step(self, period: datetime.timedelta, step: datetime.timedelta) -> datetime.timedelta:
+        """
+        Calculate a step size that won't exceed Prometheus's maximum resolution limit.
+
+        If the number of data points (period / step) would exceed MAX_PROMETHEUS_POINTS,
+        this function returns an increased step size that keeps the point count under the limit.
+
+        Args:
+            period: The time period for the query.
+            step: The originally requested step size.
+
+        Returns:
+            A step size that keeps the number of data points under MAX_PROMETHEUS_POINTS.
+        """
+        period_seconds = period.total_seconds()
+        step_seconds = step.total_seconds()
+
+        # Calculate expected number of points
+        expected_points = period_seconds / step_seconds
+
+        if expected_points <= MAX_PROMETHEUS_POINTS:
+            return step
+
+        # Calculate the minimum step size needed to stay under the limit
+        min_step_seconds = period_seconds / MAX_PROMETHEUS_POINTS
+
+        # Round up to the nearest second to ensure we're under the limit
+        adjusted_step_seconds = int(min_step_seconds) + 1
+
+        logger.debug(
+            f"Adjusting step from {step_seconds}s to {adjusted_step_seconds}s to avoid exceeding "
+            f"Prometheus max resolution ({expected_points:.0f} points -> {period_seconds / adjusted_step_seconds:.0f} points)"
+        )
+
+        return datetime.timedelta(seconds=adjusted_step_seconds)
+
     @retry(wait=wait_random(min=2, max=10), stop=stop_after_attempt(5))
     def _query_prometheus_sync(self, data: PrometheusMetricData) -> list[PrometheusSeries]:
         if data.type == QueryType.QueryRange:
@@ -168,6 +211,10 @@ async def load_data(
             ResourceHistoryData: An instance of the ResourceHistoryData class representing the loaded metrics.
         """
 
+        # For range queries, adjust step size if needed to avoid exceeding Prometheus limits
+        if self.query_type == QueryType.QueryRange:
+            step = self._calculate_safe_step(period, step)
+
         step_str = f"{round(step.total_seconds())}s"
         duration_str = self._step_to_string(period)