Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 47 additions & 0 deletions robusta_krr/core/integrations/prometheus/metrics/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import asyncio
import datetime
import enum
import logging
from concurrent.futures import ThreadPoolExecutor
from functools import reduce
from typing import Any, Optional, TypedDict
Expand All @@ -18,6 +19,12 @@
from robusta_krr.core.models.config import settings
from robusta_krr.core.models.objects import K8sObjectData

# Module logger, registered under the name "krr".
logger = logging.getLogger("krr")

# Maximum number of data points to request from Prometheus in one range query.
# Using 10,000 as a safety margin below the typical 11,000 hard limit.
# _calculate_safe_step compares (period / step) against this value and widens
# the step when the requested resolution would exceed it.
MAX_PROMETHEUS_POINTS = 10_000


class PrometheusSeries(TypedDict):
metric: dict[str, Any]
Expand Down Expand Up @@ -117,6 +124,42 @@ def _step_to_string(self, step: datetime.timedelta) -> str:
return f"{int(step.total_seconds()) // (60 * 60 * 24)}d"
return f"{int(step.total_seconds()) // 60}m"

def _calculate_safe_step(self, period: datetime.timedelta, step: datetime.timedelta) -> datetime.timedelta:
    """
    Calculate a whole-second step size that won't exceed Prometheus's maximum resolution limit.

    The range query sends the step as ``round(step.total_seconds())`` seconds, so the
    safety check is performed on that same integer value rather than on the raw
    (possibly fractional) step. Otherwise a step such as 1.49s would be validated as
    1.49s but queried as 1s, and the real point count could still exceed the limit.

    If the number of data points (period / step) would exceed MAX_PROMETHEUS_POINTS,
    the smallest whole-second step that keeps the point count at or under the limit
    is returned instead.

    Args:
        period: The time period for the query.
        step: The originally requested step size. Values that round to less than
            one second (including zero or negative steps) are clamped to 1 second.

    Returns:
        A step, expressed in whole seconds, that keeps the number of data points
        at or under MAX_PROMETHEUS_POINTS.
    """
    # Imported locally so this method does not depend on the module's import block.
    import math

    period_seconds = period.total_seconds()

    # Normalize with the same rounding the query string uses, and never go below 1s:
    # a "0s" step is not a usable resolution and would also divide by zero below.
    step_seconds = max(1, round(step.total_seconds()))

    # Calculate expected number of points at the step that will actually be sent
    expected_points = period_seconds / step_seconds

    if expected_points <= MAX_PROMETHEUS_POINTS:
        return datetime.timedelta(seconds=step_seconds)

    # Smallest whole-second step that stays at or under the limit. ceil() (rather than
    # int() + 1) avoids overshooting when period / MAX_PROMETHEUS_POINTS is already whole.
    adjusted_step_seconds = math.ceil(period_seconds / MAX_PROMETHEUS_POINTS)

    logger.debug(
        f"Adjusting step from {step_seconds}s to {adjusted_step_seconds}s to avoid exceeding "
        f"Prometheus max resolution ({expected_points:.0f} points -> {period_seconds / adjusted_step_seconds:.0f} points)"
    )

    return datetime.timedelta(seconds=adjusted_step_seconds)

Comment on lines +127 to +162
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Align safe-step math with the actual query step to avoid still exceeding the limit.

_calculate_safe_step uses the raw step.total_seconds(), but Line 218 rounds the step to whole seconds for the actual query. For fractional steps (e.g., 1.49s), rounding down can inflate the real point count and still hit Prometheus’s 11k cap. Normalize to the same integer-second resolution inside _calculate_safe_step and use that value when building step_str.

🔧 Proposed fix
@@
-import logging
+import logging
+import math
@@
-        step_seconds = step.total_seconds()
+        # Normalize to the same integer-second resolution used in the query
+        step_seconds = max(1, round(step.total_seconds()))
@@
-        if expected_points <= MAX_PROMETHEUS_POINTS:
-            return step
+        if expected_points <= MAX_PROMETHEUS_POINTS:
+            return datetime.timedelta(seconds=step_seconds)
@@
-        min_step_seconds = period_seconds / MAX_PROMETHEUS_POINTS
-
-        # Round up to the nearest second to ensure we're under the limit
-        adjusted_step_seconds = int(min_step_seconds) + 1
+        min_step_seconds = math.ceil(period_seconds / MAX_PROMETHEUS_POINTS)
+
+        # Round up to the nearest second to ensure we're under the limit
+        adjusted_step_seconds = max(step_seconds, min_step_seconds)
@@
-        step_str = f"{round(step.total_seconds())}s"
+        step_str = f"{int(step.total_seconds())}s"

Also applies to: 214-218

🤖 Prompt for AI Agents
In `@robusta_krr/core/integrations/prometheus/metrics/base.py` around lines 127 -
162, The _calculate_safe_step function currently uses the raw
step.total_seconds() while the actual query later rounds step to whole seconds,
which can still exceed MAX_PROMETHEUS_POINTS for fractional steps; modify
_calculate_safe_step to normalize the incoming step to the integer-second
resolution used for queries (e.g., round/truncate step.total_seconds() to an int
second value before computing expected_points and min_step_seconds) and return a
timedelta based on that normalized integer seconds, and ensure the same
normalized integer second value is used when constructing step_str so the safety
calculation and actual query step are identical (refer to _calculate_safe_step
and the code that builds step_str).

@retry(wait=wait_random(min=2, max=10), stop=stop_after_attempt(5))
def _query_prometheus_sync(self, data: PrometheusMetricData) -> list[PrometheusSeries]:
if data.type == QueryType.QueryRange:
Expand Down Expand Up @@ -168,6 +211,10 @@ async def load_data(
ResourceHistoryData: An instance of the ResourceHistoryData class representing the loaded metrics.
"""

# For range queries, adjust step size if needed to avoid exceeding Prometheus limits
if self.query_type == QueryType.QueryRange:
step = self._calculate_safe_step(period, step)

step_str = f"{round(step.total_seconds())}s"
duration_str = self._step_to_string(period)

Expand Down