diff --git a/cortex/cli.py b/cortex/cli.py index 267228b0..6efa9504 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -40,6 +40,7 @@ ) from cortex.update_checker import UpdateChannel, should_notify_update from cortex.updater import Updater, UpdateStatus +from cortex.utils.retry import DEFAULT_MAX_RETRIES from cortex.validators import validate_api_key, validate_install_request from cortex.version_manager import get_version_string @@ -1445,6 +1446,7 @@ def install( dry_run: bool = False, parallel: bool = False, json_output: bool = False, + max_retries: int = DEFAULT_MAX_RETRIES, ) -> int: """Install software using the LLM-powered package manager.""" # Initialize installation history @@ -1670,6 +1672,7 @@ def parallel_log_callback(message: str, level: str = "info"): timeout=300, stop_on_error=True, progress_callback=progress_callback, + max_retries=max_retries, ) result = coordinator.execute() diff --git a/cortex/coordinator.py b/cortex/coordinator.py index ac19bf80..1f982c96 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -9,6 +9,13 @@ from enum import Enum from typing import Any +from cortex.utils.retry import ( + DEFAULT_MAX_RETRIES, + ErrorCategory, + RetryStrategy, + SmartRetry, + load_strategies_from_env, +) from cortex.validators import DANGEROUS_PATTERNS logger = logging.getLogger(__name__) @@ -60,6 +67,7 @@ def __init__( enable_rollback: bool = False, log_file: str | None = None, progress_callback: Callable[[int, int, InstallationStep], None] | None = None, + max_retries: int = DEFAULT_MAX_RETRIES, ): """Initialize an installation run with optional logging and rollback.""" self.timeout = timeout @@ -67,6 +75,7 @@ def __init__( self.enable_rollback = enable_rollback self.log_file = log_file self.progress_callback = progress_callback + self.max_retries = max_retries if descriptions and len(descriptions) != len(commands): raise ValueError("Number of descriptions must match number of commands") @@ -90,6 +99,7 @@ def from_plan( enable_rollback: 
bool | None = None, log_file: str | None = None, progress_callback: Callable[[int, int, InstallationStep], None] | None = None, + max_retries: int = DEFAULT_MAX_RETRIES, ) -> "InstallationCoordinator": """Create a coordinator from a structured plan produced by an LLM. @@ -124,6 +134,7 @@ def from_plan( ), log_file=log_file, progress_callback=progress_callback, + max_retries=max_retries, ) for rollback_cmd in rollback_commands: @@ -174,14 +185,39 @@ def _execute_command(self, step: InstallationStep) -> bool: self._log(f"Command blocked: {step.command} - {error}") return False - try: + def run_cmd() -> subprocess.CompletedProcess[str]: # Use shell=True carefully - commands are validated first # For complex shell commands (pipes, redirects), shell=True is needed # Simple commands could use shlex.split() with shell=False - result = subprocess.run( + return subprocess.run( step.command, shell=True, capture_output=True, text=True, timeout=self.timeout ) + def status_callback(msg: str) -> None: + self._log(msg) + # Only print to stdout if no progress callback is configured to avoid duplicates + if self.progress_callback is None: + print(msg) + + # Load strategies and apply CLI override for network errors + strategies = load_strategies_from_env() + if ErrorCategory.NETWORK_ERROR in strategies: + # Create a new instance to avoid mutating the shared default object + original_strategy = strategies[ErrorCategory.NETWORK_ERROR] + strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=self.max_retries, + backoff_factor=original_strategy.backoff_factor, + description=original_strategy.description, + ) + + retry_handler = SmartRetry( + strategies=strategies, + status_callback=status_callback, + ) + + try: + result = retry_handler.run(run_cmd) + step.return_code = result.returncode step.output = result.stdout step.error = result.stderr diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py new file mode 100644 index 00000000..ed590361 --- /dev/null +++ b/cortex/utils/retry.py @@
-0,0 +1,222 @@ +import logging +import os +import time +from collections.abc import Callable +from dataclasses import dataclass +from typing import Any + +from cortex.error_parser import ErrorCategory, ErrorParser + +logger = logging.getLogger(__name__) + +# Default maximum number of retries for the global retry setting +DEFAULT_MAX_RETRIES = 5 + + +@dataclass +class RetryStrategy: + """Configuration for how to retry a specific error type.""" + + max_retries: int + backoff_factor: float + description: str + + +# Default strategies for each retryable error category +DEFAULT_STRATEGIES: dict[ErrorCategory, RetryStrategy] = { + ErrorCategory.NETWORK_ERROR: RetryStrategy( + max_retries=DEFAULT_MAX_RETRIES, + backoff_factor=1.0, + description="Network issues - retry aggressively with short backoff", + ), + ErrorCategory.LOCK_ERROR: RetryStrategy( + max_retries=3, + backoff_factor=5.0, + description="Lock contention - wait longer between retries", + ), + ErrorCategory.UNKNOWN: RetryStrategy( + max_retries=2, + backoff_factor=2.0, + description="Unknown errors - conservative retry", + ), +} + +# Permanent error categories that should never be retried +PERMANENT_ERRORS: set[ErrorCategory] = { + ErrorCategory.PERMISSION_DENIED, + ErrorCategory.PACKAGE_NOT_FOUND, + ErrorCategory.CONFIGURATION_ERROR, + ErrorCategory.DEPENDENCY_MISSING, + ErrorCategory.CONFLICT, + ErrorCategory.DISK_SPACE, +} + + +def load_strategies_from_env() -> dict[ErrorCategory, RetryStrategy]: + """ + Load retry strategies from environment variables, falling back to defaults. 
+ + Environment variables: + CORTEX_RETRY_NETWORK_MAX: Max retries for network errors (default: 5) + CORTEX_RETRY_NETWORK_BACKOFF: Backoff factor for network errors (default: 1.0) + CORTEX_RETRY_LOCK_MAX: Max retries for lock errors (default: 3) + CORTEX_RETRY_LOCK_BACKOFF: Backoff factor for lock errors (default: 5.0) + CORTEX_RETRY_UNKNOWN_MAX: Max retries for unknown errors (default: 2) + CORTEX_RETRY_UNKNOWN_BACKOFF: Backoff factor for unknown errors (default: 2.0) + """ + strategies = dict(DEFAULT_STRATEGIES) + + # Network error overrides + if os.getenv("CORTEX_RETRY_NETWORK_MAX") or os.getenv("CORTEX_RETRY_NETWORK_BACKOFF"): + strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_NETWORK_MAX", "5")), + backoff_factor=float(os.getenv("CORTEX_RETRY_NETWORK_BACKOFF", "1.0")), + description="Network issues (user-configured)", + ) + + # Lock error overrides + if os.getenv("CORTEX_RETRY_LOCK_MAX") or os.getenv("CORTEX_RETRY_LOCK_BACKOFF"): + strategies[ErrorCategory.LOCK_ERROR] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_LOCK_MAX", "3")), + backoff_factor=float(os.getenv("CORTEX_RETRY_LOCK_BACKOFF", "5.0")), + description="Lock contention (user-configured)", + ) + + # Unknown error overrides + if os.getenv("CORTEX_RETRY_UNKNOWN_MAX") or os.getenv("CORTEX_RETRY_UNKNOWN_BACKOFF"): + strategies[ErrorCategory.UNKNOWN] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_UNKNOWN_MAX", "2")), + backoff_factor=float(os.getenv("CORTEX_RETRY_UNKNOWN_BACKOFF", "2.0")), + description="Unknown errors (user-configured)", + ) + + return strategies + + +class SmartRetry: + """ + Implements smart retry logic with exponential backoff. + Uses ErrorParser to distinguish between transient and permanent errors. + Supports different retry strategies per error category. 
+ """ + + def __init__( + self, + strategies: dict[ErrorCategory, RetryStrategy] | None = None, + status_callback: Callable[[str], None] | None = None, + ): + """ + Initialize SmartRetry with optional custom strategies. + + Args: + strategies: Custom retry strategies per error category. + If None, loads from environment or uses defaults. + status_callback: Optional callback for status messages. + """ + self.strategies = strategies if strategies is not None else load_strategies_from_env() + + # Validate strategies + for category, strategy in self.strategies.items(): + if strategy.max_retries < 0: + raise ValueError(f"Strategy for {category.name}: max_retries must be non-negative") + if strategy.backoff_factor <= 0: + raise ValueError(f"Strategy for {category.name}: backoff_factor must be positive") + + self.status_callback = status_callback + self.error_parser = ErrorParser() + + def run(self, func: Callable[[], Any]) -> Any: + """ + Run a function with smart retry logic. + + Args: + func: The function to execute. Expected to return a result object + that has `returncode`, `stdout`, and `stderr` attributes + (like subprocess.CompletedProcess), or raise an exception. + + Returns: + The result of the function call. 
+ """ + attempt = 0 + last_exception = None + last_result = None + current_strategy: RetryStrategy | None = None + + while True: + try: + result = func() + last_result = result + + # If result indicates success (returncode 0), return immediately + if hasattr(result, "returncode") and result.returncode == 0: + return result + + # If result indicates failure, analyze it + error_msg = "" + if hasattr(result, "stderr") and result.stderr: + error_msg = result.stderr + + category = self._get_error_category(error_msg) + current_strategy = self._get_strategy(category) + + if current_strategy is None: + # Permanent error - fail fast + return result + + except Exception as e: + last_exception = e + category = self._get_error_category(str(e)) + current_strategy = self._get_strategy(category) + + if current_strategy is None: + # Permanent error - fail fast + raise + + # Check if we've exhausted retries for this strategy + if current_strategy is None or attempt >= current_strategy.max_retries: + break + + attempt += 1 + sleep_time = current_strategy.backoff_factor * (2 ** (attempt - 1)) + + category_name = category.name if category else "UNKNOWN" + msg = ( + f"⚠️ {category_name} detected. " + f"Retrying in {sleep_time}s... (Retry {attempt}/{current_strategy.max_retries})" + ) + logger.warning(msg) + if self.status_callback: + self.status_callback(msg) + + time.sleep(sleep_time) + + if last_exception: + raise last_exception + return last_result + + def _get_error_category(self, error_message: str) -> ErrorCategory | None: + """Classify the error message into a category.""" + if not error_message: + logger.warning("Retry: Empty error message detected. 
Assuming UNKNOWN (transient).") + return ErrorCategory.UNKNOWN + + analysis = self.error_parser.parse_error(error_message) + + # If the error is explicitly marked as not fixable, treat as permanent + if not analysis.is_fixable: + return None + + return analysis.primary_category + + def _get_strategy(self, category: ErrorCategory | None) -> RetryStrategy | None: + """ + Get the retry strategy for a given error category. + Returns None for permanent errors (should not retry). + """ + if category is None: + return None + + if category in PERMANENT_ERRORS: + return None + + return self.strategies.get(category) diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index a194d67f..cf740064 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -73,6 +73,7 @@ cortex install "python3 with pip and virtualenv" --execute - Without `--execute`, Cortex only shows the commands it would run - The `--dry-run` flag is recommended for first-time use to verify commands - Installation is recorded in history for potential rollback +- **Smart Retry Logic**: Cortex automatically detects transient failures (like network timeouts) and retries commands with exponential backoff (up to 5 attempts). Permanent errors (like permission denied) fail immediately. --- diff --git a/docs/RETRY_CONFIGURATION.md b/docs/RETRY_CONFIGURATION.md new file mode 100644 index 00000000..e1acf3c5 --- /dev/null +++ b/docs/RETRY_CONFIGURATION.md @@ -0,0 +1,173 @@ +# Retry Configuration Guide + +Cortex CLI includes a **Smart Retry** mechanism that automatically recovers from transient failures during package installations. This guide explains how retry logic works and how to configure it. + +## How It Works + +When an installation command fails, Cortex analyzes the error to determine if it's: + +1. **Transient** (temporary, likely to resolve): Network timeouts, lock contention, etc. +2. **Permanent** (unlikely to resolve): Permission denied, package not found, disk full, etc. 
+ +For transient errors, Cortex retries the command with **exponential backoff**—waiting progressively longer between attempts (1s, 2s, 4s, etc.) to allow the issue to resolve. + +## Default Retry Strategies + +Each error type has its own retry strategy: + +| Error Type | Max Retries | Base Backoff | Rationale | +|------------|-------------|--------------|-----------| +| **Network Error** | 5 | 1.0s | Short blips resolve quickly; retry aggressively | +| **Lock Error** | 3 | 5.0s | Locks take time to release; wait longer | +| **Unknown Error** | 2 | 2.0s | Conservative approach for unclassified errors | + +**Permanent errors** (Permission Denied, Package Not Found, Disk Space, Dependency Missing, Configuration Error, Conflict) **never retry**—they fail immediately. + +## Backoff Calculation + +The wait time before each retry uses exponential backoff: + +```text +wait_time = backoff_factor × 2^(attempt - 1) +``` + +Example for Network Error (backoff_factor = 1.0): +- Attempt 1: 1.0s wait +- Attempt 2: 2.0s wait +- Attempt 3: 4.0s wait +- Attempt 4: 8.0s wait +- Attempt 5: 16.0s wait + +## Configuration via Environment Variables + +Override default strategies using environment variables: + +### Network Error Configuration +```bash +export CORTEX_RETRY_NETWORK_MAX=10 # Max retry attempts (default: 5) +export CORTEX_RETRY_NETWORK_BACKOFF=0.5 # Base backoff in seconds (default: 1.0) +``` + +### Lock Error Configuration +```bash +export CORTEX_RETRY_LOCK_MAX=5 # Max retry attempts (default: 3) +export CORTEX_RETRY_LOCK_BACKOFF=10.0 # Base backoff in seconds (default: 5.0) +``` + +### Unknown Error Configuration +```bash +export CORTEX_RETRY_UNKNOWN_MAX=3 # Max retry attempts (default: 2) +export CORTEX_RETRY_UNKNOWN_BACKOFF=1.0 # Base backoff in seconds (default: 2.0) +``` + +## Examples + +### Aggressive Retry for Unstable Networks + +If you're on an unstable connection and want more retries with shorter waits: + +```bash +export CORTEX_RETRY_NETWORK_MAX=10 +export 
CORTEX_RETRY_NETWORK_BACKOFF=0.5 +cortex install docker --execute +``` + +This gives up to 10 retries (11 total attempts) with waits: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s. + +### Patient Retry for Shared Systems + +If you're on a shared server where `apt` locks are common: + +```bash +export CORTEX_RETRY_LOCK_MAX=5 +export CORTEX_RETRY_LOCK_BACKOFF=30.0 +cortex install nginx --execute +``` + +This gives up to 5 retries (6 total attempts) with waits: 30s, 60s, 120s, 240s, 480s (about 15.5 minutes of cumulative waiting). + +### Disable All Retries + +For CI/CD pipelines where you want fast failure: + +```bash +export CORTEX_RETRY_NETWORK_MAX=0 +export CORTEX_RETRY_LOCK_MAX=0 +export CORTEX_RETRY_UNKNOWN_MAX=0 +cortex install package --execute +``` + +## User Feedback + +During retries, Cortex displays messages like: + +```text +⚠️ NETWORK_ERROR detected. Retrying in 2.0s... (Retry 2/5) +``` + +This shows: +- The error type that was detected +- How long until the next attempt +- The current retry number and the maximum number of retries + +## Error Categories Reference + +### Transient (Retried) + +| Category | Example Errors | +|----------|----------------| +| `NETWORK_ERROR` | "Connection timed out", "Temporary failure resolving" | +| `LOCK_ERROR` | "Could not get lock", "dpkg was interrupted" | +| `UNKNOWN` | Unclassified errors that might be transient | + +### Permanent (Never Retried) + +| Category | Example Errors | +|----------|----------------| +| `PERMISSION_DENIED` | "Permission denied", "Operation not permitted" | +| `PACKAGE_NOT_FOUND` | "Unable to locate package", "No such package" | +| `DISK_SPACE` | "No space left on device" | +| `DEPENDENCY_MISSING` | "Depends: X but it is not installable" | +| `CONFIGURATION_ERROR` | "Configuration file syntax error" | +| `CONFLICT` | "Conflicts with package X" | + +## Programmatic Usage + +For advanced use cases, you can customize strategies in code: + +```python +from cortex.utils.retry import SmartRetry, RetryStrategy, DEFAULT_STRATEGIES +from cortex.error_parser import
ErrorCategory + +# Custom strategies +custom_strategies = dict(DEFAULT_STRATEGIES) +custom_strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=10, + backoff_factor=0.5, + description="Custom network retry" +) + +retry = SmartRetry(strategies=custom_strategies) +result = retry.run(my_function) +``` + +## Troubleshooting + +### Retries Not Happening + +1. Check if the error is classified as permanent (see table above) +2. Verify environment variables are set correctly +3. Run with `--verbose` to see detailed error classification + +### Retries Taking Too Long + +Reduce `backoff_factor` or `max_retries` via environment variables. + +### Need More Aggressive Retries + +Increase `max_retries` and decrease `backoff_factor`. + +--- + +**Version**: 0.9.0 +**Last Updated**: January 2026 diff --git a/tests/test_coordinator.py b/tests/test_coordinator.py index a0ad03d4..bf858de1 100644 --- a/tests/test_coordinator.py +++ b/tests/test_coordinator.py @@ -31,8 +31,9 @@ def test_step_duration(self): self.assertEqual(step.duration(), 5.5) +@patch("time.sleep") class TestInstallationCoordinator(unittest.TestCase): - def test_initialization(self): + def test_initialization(self, mock_sleep): commands = ["echo 1", "echo 2"] coordinator = InstallationCoordinator(commands) @@ -40,7 +41,7 @@ def test_initialization(self): self.assertEqual(coordinator.steps[0].command, "echo 1") self.assertEqual(coordinator.steps[1].command, "echo 2") - def test_from_plan_initialization(self): + def test_from_plan_initialization(self, mock_sleep): plan = [ {"command": "echo 1", "description": "First step"}, {"command": "echo 2", "rollback": "echo rollback"}, @@ -54,7 +55,7 @@ def test_from_plan_initialization(self): self.assertTrue(coordinator.enable_rollback) self.assertEqual(coordinator.rollback_commands, ["echo rollback"]) - def test_initialization_with_descriptions(self): + def test_initialization_with_descriptions(self, mock_sleep): commands = ["echo 1", "echo 2"] descriptions = 
["First", "Second"] coordinator = InstallationCoordinator(commands, descriptions) @@ -62,7 +63,7 @@ def test_initialization_with_descriptions(self): self.assertEqual(coordinator.steps[0].description, "First") self.assertEqual(coordinator.steps[1].description, "Second") - def test_initialization_mismatched_descriptions(self): + def test_initialization_mismatched_descriptions(self, mock_sleep): commands = ["echo 1", "echo 2"] descriptions = ["First"] @@ -70,7 +71,7 @@ def test_initialization_mismatched_descriptions(self): InstallationCoordinator(commands, descriptions) @patch("subprocess.run") - def test_execute_single_success(self, mock_run): + def test_execute_single_success(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" @@ -85,7 +86,7 @@ def test_execute_single_success(self, mock_run): self.assertEqual(result.steps[0].status, StepStatus.SUCCESS) @patch("subprocess.run") - def test_execute_single_failure(self, mock_run): + def test_execute_single_failure(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 1 mock_result.stdout = "" @@ -100,7 +101,7 @@ def test_execute_single_failure(self, mock_run): self.assertEqual(result.steps[0].status, StepStatus.FAILED) @patch("subprocess.run") - def test_execute_multiple_success(self, mock_run): + def test_execute_multiple_success(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" @@ -115,7 +116,7 @@ def test_execute_multiple_success(self, mock_run): self.assertTrue(all(s.status == StepStatus.SUCCESS for s in result.steps)) @patch("subprocess.run") - def test_execute_stop_on_error(self, mock_run): + def test_execute_stop_on_error(self, mock_run, mock_sleep): def side_effect(*args, **kwargs): cmd = args[0] if args else kwargs.get("shell") if "fail" in str(cmd): @@ -143,7 +144,7 @@ def side_effect(*args, **kwargs): self.assertEqual(result.steps[2].status, StepStatus.SKIPPED) 
@patch("subprocess.run") - def test_execute_continue_on_error(self, mock_run): + def test_execute_continue_on_error(self, mock_run, mock_sleep): def side_effect(*args, **kwargs): cmd = args[0] if args else kwargs.get("shell") if "fail" in str(cmd): @@ -170,7 +171,7 @@ def side_effect(*args, **kwargs): self.assertEqual(result.steps[2].status, StepStatus.SUCCESS) @patch("subprocess.run") - def test_timeout_handling(self, mock_run): + def test_timeout_handling(self, mock_run, mock_sleep): mock_run.side_effect = Exception("Timeout") coordinator = InstallationCoordinator(["sleep 1000"], timeout=1) @@ -179,7 +180,7 @@ def test_timeout_handling(self, mock_run): self.assertFalse(result.success) self.assertEqual(result.steps[0].status, StepStatus.FAILED) - def test_progress_callback(self): + def test_progress_callback(self, mock_sleep): callback_calls = [] def callback(current, total, step): @@ -199,7 +200,7 @@ def callback(current, total, step): self.assertEqual(callback_calls[0], (1, 2, "echo 1")) self.assertEqual(callback_calls[1], (2, 2, "echo 2")) - def test_log_file(self): + def test_log_file(self, mock_sleep): with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".log") as f: log_file = f.name @@ -223,7 +224,7 @@ def test_log_file(self): os.unlink(log_file) @patch("subprocess.run") - def test_rollback(self, mock_run): + def test_rollback(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 1 mock_result.stdout = "" @@ -238,7 +239,7 @@ def test_rollback(self, mock_run): self.assertGreaterEqual(mock_run.call_count, 2) @patch("subprocess.run") - def test_verify_installation(self, mock_run): + def test_verify_installation(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "Docker version 20.10.0" @@ -252,7 +253,7 @@ def test_verify_installation(self, mock_run): self.assertTrue(verify_results["docker --version"]) - def test_get_summary(self): + def test_get_summary(self, mock_sleep): with 
patch("subprocess.run") as mock_run: mock_result = Mock() mock_result.returncode = 0 @@ -270,7 +271,7 @@ def test_get_summary(self): self.assertEqual(summary["failed"], 0) self.assertEqual(summary["skipped"], 0) - def test_export_log(self): + def test_export_log(self, mock_sleep): with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: export_file = f.name @@ -299,7 +300,7 @@ def test_export_log(self): os.unlink(export_file) @patch("subprocess.run") - def test_step_timing(self, mock_run): + def test_step_timing(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 00000000..9061b6a7 --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,214 @@ +import unittest +from unittest.mock import Mock, patch + +from cortex.error_parser import ErrorCategory +from cortex.utils.retry import ( + DEFAULT_STRATEGIES, + PERMANENT_ERRORS, + RetryStrategy, + SmartRetry, + load_strategies_from_env, +) + + +class TestRetryStrategy(unittest.TestCase): + """Tests for the RetryStrategy dataclass.""" + + def test_strategy_creation(self): + strategy = RetryStrategy(max_retries=5, backoff_factor=1.0, description="Test") + self.assertEqual(strategy.max_retries, 5) + self.assertEqual(strategy.backoff_factor, 1.0) + self.assertEqual(strategy.description, "Test") + + +class TestDefaultStrategies(unittest.TestCase): + """Tests for default strategy configurations.""" + + def test_network_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.NETWORK_ERROR] + self.assertEqual(strategy.max_retries, 5) + self.assertEqual(strategy.backoff_factor, 1.0) + + def test_lock_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.LOCK_ERROR] + self.assertEqual(strategy.max_retries, 3) + self.assertEqual(strategy.backoff_factor, 5.0) + + def test_unknown_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.UNKNOWN] + 
self.assertEqual(strategy.max_retries, 2) + self.assertEqual(strategy.backoff_factor, 2.0) + + def test_permanent_errors_not_in_strategies(self): + for error in PERMANENT_ERRORS: + self.assertNotIn(error, DEFAULT_STRATEGIES) + + +class TestLoadStrategiesFromEnv(unittest.TestCase): + """Tests for environment variable configuration.""" + + def test_default_strategies_when_no_env_vars(self): + with patch.dict("os.environ", {}, clear=True): + strategies = load_strategies_from_env() + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 5) + + def test_network_override_from_env(self): + with patch.dict( + "os.environ", + {"CORTEX_RETRY_NETWORK_MAX": "10", "CORTEX_RETRY_NETWORK_BACKOFF": "0.5"}, + clear=True, + ): + strategies = load_strategies_from_env() + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 10) + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].backoff_factor, 0.5) + + def test_lock_override_from_env(self): + with patch.dict( + "os.environ", + {"CORTEX_RETRY_LOCK_MAX": "6", "CORTEX_RETRY_LOCK_BACKOFF": "10.0"}, + clear=True, + ): + strategies = load_strategies_from_env() + self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].max_retries, 6) + self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].backoff_factor, 10.0) + + +class TestSmartRetry(unittest.TestCase): + """Tests for SmartRetry class.""" + + def setUp(self): + # Use custom strategies with short backoff for fast tests + self.fast_strategies = { + ErrorCategory.NETWORK_ERROR: RetryStrategy(3, 0.01, "Test network"), + ErrorCategory.LOCK_ERROR: RetryStrategy(2, 0.01, "Test lock"), + ErrorCategory.UNKNOWN: RetryStrategy(2, 0.01, "Test unknown"), + } + self.retry = SmartRetry(strategies=self.fast_strategies) + + def test_success_first_try(self): + mock_func = Mock() + mock_result = Mock() + mock_result.returncode = 0 + mock_func.return_value = mock_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, mock_result) + 
self.assertEqual(mock_func.call_count, 1) + + @patch("cortex.utils.retry.time.sleep") + def test_retry_on_network_error(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + success_result = Mock() + success_result.returncode = 0 + + mock_func.side_effect = [fail_result, fail_result, success_result] + + result = self.retry.run(mock_func) + + self.assertEqual(result, success_result) + self.assertEqual(mock_func.call_count, 3) + self.assertEqual(mock_sleep.call_count, 2) + + @patch("cortex.utils.retry.time.sleep") + def test_fail_fast_on_permission_denied(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Permission denied" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + self.assertEqual(mock_func.call_count, 1) + mock_sleep.assert_not_called() + + @patch("cortex.utils.retry.time.sleep") + def test_fail_fast_on_disk_space(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "No space left on device" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + self.assertEqual(mock_func.call_count, 1) + mock_sleep.assert_not_called() + + @patch("cortex.utils.retry.time.sleep") + def test_max_retries_exceeded(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + # 1 initial + 3 retries for network error strategy + self.assertEqual(mock_func.call_count, 4) + self.assertEqual(mock_sleep.call_count, 3) + + @patch("cortex.utils.retry.time.sleep") + def test_different_strategy_for_lock_error(self, mock_sleep): + 
mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Could not get lock /var/lib/apt/lists/lock" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + # Lock error strategy has max_retries=2, so 1 initial + 2 retries = 3 + self.assertEqual(mock_func.call_count, 3) + self.assertEqual(mock_sleep.call_count, 2) + + @patch("cortex.utils.retry.time.sleep") + def test_callback_notification(self, mock_sleep): + callback = Mock() + retry = SmartRetry(strategies=self.fast_strategies, status_callback=callback) + + mock_func = Mock() + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + mock_func.side_effect = [fail_result, Mock(returncode=0)] + + retry.run(mock_func) + + callback.assert_called_once() + self.assertIn("NETWORK_ERROR", callback.call_args[0][0]) + self.assertIn("Retrying", callback.call_args[0][0]) + + @patch("cortex.utils.retry.time.sleep") + def test_exception_retry(self, mock_sleep): + mock_func = Mock() + mock_func.side_effect = [Exception("Network error"), Mock(returncode=0)] + + result = self.retry.run(mock_func) + + self.assertEqual(result.returncode, 0) + self.assertEqual(mock_func.call_count, 2) + + +if __name__ == "__main__": + unittest.main()