From c3e5ac7d9188b8883fd23ed94c046c905dbfd10c Mon Sep 17 00:00:00 2001 From: krishnaShuk Date: Tue, 20 Jan 2026 11:53:17 +0530 Subject: [PATCH 01/11] added retry functionality --- cortex/cli.py | 1 + cortex/coordinator.py | 23 +++++++- cortex/utils/retry.py | 116 ++++++++++++++++++++++++++++++++++++++ tests/test_coordinator.py | 35 ++++++------ tests/test_retry.py | 105 ++++++++++++++++++++++++++++++++++ 5 files changed, 261 insertions(+), 19 deletions(-) create mode 100644 cortex/utils/retry.py create mode 100644 tests/test_retry.py diff --git a/cortex/cli.py b/cortex/cli.py index fb3593d83..45a5fb6ce 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -1000,6 +1000,7 @@ def parallel_log_callback(message: str, level: str = "info"): timeout=300, stop_on_error=True, progress_callback=progress_callback, + max_retries=5, ) result = coordinator.execute() diff --git a/cortex/coordinator.py b/cortex/coordinator.py index ac19bf80f..d14e4cf3c 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -60,6 +60,7 @@ def __init__( enable_rollback: bool = False, log_file: str | None = None, progress_callback: Callable[[int, int, InstallationStep], None] | None = None, + max_retries: int = 5, ): """Initialize an installation run with optional logging and rollback.""" self.timeout = timeout @@ -67,6 +68,7 @@ def __init__( self.enable_rollback = enable_rollback self.log_file = log_file self.progress_callback = progress_callback + self.max_retries = max_retries if descriptions and len(descriptions) != len(commands): raise ValueError("Number of descriptions must match number of commands") @@ -90,6 +92,7 @@ def from_plan( enable_rollback: bool | None = None, log_file: str | None = None, progress_callback: Callable[[int, int, InstallationStep], None] | None = None, + max_retries: int = 5, ) -> "InstallationCoordinator": """Create a coordinator from a structured plan produced by an LLM. @@ -124,6 +127,7 @@ def from_plan( ), log_file=log_file, progress_callback=progress_callback, + max_retries=max_retries, ) for rollback_cmd in rollback_commands: @@ -174,14 +178,29 @@ def _execute_command(self, step: InstallationStep) -> bool: self._log(f"Command blocked: {step.command} - {error}") return False - try: + from cortex.utils.retry import SmartRetry + + def run_cmd(): # Use shell=True carefully - commands are validated first # For complex shell commands (pipes, redirects), shell=True is needed # Simple commands could use shlex.split() with shell=False - result = subprocess.run( + return subprocess.run( step.command, shell=True, capture_output=True, text=True, timeout=self.timeout ) + def status_callback(msg: str): + self._log(msg) + # Optionally update UI here if needed, but for now logging is sufficient + # The CLI progress callback handles the main step status updates + + retry_handler = SmartRetry( + max_retries=self.max_retries, + status_callback=status_callback, + ) + + try: + result = retry_handler.run(run_cmd) + step.return_code = result.returncode step.output = result.stdout step.error = result.stderr diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py new file mode 100644 index 000000000..15d55c1a3 --- /dev/null +++ b/cortex/utils/retry.py @@ -0,0 +1,116 @@ +import logging +import time +from collections.abc import Callable +from typing import Any + +from cortex.error_parser import ErrorCategory, ErrorParser + +logger = logging.getLogger(__name__) + + +class SmartRetry: + """ + Implements smart retry logic with exponential backoff. + Uses ErrorParser to distinguish between transient and permanent errors. + """ + + def __init__( + self, + max_retries: int = 5, + backoff_factor: float = 1.0, + status_callback: Callable[[str], None] | None = None, + ): + self.max_retries = max_retries + self.backoff_factor = backoff_factor + self.status_callback = status_callback + self.error_parser = ErrorParser() + + def run(self, func: Callable[[], Any]) -> Any: + """ + Run a function with smart retry logic. + + Args: + func: The function to execute. Expected to return a result object + that has `returncode`, `stdout`, and `stderr` attributes + (like subprocess.CompletedProcess), or raise an exception. + + Returns: + The result of the function call. + """ + attempt = 0 + last_exception = None + last_result = None + + while attempt <= self.max_retries: + try: + result = func() + last_result = result + + # If result indicates success (returncode 0), return immediately + if hasattr(result, "returncode") and result.returncode == 0: + return result + + # If result indicates failure, analyze it + error_msg = "" + if hasattr(result, "stderr") and result.stderr: + error_msg = result.stderr + elif hasattr(result, "stdout") and result.stdout: + error_msg = result.stdout + + if not self._should_retry(error_msg): + return result + + except Exception as e: + last_exception = e + if not self._should_retry(str(e)): + raise e + + # If we are here, we need to retry (unless max retries reached) + if attempt == self.max_retries: + break + + attempt += 1 + sleep_time = self.backoff_factor * (2 ** (attempt - 1)) + + msg = f"⚠️ Transient error detected. Retrying in {sleep_time}s... (Attempt {attempt}/{self.max_retries})" + logger.warning(msg) + if self.status_callback: + self.status_callback(msg) + + time.sleep(sleep_time) + + if last_exception: + raise last_exception + return last_result + + def _should_retry(self, error_message: str) -> bool: + """ + Determine if we should retry based on the error message. + """ + if not error_message: + # If no error message, assume it's a generic failure that might be transient + return True + + analysis = self.error_parser.parse_error(error_message) + category = analysis.primary_category + + # Retry on network errors, lock errors, or unknown errors (conservative) + if category in [ + ErrorCategory.NETWORK_ERROR, + ErrorCategory.LOCK_ERROR, + ErrorCategory.UNKNOWN, + ]: + return True + + # Fail fast on permanent errors + if category in [ + ErrorCategory.PERMISSION_DENIED, + ErrorCategory.PACKAGE_NOT_FOUND, + ErrorCategory.CONFIGURATION_ERROR, + ErrorCategory.DEPENDENCY_MISSING, + ErrorCategory.CONFLICT, + ]: + return False + + # Default to retry for safety if not explicitly categorized as permanent + return True diff --git a/tests/test_coordinator.py b/tests/test_coordinator.py index a0ad03d42..bf858de10 100644 --- a/tests/test_coordinator.py +++ b/tests/test_coordinator.py @@ -31,8 +31,9 @@ def test_step_duration(self): self.assertEqual(step.duration(), 5.5) +@patch("time.sleep") class TestInstallationCoordinator(unittest.TestCase): - def test_initialization(self): + def test_initialization(self, mock_sleep): commands = ["echo 1", "echo 2"] coordinator = InstallationCoordinator(commands) @@ -40,7 +41,7 @@ def test_initialization(self): self.assertEqual(coordinator.steps[0].command, "echo 1") self.assertEqual(coordinator.steps[1].command, "echo 2") - def test_from_plan_initialization(self): + def test_from_plan_initialization(self, mock_sleep): plan = [ {"command": "echo 1", "description": "First step"}, {"command": "echo 2", "rollback": "echo rollback"}, @@ -54,7 +55,7 @@ def test_from_plan_initialization(self): self.assertTrue(coordinator.enable_rollback) self.assertEqual(coordinator.rollback_commands, ["echo rollback"]) - def test_initialization_with_descriptions(self): + def test_initialization_with_descriptions(self, mock_sleep): commands = ["echo 1", "echo 2"] descriptions = ["First", "Second"] coordinator = InstallationCoordinator(commands, descriptions) @@ -62,7 +63,7 @@ def test_initialization_with_descriptions(self): self.assertEqual(coordinator.steps[0].description, "First") self.assertEqual(coordinator.steps[1].description, "Second") - def test_initialization_mismatched_descriptions(self): + def test_initialization_mismatched_descriptions(self, mock_sleep): commands = ["echo 1", "echo 2"] descriptions = ["First"] @@ -70,7 +71,7 @@ def test_initialization_mismatched_descriptions(self): InstallationCoordinator(commands, descriptions) @patch("subprocess.run") - def test_execute_single_success(self, mock_run): + def test_execute_single_success(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" @@ -85,7 +86,7 @@ def test_execute_single_success(self, mock_run): self.assertEqual(result.steps[0].status, StepStatus.SUCCESS) @patch("subprocess.run") - def test_execute_single_failure(self, mock_run): + def test_execute_single_failure(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 1 mock_result.stdout = "" @@ -100,7 +101,7 @@ def test_execute_single_failure(self, mock_run): self.assertEqual(result.steps[0].status, StepStatus.FAILED) @patch("subprocess.run") - def test_execute_multiple_success(self, mock_run): + def test_execute_multiple_success(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" @@ -115,7 +116,7 @@ def test_execute_multiple_success(self, mock_run): self.assertTrue(all(s.status == StepStatus.SUCCESS for s in result.steps)) @patch("subprocess.run") - def test_execute_stop_on_error(self, mock_run): + def test_execute_stop_on_error(self, mock_run, mock_sleep): def side_effect(*args, **kwargs): cmd = args[0] if args else kwargs.get("shell") if "fail" in str(cmd): @@ -143,7 +144,7 @@ def side_effect(*args, **kwargs): self.assertEqual(result.steps[2].status, StepStatus.SKIPPED) @patch("subprocess.run") - def test_execute_continue_on_error(self, mock_run): + def test_execute_continue_on_error(self, mock_run, mock_sleep): def side_effect(*args, **kwargs): cmd = args[0] if args else kwargs.get("shell") if "fail" in str(cmd): @@ -170,7 +171,7 @@ def side_effect(*args, **kwargs): self.assertEqual(result.steps[2].status, StepStatus.SUCCESS) @patch("subprocess.run") - def test_timeout_handling(self, mock_run): + def test_timeout_handling(self, mock_run, mock_sleep): mock_run.side_effect = Exception("Timeout") coordinator = InstallationCoordinator(["sleep 1000"], timeout=1) @@ -179,7 +180,7 @@ def test_timeout_handling(self, mock_run): self.assertFalse(result.success) self.assertEqual(result.steps[0].status, StepStatus.FAILED) - def test_progress_callback(self): + def test_progress_callback(self, mock_sleep): callback_calls = [] def callback(current, total, step): @@ -199,7 +200,7 @@ def callback(current, total, step): self.assertEqual(callback_calls[0], (1, 2, "echo 1")) self.assertEqual(callback_calls[1], (2, 2, "echo 2")) - def test_log_file(self): + def test_log_file(self, mock_sleep): with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".log") as f: log_file = f.name @@ -223,7 +224,7 @@ def test_log_file(self): os.unlink(log_file) @patch("subprocess.run") - def test_rollback(self, mock_run): + def test_rollback(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 1 mock_result.stdout = "" @@ -238,7 +239,7 @@ def test_rollback(self, mock_run): self.assertGreaterEqual(mock_run.call_count, 2) @patch("subprocess.run") - def test_verify_installation(self, mock_run): + def test_verify_installation(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "Docker version 20.10.0" @@ -252,7 +253,7 @@ def test_verify_installation(self, mock_run): self.assertTrue(verify_results["docker --version"]) - def test_get_summary(self): + def test_get_summary(self, mock_sleep): with patch("subprocess.run") as mock_run: mock_result = Mock() mock_result.returncode = 0 @@ -270,7 +271,7 @@ def test_get_summary(self): self.assertEqual(summary["failed"], 0) self.assertEqual(summary["skipped"], 0) - def test_export_log(self): + def test_export_log(self, mock_sleep): with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: export_file = f.name @@ -299,7 +300,7 @@ def test_export_log(self): os.unlink(export_file) @patch("subprocess.run") - def test_step_timing(self, mock_run): + def test_step_timing(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 000000000..8714cc94d --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,105 @@ +import unittest +from unittest.mock import Mock, patch +import time +from cortex.utils.retry import SmartRetry +from cortex.error_parser import ErrorCategory + +class TestSmartRetry(unittest.TestCase): + def setUp(self): + self.retry = SmartRetry(max_retries=3, backoff_factor=0.01) + + def test_success_first_try(self): + mock_func = Mock() + mock_result = Mock() + mock_result.returncode = 0 + mock_func.return_value = mock_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, mock_result) + self.assertEqual(mock_func.call_count, 1) + + @patch("time.sleep") + def test_retry_on_transient_error(self, mock_sleep): + mock_func = Mock() + + # Fail twice with network error, then succeed + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + success_result = Mock() + success_result.returncode = 0 + + mock_func.side_effect = [fail_result, fail_result, success_result] + + result = self.retry.run(mock_func) + + self.assertEqual(result, success_result) + self.assertEqual(mock_func.call_count, 3) + self.assertEqual(mock_sleep.call_count, 2) + + @patch("time.sleep") + def test_fail_fast_on_permanent_error(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Permission denied" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + self.assertEqual(mock_func.call_count, 1) + mock_sleep.assert_not_called() + + @patch("time.sleep") + def test_max_retries_exceeded(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + self.assertEqual(mock_func.call_count, 4) # Initial + 3 retries + self.assertEqual(mock_sleep.call_count, 3) + + @patch("time.sleep") + def test_exception_retry(self, mock_sleep): + mock_func = Mock() + mock_func.side_effect = [Exception("Network error"), Mock(returncode=0)] + + # We need to mock ErrorParser to classify "Network error" as transient if it's not standard + # But SmartRetry defaults to retry on unknown errors, so generic Exception should trigger retry + + result = self.retry.run(mock_func) + + self.assertEqual(result.returncode, 0) + self.assertEqual(mock_func.call_count, 2) + + @patch("time.sleep") + def test_callback_notification(self, mock_sleep): + callback = Mock() + retry = SmartRetry(max_retries=1, backoff_factor=0.01, status_callback=callback) + + mock_func = Mock() + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + mock_func.side_effect = [fail_result, Mock(returncode=0)] + + retry.run(mock_func) + + callback.assert_called_once() + self.assertIn("Retrying", callback.call_args[0][0]) + +if __name__ == "__main__": + unittest.main() From 769bedcc6e8dced1c86f9c4ebbe7f5eb57807d7f Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Tue, 20 Jan 2026 06:40:48 +0000 Subject: [PATCH 02/11] [autofix.ci] apply automated fixes --- tests/test_retry.py | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/tests/test_retry.py b/tests/test_retry.py index 8714cc94d..bb2fc053a 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -1,8 +1,10 @@ +import time import unittest from unittest.mock import Mock, patch -import time -from cortex.utils.retry import SmartRetry + from cortex.error_parser import ErrorCategory +from cortex.utils.retry import SmartRetry + class TestSmartRetry(unittest.TestCase): def setUp(self): @@ -22,15 +24,15 @@ def test_success_first_try(self): @patch("time.sleep") def test_retry_on_transient_error(self, mock_sleep): mock_func = Mock() - + # Fail twice with network error, then succeed fail_result = Mock() fail_result.returncode = 1 fail_result.stderr = "Connection timed out" - + success_result = Mock() success_result.returncode = 0 - + mock_func.side_effect = [fail_result, fail_result, success_result] result = self.retry.run(mock_func) @@ -42,11 +44,11 @@ def test_retry_on_transient_error(self, mock_sleep): @patch("time.sleep") def test_fail_fast_on_permanent_error(self, mock_sleep): mock_func = Mock() - + fail_result = Mock() fail_result.returncode = 1 fail_result.stderr = "Permission denied" - + mock_func.return_value = fail_result result = self.retry.run(mock_func) @@ -58,17 +60,17 @@ def test_fail_fast_on_permanent_error(self, mock_sleep): @patch("time.sleep") def test_max_retries_exceeded(self, mock_sleep): mock_func = Mock() - + fail_result = Mock() fail_result.returncode = 1 fail_result.stderr = "Connection timed out" - + mock_func.return_value = fail_result result = self.retry.run(mock_func) self.assertEqual(result, fail_result) - self.assertEqual(mock_func.call_count, 4) # Initial + 3 retries + self.assertEqual(mock_func.call_count, 4) # Initial + 3 retries self.assertEqual(mock_sleep.call_count, 3) @patch("time.sleep") @@ -78,9 +80,9 @@ def test_exception_retry(self, mock_sleep): # We need to mock ErrorParser to classify "Network error" as transient if it's not standard # But SmartRetry defaults to retry on unknown errors, so generic Exception should trigger retry - + result = self.retry.run(mock_func) - + self.assertEqual(result.returncode, 0) self.assertEqual(mock_func.call_count, 2) @@ -88,12 +90,12 @@ def test_exception_retry(self, mock_sleep): def test_callback_notification(self, mock_sleep): callback = Mock() retry = SmartRetry(max_retries=1, backoff_factor=0.01, status_callback=callback) - + mock_func = Mock() fail_result = Mock() fail_result.returncode = 1 fail_result.stderr = "Connection timed out" - + mock_func.side_effect = [fail_result, Mock(returncode=0)] retry.run(mock_func) @@ -101,5 +103,6 @@ def test_callback_notification(self, mock_sleep): callback.assert_called_once() self.assertIn("Retrying", callback.call_args[0][0]) + if __name__ == "__main__": unittest.main() From 840f3efd0ea278723699ed992027c4d99100f42f Mon Sep 17 00:00:00 2001 From: krishnaShuk Date: Thu, 22 Jan 2026 12:16:14 +0530 Subject: [PATCH 03/11] added documentation --- docs/COMMANDS.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index a194d67fb..cf7400641 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -73,6 +73,7 @@ cortex install "python3 with pip and virtualenv" --execute - Without `--execute`, Cortex only shows the commands it would run - The `--dry-run` flag is recommended for first-time use to verify commands - Installation is recorded in history for potential rollback +- **Smart Retry Logic**: Cortex automatically detects transient failures (like network timeouts) and retries commands with exponential backoff (up to 5 attempts). Permanent errors (like permission denied) fail immediately. --- From ae299f1c9fed77149cd16089564fa33c23014a47 Mon Sep 17 00:00:00 2001 From: krishnaShuk Date: Thu, 22 Jan 2026 12:42:32 +0530 Subject: [PATCH 04/11] minor fixes --- cortex/coordinator.py | 7 +++---- cortex/utils/retry.py | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 6 deletions(-) diff --git a/cortex/coordinator.py b/cortex/coordinator.py index d14e4cf3c..ead65fd24 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -8,6 +8,7 @@ from datetime import datetime from enum import Enum from typing import Any +from cortex.utils.retry import SmartRetry from cortex.validators import DANGEROUS_PATTERNS @@ -178,8 +179,6 @@ def _execute_command(self, step: InstallationStep) -> bool: self._log(f"Command blocked: {step.command} - {error}") return False - from cortex.utils.retry import SmartRetry - def run_cmd(): # Use shell=True carefully - commands are validated first # For complex shell commands (pipes, redirects), shell=True is needed @@ -190,8 +189,8 @@ def run_cmd(): def status_callback(msg: str): self._log(msg) - # Optionally update UI here if needed, but for now logging is sufficient - # The CLI progress callback handles the main step status updates + # Also print to stdout so the user sees the retry happening + print(msg) retry_handler = SmartRetry( max_retries=self.max_retries, diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py index 15d55c1a3..958298ca6 100644 --- a/cortex/utils/retry.py +++ b/cortex/utils/retry.py @@ -20,8 +20,13 @@ def __init__( backoff_factor: float = 1.0, status_callback: Callable[[str], None] | None = None, ): + if not isinstance(max_retries, int) or max_retries < 0: + raise ValueError("max_retries must be a non-negative integer") + if not isinstance(backoff_factor, (int, float)) or backoff_factor < 0: + raise ValueError("backoff_factor must be a non-negative number") + self.max_retries = max_retries - self.backoff_factor = backoff_factor + self.backoff_factor = float(backoff_factor) self.status_callback = status_callback self.error_parser = ErrorParser() @@ -63,7 +68,7 @@ def run(self, func: Callable[[], Any]) -> Any: except Exception as e: last_exception = e if not self._should_retry(str(e)): - raise e + raise # If we are here, we need to retry (unless max retries reached) if attempt == self.max_retries: @@ -94,6 +99,10 @@ def _should_retry(self, error_message: str) -> bool: analysis = self.error_parser.parse_error(error_message) category = analysis.primary_category + # If the error is explicitly marked as not fixable, fail fast + if not analysis.is_fixable: + return False + # Retry on network errors, lock errors, or unknown errors (conservative) if category in [ ErrorCategory.NETWORK_ERROR, @@ -109,6 +118,7 @@ def _should_retry(self, error_message: str) -> bool: ErrorCategory.CONFIGURATION_ERROR, ErrorCategory.DEPENDENCY_MISSING, ErrorCategory.CONFLICT, + ErrorCategory.DISK_SPACE, ]: return False From 90460571d2f49329ffcc92ff1bc85de51008f321 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Thu, 22 Jan 2026 07:13:13 +0000 Subject: [PATCH 05/11] [autofix.ci] apply automated fixes --- cortex/coordinator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/coordinator.py b/cortex/coordinator.py index ead65fd24..6d077c143 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -8,8 +8,8 @@ from datetime import datetime from enum import Enum from typing import Any -from cortex.utils.retry import SmartRetry +from cortex.utils.retry import SmartRetry from cortex.validators import DANGEROUS_PATTERNS logger = logging.getLogger(__name__) From af526dc2f4a2f228d85bbec460208d81e89c0ba3 Mon Sep 17 00:00:00 2001 From: krishnaShuk Date: Thu, 22 Jan 2026 13:04:17 +0530 Subject: [PATCH 06/11] minor fix --- cortex/coordinator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cortex/coordinator.py b/cortex/coordinator.py index 6d077c143..28ce026f3 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -179,7 +179,7 @@ def _execute_command(self, step: InstallationStep) -> bool: self._log(f"Command blocked: {step.command} - {error}") return False - def run_cmd(): + def run_cmd() -> subprocess.CompletedProcess[str]: # Use shell=True carefully - commands are validated first # For complex shell commands (pipes, redirects), shell=True is needed # Simple commands could use shlex.split() with shell=False @@ -187,7 +187,7 @@ def run_cmd(): step.command, shell=True, capture_output=True, text=True, timeout=self.timeout ) - def status_callback(msg: str): + def status_callback(msg: str) -> None: self._log(msg) # Also print to stdout so the user sees the retry happening print(msg) From c0107a5e5622c055e2694fac82205e5489d2db84 Mon Sep 17 00:00:00 2001 From: krishnaShuk Date: Fri, 23 Jan 2026 23:10:53 +0530 Subject: [PATCH 07/11] added different strategies and documentation --- cortex/cli.py | 4 +- cortex/coordinator.py | 16 +++- cortex/utils/retry.py | 181 +++++++++++++++++++++++++++--------- docs/RETRY_CONFIGURATION.md | 173 ++++++++++++++++++++++++++++++++++ tests/test_retry.py | 144 ++++++++++++++++++++++++---- 5 files changed, 449 insertions(+), 69 deletions(-) create mode 100644 docs/RETRY_CONFIGURATION.md diff --git a/cortex/cli.py b/cortex/cli.py index 84deb0d74..693410efa 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -20,6 +20,7 @@ from cortex.ask import AskHandler from cortex.branding import VERSION, console, cx_header, cx_print, show_banner from cortex.coordinator import InstallationCoordinator, InstallationStep, StepStatus +from cortex.utils.retry import DEFAULT_MAX_RETRIES from cortex.demo import run_demo from cortex.dependency_importer import DependencyImporter, PackageEcosystem, ParseResult from cortex.env_manager import EnvironmentManager, get_env_manager @@ -1445,6 +1446,7 @@ def install( dry_run: bool = False, parallel: bool = False, json_output: bool = False, + max_retries: int = DEFAULT_MAX_RETRIES, ) -> int: """Install software using the LLM-powered package manager.""" # Initialize installation history @@ -1670,7 +1672,7 @@ def parallel_log_callback(message: str, level: str = "info"): timeout=300, stop_on_error=True, progress_callback=progress_callback, - max_retries=5, + max_retries=max_retries, ) result = coordinator.execute() diff --git a/cortex/coordinator.py b/cortex/coordinator.py index 28ce026f3..08122c1ba 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -9,7 +9,12 @@ from enum import Enum from typing import Any -from cortex.utils.retry import SmartRetry +from cortex.utils.retry import ( + DEFAULT_MAX_RETRIES, + ErrorCategory, + SmartRetry, + load_strategies_from_env, +) from cortex.validators import DANGEROUS_PATTERNS logger = logging.getLogger(__name__) @@ -61,7 +66,7 @@ def __init__( enable_rollback: bool = False, log_file: str | None = None, progress_callback: Callable[[int, int, InstallationStep], None] | None = None, - max_retries: int = 5, + max_retries: int = DEFAULT_MAX_RETRIES, ): """Initialize an installation run with optional logging and rollback.""" self.timeout = timeout @@ -192,8 +197,13 @@ def status_callback(msg: str) -> None: # Also print to stdout so the user sees the retry happening print(msg) + # Load strategies and apply CLI override for network errors + strategies = load_strategies_from_env() + if ErrorCategory.NETWORK_ERROR in strategies: + strategies[ErrorCategory.NETWORK_ERROR].max_retries = self.max_retries + retry_handler = SmartRetry( - max_retries=self.max_retries, + strategies=strategies, status_callback=status_callback, ) diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py index 958298ca6..3fa6f12c7 100644 --- a/cortex/utils/retry.py +++ b/cortex/utils/retry.py @@ -1,32 +1,119 @@ import logging +import os import time from collections.abc import Callable +from dataclasses import dataclass from typing import Any from cortex.error_parser import ErrorCategory, ErrorParser logger = logging.getLogger(__name__) +# Default maximum number of retries for the global retry setting +DEFAULT_MAX_RETRIES = 5 + + +@dataclass +class RetryStrategy: + """Configuration for how to retry a specific error type.""" + + max_retries: int + backoff_factor: float + description: str + + +# Default strategies for each retryable error category +DEFAULT_STRATEGIES: dict[ErrorCategory, RetryStrategy] = { + ErrorCategory.NETWORK_ERROR: RetryStrategy( + max_retries=DEFAULT_MAX_RETRIES, + backoff_factor=1.0, + description="Network issues - retry aggressively with short backoff", + ), + ErrorCategory.LOCK_ERROR: RetryStrategy( + max_retries=3, + backoff_factor=5.0, + description="Lock contention - wait longer between retries", + ), + ErrorCategory.UNKNOWN: RetryStrategy( + max_retries=2, + backoff_factor=2.0, + description="Unknown errors - conservative retry", + ), +} + +# Permanent error categories that should never be retried +PERMANENT_ERRORS: set[ErrorCategory] = { + ErrorCategory.PERMISSION_DENIED, + ErrorCategory.PACKAGE_NOT_FOUND, + ErrorCategory.CONFIGURATION_ERROR, + ErrorCategory.DEPENDENCY_MISSING, + ErrorCategory.CONFLICT, + ErrorCategory.DISK_SPACE, +} + + +def load_strategies_from_env() -> dict[ErrorCategory, RetryStrategy]: + """ + Load retry strategies from environment variables, falling back to defaults. + + Environment variables: + CORTEX_RETRY_NETWORK_MAX: Max retries for network errors (default: 5) + CORTEX_RETRY_NETWORK_BACKOFF: Backoff factor for network errors (default: 1.0) + CORTEX_RETRY_LOCK_MAX: Max retries for lock errors (default: 3) + CORTEX_RETRY_LOCK_BACKOFF: Backoff factor for lock errors (default: 5.0) + CORTEX_RETRY_UNKNOWN_MAX: Max retries for unknown errors (default: 2) + CORTEX_RETRY_UNKNOWN_BACKOFF: Backoff factor for unknown errors (default: 2.0) + """ + strategies = dict(DEFAULT_STRATEGIES) + + # Network error overrides + if os.getenv("CORTEX_RETRY_NETWORK_MAX") or os.getenv("CORTEX_RETRY_NETWORK_BACKOFF"): + strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_NETWORK_MAX", "5")), + backoff_factor=float(os.getenv("CORTEX_RETRY_NETWORK_BACKOFF", "1.0")), + description="Network issues (user-configured)", + ) + + # Lock error overrides + if os.getenv("CORTEX_RETRY_LOCK_MAX") or os.getenv("CORTEX_RETRY_LOCK_BACKOFF"): + strategies[ErrorCategory.LOCK_ERROR] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_LOCK_MAX", "3")), + backoff_factor=float(os.getenv("CORTEX_RETRY_LOCK_BACKOFF", "5.0")), + description="Lock contention (user-configured)", + ) + + # Unknown error overrides + if os.getenv("CORTEX_RETRY_UNKNOWN_MAX") or os.getenv("CORTEX_RETRY_UNKNOWN_BACKOFF"): + strategies[ErrorCategory.UNKNOWN] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_UNKNOWN_MAX", "2")), + backoff_factor=float(os.getenv("CORTEX_RETRY_UNKNOWN_BACKOFF", "2.0")), + description="Unknown errors (user-configured)", + ) + + return strategies + class SmartRetry: """ Implements smart retry logic with exponential backoff. Uses ErrorParser to distinguish between transient and permanent errors. + Supports different retry strategies per error category. """ def __init__( self, - max_retries: int = 5, - backoff_factor: float = 1.0, + strategies: dict[ErrorCategory, RetryStrategy] | None = None, status_callback: Callable[[str], None] | None = None, ): - if not isinstance(max_retries, int) or max_retries < 0: - raise ValueError("max_retries must be a non-negative integer") - if not isinstance(backoff_factor, (int, float)) or backoff_factor < 0: - raise ValueError("backoff_factor must be a non-negative number") + """ + Initialize SmartRetry with optional custom strategies. - self.max_retries = max_retries - self.backoff_factor = float(backoff_factor) + Args: + strategies: Custom retry strategies per error category. + If None, loads from environment or uses defaults. + status_callback: Optional callback for status messages. + """ + self.strategies = strategies if strategies is not None else load_strategies_from_env() self.status_callback = status_callback self.error_parser = ErrorParser() @@ -45,8 +132,9 @@ def run(self, func: Callable[[], Any]) -> Any: attempt = 0 last_exception = None last_result = None + current_strategy: RetryStrategy | None = None - while attempt <= self.max_retries: + while True: try: result = func() last_result = result @@ -62,22 +150,34 @@ def run(self, func: Callable[[], Any]) -> Any: elif hasattr(result, "stdout") and result.stdout: error_msg = result.stdout - if not self._should_retry(error_msg): + category = self._get_error_category(error_msg) + current_strategy = self._get_strategy(category) + + if current_strategy is None: + # Permanent error - fail fast return result except Exception as e: last_exception = e - if not self._should_retry(str(e)): + category = self._get_error_category(str(e)) + current_strategy = self._get_strategy(category) + + if current_strategy is None: + # Permanent error - fail fast raise - # If we are here, we need to retry (unless max retries reached) - if attempt == self.max_retries: + # Check if we've exhausted retries for this strategy + if current_strategy is None or attempt >= current_strategy.max_retries: break attempt += 1 - sleep_time = self.backoff_factor * (2 ** (attempt - 1)) + sleep_time = current_strategy.backoff_factor * (2 ** (attempt - 1)) - msg = f"⚠️ Transient error detected. Retrying in {sleep_time}s... (Attempt {attempt}/{self.max_retries})" + category_name = category.name if category else "UNKNOWN" + msg = ( + f"⚠️ {category_name} detected. " + f"Retrying in {sleep_time}s... (Attempt {attempt}/{current_strategy.max_retries})" + ) logger.warning(msg) if self.status_callback: self.status_callback(msg) @@ -88,39 +188,28 @@ def run(self, func: Callable[[], Any]) -> Any: raise last_exception return last_result - def _should_retry(self, error_message: str) -> bool: - """ - Determine if we should retry based on the error message. - """ + def _get_error_category(self, error_message: str) -> ErrorCategory | None: + """Classify the error message into a category.""" if not error_message: - # If no error message, assume it's a generic failure that might be transient - return True + return ErrorCategory.UNKNOWN analysis = self.error_parser.parse_error(error_message) - category = analysis.primary_category - # If the error is explicitly marked as not fixable, fail fast + # If the error is explicitly marked as not fixable, treat as permanent if not analysis.is_fixable: - return False - - # Retry on network errors, lock errors, or unknown errors (conservative) - if category in [ - ErrorCategory.NETWORK_ERROR, - ErrorCategory.LOCK_ERROR, - ErrorCategory.UNKNOWN, - ]: - return True - - # Fail fast on permanent errors - if category in [ - ErrorCategory.PERMISSION_DENIED, - ErrorCategory.PACKAGE_NOT_FOUND, - ErrorCategory.CONFIGURATION_ERROR, - ErrorCategory.DEPENDENCY_MISSING, - ErrorCategory.CONFLICT, - ErrorCategory.DISK_SPACE, - ]: - return False - - # Default to retry for safety if not explicitly categorized as permanent - return True + return None + + return analysis.primary_category + + def _get_strategy(self, category: ErrorCategory | None) -> RetryStrategy | None: + """ + Get the retry strategy for a given error category. + Returns None for permanent errors (should not retry). + """ + if category is None: + return None + + if category in PERMANENT_ERRORS: + return None + + return self.strategies.get(category) diff --git a/docs/RETRY_CONFIGURATION.md b/docs/RETRY_CONFIGURATION.md new file mode 100644 index 000000000..0b8bad7b3 --- /dev/null +++ b/docs/RETRY_CONFIGURATION.md @@ -0,0 +1,173 @@ +# Retry Configuration Guide + +Cortex CLI includes a **Smart Retry** mechanism that automatically recovers from transient failures during package installations. This guide explains how retry logic works and how to configure it. + +## How It Works + +When an installation command fails, Cortex analyzes the error to determine if it's: + +1. **Transient** (temporary, likely to resolve): Network timeouts, lock contention, etc. +2. **Permanent** (unlikely to resolve): Permission denied, package not found, disk full, etc. + +For transient errors, Cortex retries the command with **exponential backoff**—waiting progressively longer between attempts (1s, 2s, 4s, etc.) to allow the issue to resolve. + +## Default Retry Strategies + +Each error type has its own retry strategy: + +| Error Type | Max Retries | Base Backoff | Rationale | +|------------|-------------|--------------|-----------| +| **Network Error** | 5 | 1.0s | Short blips resolve quickly; retry aggressively | +| **Lock Error** | 3 | 5.0s | Locks take time to release; wait longer | +| **Unknown Error** | 2 | 2.0s | Conservative approach for unclassified errors | + +**Permanent errors** (Permission Denied, Package Not Found, Disk Space, Dependency Missing, Configuration Error, Conflict) **never retry**—they fail immediately. + +## Backoff Calculation + +The wait time before each retry uses exponential backoff: + +``` +wait_time = backoff_factor × 2^(attempt - 1) +``` + +Example for Network Error (backoff_factor = 1.0): +- Attempt 1: 1.0s wait +- Attempt 2: 2.0s wait +- Attempt 3: 4.0s wait +- Attempt 4: 8.0s wait +- Attempt 5: 16.0s wait + +## Configuration via Environment Variables + +Override default strategies using environment variables: + +### Network Error Configuration +```bash +export CORTEX_RETRY_NETWORK_MAX=10 # Max retry attempts (default: 5) +export CORTEX_RETRY_NETWORK_BACKOFF=0.5 # Base backoff in seconds (default: 1.0) +``` + +### Lock Error Configuration +```bash +export CORTEX_RETRY_LOCK_MAX=5 # Max retry attempts (default: 3) +export CORTEX_RETRY_LOCK_BACKOFF=10.0 # Base backoff in seconds (default: 5.0) +``` + +### Unknown Error Configuration +```bash +export CORTEX_RETRY_UNKNOWN_MAX=3 # Max retry attempts (default: 2) +export CORTEX_RETRY_UNKNOWN_BACKOFF=1.0 # Base backoff in seconds (default: 2.0) +``` + +## Examples + +### Aggressive Retry for Unstable Networks + +If you're on an unstable connection and want more retries with shorter waits: + +```bash +export CORTEX_RETRY_NETWORK_MAX=10 +export CORTEX_RETRY_NETWORK_BACKOFF=0.5 +cortex install docker --execute +``` + +This gives 10 attempts with waits: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s. + +### Patient Retry for Shared Systems + +If you're on a shared server where `apt` locks are common: + +```bash +export CORTEX_RETRY_LOCK_MAX=5 +export CORTEX_RETRY_LOCK_BACKOFF=30.0 +cortex install nginx --execute +``` + +This gives 5 attempts with waits: 30s, 60s, 120s, 240s, 480s (up to 8 minutes total wait). + +### Disable All Retries + +For CI/CD pipelines where you want fast failure: + +```bash +export CORTEX_RETRY_NETWORK_MAX=0 +export CORTEX_RETRY_LOCK_MAX=0 +export CORTEX_RETRY_UNKNOWN_MAX=0 +cortex install package --execute +``` + +## User Feedback + +During retries, Cortex displays messages like: + +``` +⚠️ NETWORK_ERROR detected. Retrying in 2.0s... (Attempt 2/5) +``` + +This shows: +- The error type that was detected +- How long until the next attempt +- The current attempt number and maximum attempts + +## Error Categories Reference + +### Transient (Retried) + +| Category | Example Errors | +|----------|----------------| +| `NETWORK_ERROR` | "Connection timed out", "Temporary failure resolving" | +| `LOCK_ERROR` | "Could not get lock", "dpkg was interrupted" | +| `UNKNOWN` | Unclassified errors that might be transient | + +### Permanent (Never Retried) + +| Category | Example Errors | +|----------|----------------| +| `PERMISSION_DENIED` | "Permission denied", "Operation not permitted" | +| `PACKAGE_NOT_FOUND` | "Unable to locate package", "No such package" | +| `DISK_SPACE` | "No space left on device" | +| `DEPENDENCY_MISSING` | "Depends: X but it is not installable" | +| `CONFIGURATION_ERROR` | "Configuration file syntax error" | +| `CONFLICT` | "Conflicts with package X" | + +## Programmatic Usage + +For advanced use cases, you can customize strategies in code: + +```python +from cortex.utils.retry import SmartRetry, RetryStrategy, DEFAULT_STRATEGIES +from cortex.error_parser import ErrorCategory + +# Custom strategies +custom_strategies = dict(DEFAULT_STRATEGIES) +custom_strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=10, + backoff_factor=0.5, + description="Custom network retry" +) + +retry = SmartRetry(strategies=custom_strategies) +result = retry.run(my_function) +``` + +## Troubleshooting + +### Retries Not Happening + +1. Check if the error is classified as permanent (see table above) +2. Verify environment variables are set correctly +3. Run with `--verbose` to see detailed error classification + +### Retries Taking Too Long + +Reduce `backoff_factor` or `max_retries` via environment variables. + +### Need More Aggressive Retries + +Increase `max_retries` and decrease `backoff_factor`. + +--- + +**Version**: 0.9.0 +**Last Updated**: January 2026 diff --git a/tests/test_retry.py b/tests/test_retry.py index bb2fc053a..6e7660a25 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -1,14 +1,89 @@ -import time import unittest from unittest.mock import Mock, patch from cortex.error_parser import ErrorCategory -from cortex.utils.retry import SmartRetry +from cortex.utils.retry import ( + DEFAULT_STRATEGIES, + PERMANENT_ERRORS, + RetryStrategy, + SmartRetry, + load_strategies_from_env, +) + + +class TestRetryStrategy(unittest.TestCase): + """Tests for the RetryStrategy dataclass.""" + + def test_strategy_creation(self): + strategy = RetryStrategy(max_retries=5, backoff_factor=1.0, description="Test") + self.assertEqual(strategy.max_retries, 5) + self.assertEqual(strategy.backoff_factor, 1.0) + self.assertEqual(strategy.description, "Test") + + +class TestDefaultStrategies(unittest.TestCase): + """Tests for default strategy configurations.""" + + def test_network_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.NETWORK_ERROR] + self.assertEqual(strategy.max_retries, 5) + self.assertEqual(strategy.backoff_factor, 1.0) + + def test_lock_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.LOCK_ERROR] + self.assertEqual(strategy.max_retries, 3) + self.assertEqual(strategy.backoff_factor, 5.0) + + def test_unknown_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.UNKNOWN] + self.assertEqual(strategy.max_retries, 2) + self.assertEqual(strategy.backoff_factor, 2.0) + + def test_permanent_errors_not_in_strategies(self): + for error in PERMANENT_ERRORS: + self.assertNotIn(error, DEFAULT_STRATEGIES) + + +class TestLoadStrategiesFromEnv(unittest.TestCase): + """Tests for environment variable configuration.""" + + def test_default_strategies_when_no_env_vars(self): + with patch.dict("os.environ", {}, clear=True): + strategies = load_strategies_from_env() + self.assertEqual( + strategies[ErrorCategory.NETWORK_ERROR].max_retries, 5 + ) + + def test_network_override_from_env(self): + with patch.dict( + "os.environ", + {"CORTEX_RETRY_NETWORK_MAX": "10", "CORTEX_RETRY_NETWORK_BACKOFF": "0.5"}, + ): + strategies = load_strategies_from_env() + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 10) + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].backoff_factor, 0.5) + + def test_lock_override_from_env(self): + with patch.dict( + "os.environ", + {"CORTEX_RETRY_LOCK_MAX": "6", "CORTEX_RETRY_LOCK_BACKOFF": "10.0"}, + ): + strategies = load_strategies_from_env() + self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].max_retries, 6) + self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].backoff_factor, 10.0) class TestSmartRetry(unittest.TestCase): + """Tests for SmartRetry class.""" + def setUp(self): - self.retry = SmartRetry(max_retries=3, backoff_factor=0.01) + # Use custom strategies with short backoff for fast tests + self.fast_strategies = { + ErrorCategory.NETWORK_ERROR: RetryStrategy(3, 0.01, "Test network"), + ErrorCategory.LOCK_ERROR: RetryStrategy(2, 0.01, "Test lock"), + ErrorCategory.UNKNOWN: RetryStrategy(2, 0.01, "Test unknown"), + } + self.retry = SmartRetry(strategies=self.fast_strategies) def test_success_first_try(self): mock_func = Mock() @@ -21,11 +96,10 @@ def test_success_first_try(self): self.assertEqual(result, mock_result) self.assertEqual(mock_func.call_count, 1) - @patch("time.sleep") - def test_retry_on_transient_error(self, mock_sleep): + @patch("cortex.utils.retry.time.sleep") + def test_retry_on_network_error(self, mock_sleep): mock_func = Mock() - # Fail twice with network error, then succeed fail_result = Mock() fail_result.returncode = 1 fail_result.stderr = "Connection timed out" @@ -41,8 +115,8 @@ def test_retry_on_transient_error(self, mock_sleep): self.assertEqual(mock_func.call_count, 3) self.assertEqual(mock_sleep.call_count, 2) - @patch("time.sleep") - def test_fail_fast_on_permanent_error(self, mock_sleep): + @patch("cortex.utils.retry.time.sleep") + def test_fail_fast_on_permission_denied(self, mock_sleep): mock_func = Mock() fail_result = Mock() @@ -57,7 +131,23 @@ def test_fail_fast_on_permanent_error(self, mock_sleep): self.assertEqual(mock_func.call_count, 1) mock_sleep.assert_not_called() - @patch("time.sleep") + @patch("cortex.utils.retry.time.sleep") + def test_fail_fast_on_disk_space(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "No space left on device" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + self.assertEqual(mock_func.call_count, 1) + mock_sleep.assert_not_called() + + @patch("cortex.utils.retry.time.sleep") def test_max_retries_exceeded(self, mock_sleep): mock_func = Mock() @@ -70,26 +160,31 @@ def test_max_retries_exceeded(self, mock_sleep): result = self.retry.run(mock_func) self.assertEqual(result, fail_result) - self.assertEqual(mock_func.call_count, 4) # Initial + 3 retries + # 1 initial + 3 retries for network error strategy + self.assertEqual(mock_func.call_count, 4) self.assertEqual(mock_sleep.call_count, 3) - @patch("time.sleep") - def test_exception_retry(self, mock_sleep): + @patch("cortex.utils.retry.time.sleep") + def test_different_strategy_for_lock_error(self, mock_sleep): mock_func = Mock() - mock_func.side_effect = [Exception("Network error"), Mock(returncode=0)] - # We need to mock ErrorParser to classify "Network error" as transient if it's not standard - # But SmartRetry defaults to retry on unknown errors, so generic Exception should trigger retry + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Could not get lock /var/lib/apt/lists/lock" + + mock_func.return_value = fail_result result = self.retry.run(mock_func) - self.assertEqual(result.returncode, 0) - self.assertEqual(mock_func.call_count, 2) + self.assertEqual(result, fail_result) + # Lock error strategy has max_retries=2, so 1 initial + 2 retries = 3 + self.assertEqual(mock_func.call_count, 3) + self.assertEqual(mock_sleep.call_count, 2) - @patch("time.sleep") + @patch("cortex.utils.retry.time.sleep") def test_callback_notification(self, mock_sleep): callback = Mock() - retry = SmartRetry(max_retries=1, backoff_factor=0.01, status_callback=callback) + retry = SmartRetry(strategies=self.fast_strategies, status_callback=callback) mock_func = Mock() fail_result = Mock() @@ -101,8 +196,19 @@ def test_callback_notification(self, mock_sleep): retry.run(mock_func) callback.assert_called_once() + self.assertIn("NETWORK_ERROR", callback.call_args[0][0]) self.assertIn("Retrying", callback.call_args[0][0]) + @patch("cortex.utils.retry.time.sleep") + def test_exception_retry(self, mock_sleep): + mock_func = Mock() + mock_func.side_effect = [Exception("Network error"), Mock(returncode=0)] + + result = self.retry.run(mock_func) + + self.assertEqual(result.returncode, 0) + self.assertEqual(mock_func.call_count, 2) + if __name__ == "__main__": unittest.main() From af3e8825ad7109234541c46d13fcea13d212e2fb Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 23 Jan 2026 17:42:22 +0000 Subject: [PATCH 08/11] [autofix.ci] apply automated fixes --- cortex/cli.py | 2 +- tests/test_retry.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/cortex/cli.py b/cortex/cli.py index 693410efa..6efa9504d 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -20,7 +20,6 @@ from cortex.ask import AskHandler from cortex.branding import VERSION, console, cx_header, cx_print, show_banner from cortex.coordinator import InstallationCoordinator, InstallationStep, StepStatus -from cortex.utils.retry import DEFAULT_MAX_RETRIES from cortex.demo import run_demo from cortex.dependency_importer import DependencyImporter, PackageEcosystem, ParseResult from cortex.env_manager import EnvironmentManager, get_env_manager @@ -41,6 +40,7 @@ ) from cortex.update_checker import UpdateChannel, should_notify_update from cortex.updater import Updater, UpdateStatus +from cortex.utils.retry import DEFAULT_MAX_RETRIES from cortex.validators import validate_api_key, validate_install_request from cortex.version_manager import get_version_string diff --git a/tests/test_retry.py b/tests/test_retry.py index 6e7660a25..c6d8a659e 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -50,9 +50,7 @@ class TestLoadStrategiesFromEnv(unittest.TestCase): def test_default_strategies_when_no_env_vars(self): with patch.dict("os.environ", {}, clear=True): strategies = load_strategies_from_env() - self.assertEqual( - strategies[ErrorCategory.NETWORK_ERROR].max_retries, 5 - ) + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 5) def test_network_override_from_env(self): with patch.dict( From f2bd1e8a2ac284704826132d9e65dd025ad1a17d Mon Sep 17 00:00:00 2001 From: krishnaShuk Date: Fri, 23 Jan 2026 23:31:01 +0530 Subject: [PATCH 09/11] resolved suggestions --- cortex/coordinator.py | 5 +++-- cortex/utils/retry.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/cortex/coordinator.py b/cortex/coordinator.py index 08122c1ba..2183ee815 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -194,8 +194,9 @@ def run_cmd() -> subprocess.CompletedProcess[str]: def status_callback(msg: str) -> None: self._log(msg) - # Also print to stdout so the user sees the retry happening - print(msg) + # Only print to stdout if no progress callback is configured to avoid duplicates + if self.progress_callback is None: + print(msg) # Load strategies and apply CLI override for network errors strategies = load_strategies_from_env() diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py index 3fa6f12c7..598f73313 100644 --- a/cortex/utils/retry.py +++ b/cortex/utils/retry.py @@ -114,6 +114,14 @@ def __init__( status_callback: Optional callback for status messages. """ self.strategies = strategies if strategies is not None else load_strategies_from_env() + + # Validate strategies + for category, strategy in self.strategies.items(): + if strategy.max_retries < 0: + raise ValueError(f"Strategy for {category.name}: max_retries must be non-negative") + if strategy.backoff_factor <= 0: + raise ValueError(f"Strategy for {category.name}: backoff_factor must be positive") + self.status_callback = status_callback self.error_parser = ErrorParser() @@ -147,8 +155,6 @@ def run(self, func: Callable[[], Any]) -> Any: error_msg = "" if hasattr(result, "stderr") and result.stderr: error_msg = result.stderr - elif hasattr(result, "stdout") and result.stdout: - error_msg = result.stdout category = self._get_error_category(error_msg) current_strategy = self._get_strategy(category) @@ -176,7 +182,7 @@ def run(self, func: Callable[[], Any]) -> Any: category_name = category.name if category else "UNKNOWN" msg = ( f"⚠️ {category_name} detected. " - f"Retrying in {sleep_time}s... (Attempt {attempt}/{current_strategy.max_retries})" + f"Retrying in {sleep_time}s... (Retry {attempt}/{current_strategy.max_retries})" ) logger.warning(msg) if self.status_callback: @@ -191,6 +197,7 @@ def run(self, func: Callable[[], Any]) -> Any: def _get_error_category(self, error_message: str) -> ErrorCategory | None: """Classify the error message into a category.""" if not error_message: + logger.warning("Retry: Empty error message detected. Assuming UNKNOWN (transient).") return ErrorCategory.UNKNOWN analysis = self.error_parser.parse_error(error_message) From 4d0162997c14939de7468287618f6591122e8d08 Mon Sep 17 00:00:00 2001 From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com> Date: Fri, 23 Jan 2026 18:01:59 +0000 Subject: [PATCH 10/11] [autofix.ci] apply automated fixes --- cortex/utils/retry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py index 598f73313..ed5903615 100644 --- a/cortex/utils/retry.py +++ b/cortex/utils/retry.py @@ -114,7 +114,7 @@ def __init__( status_callback: Optional callback for status messages. """ self.strategies = strategies if strategies is not None else load_strategies_from_env() - + # Validate strategies for category, strategy in self.strategies.items(): if strategy.max_retries < 0: From 3f2aa2717b7141820a1e2df2f8ccec066078df05 Mon Sep 17 00:00:00 2001 From: krishnaShuk Date: Sat, 24 Jan 2026 00:33:36 +0530 Subject: [PATCH 11/11] minor changes --- cortex/coordinator.py | 9 ++++++++- docs/RETRY_CONFIGURATION.md | 4 ++-- tests/test_retry.py | 2 ++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/cortex/coordinator.py b/cortex/coordinator.py index 2183ee815..1f982c963 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -12,6 +12,7 @@ from cortex.utils.retry import ( DEFAULT_MAX_RETRIES, ErrorCategory, + RetryStrategy, SmartRetry, load_strategies_from_env, ) @@ -201,7 +202,13 @@ def status_callback(msg: str) -> None: # Load strategies and apply CLI override for network errors strategies = load_strategies_from_env() if ErrorCategory.NETWORK_ERROR in strategies: - strategies[ErrorCategory.NETWORK_ERROR].max_retries = self.max_retries + # Create a new instance to avoid mutating the shared default object + original_strategy = strategies[ErrorCategory.NETWORK_ERROR] + strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=self.max_retries, + backoff_factor=original_strategy.backoff_factor, + description=original_strategy.description, + ) retry_handler = SmartRetry( strategies=strategies, diff --git a/docs/RETRY_CONFIGURATION.md b/docs/RETRY_CONFIGURATION.md index 0b8bad7b3..e1acf3c5e 100644 --- a/docs/RETRY_CONFIGURATION.md +++ b/docs/RETRY_CONFIGURATION.md @@ -27,7 +27,7 @@ Each error type has its own retry strategy: The wait time before each retry uses exponential backoff: -``` +```text wait_time = backoff_factor × 2^(attempt - 1) ``` @@ -101,7 +101,7 @@ cortex install package --execute During retries, Cortex displays messages like: -``` +```text ⚠️ NETWORK_ERROR detected. Retrying in 2.0s... (Attempt 2/5) ``` diff --git a/tests/test_retry.py b/tests/test_retry.py index c6d8a659e..9061b6a70 100644 --- a/tests/test_retry.py +++ b/tests/test_retry.py @@ -56,6 +56,7 @@ def test_network_override_from_env(self): with patch.dict( "os.environ", {"CORTEX_RETRY_NETWORK_MAX": "10", "CORTEX_RETRY_NETWORK_BACKOFF": "0.5"}, + clear=True, ): strategies = load_strategies_from_env() self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 10) @@ -65,6 +66,7 @@ def test_lock_override_from_env(self): with patch.dict( "os.environ", {"CORTEX_RETRY_LOCK_MAX": "6", "CORTEX_RETRY_LOCK_BACKOFF": "10.0"}, + clear=True, ): strategies = load_strategies_from_env() self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].max_retries, 6)