From c3e5ac7d9188b8883fd23ed94c046c905dbfd10c Mon Sep 17 00:00:00 2001
From: krishnaShuk <shuklakrishna.kris@gmail.com>
Date: Tue, 20 Jan 2026 11:53:17 +0530
Subject: [PATCH 01/11] added retry functionality

---
 cortex/cli.py             |   1 +
 cortex/coordinator.py     |  23 +++++++-
 cortex/utils/retry.py     | 116 ++++++++++++++++++++++++++++++++++++++
 tests/test_coordinator.py |  35 ++++++------
 tests/test_retry.py       | 105 ++++++++++++++++++++++++++++++++++
 5 files changed, 261 insertions(+), 19 deletions(-)
 create mode 100644 cortex/utils/retry.py
 create mode 100644 tests/test_retry.py

diff --git a/cortex/cli.py b/cortex/cli.py
index fb3593d83..45a5fb6ce 100644
--- a/cortex/cli.py
+++ b/cortex/cli.py
@@ -1000,6 +1000,7 @@ def parallel_log_callback(message: str, level: str = "info"):
                     timeout=300,
                     stop_on_error=True,
                     progress_callback=progress_callback,
+                    max_retries=5,
                 )
 
                 result = coordinator.execute()
diff --git a/cortex/coordinator.py b/cortex/coordinator.py
index ac19bf80f..d14e4cf3c 100644
--- a/cortex/coordinator.py
+++ b/cortex/coordinator.py
@@ -60,6 +60,7 @@ def __init__(
         enable_rollback: bool = False,
         log_file: str | None = None,
         progress_callback: Callable[[int, int, InstallationStep], None] | None = None,
+        max_retries: int = 5,
     ):
         """Initialize an installation run with optional logging and rollback."""
         self.timeout = timeout
@@ -67,6 +68,7 @@ def __init__(
         self.enable_rollback = enable_rollback
         self.log_file = log_file
         self.progress_callback = progress_callback
+        self.max_retries = max_retries
 
         if descriptions and len(descriptions) != len(commands):
             raise ValueError("Number of descriptions must match number of commands")
@@ -90,6 +92,7 @@ def from_plan(
         enable_rollback: bool | None = None,
         log_file: str | None = None,
         progress_callback: Callable[[int, int, InstallationStep], None] | None = None,
+        max_retries: int = 5,
     ) -> "InstallationCoordinator":
         """Create a coordinator from a structured plan produced by an LLM.
 
@@ -124,6 +127,7 @@ def from_plan(
             ),
             log_file=log_file,
             progress_callback=progress_callback,
+            max_retries=max_retries,
         )
 
         for rollback_cmd in rollback_commands:
@@ -174,14 +178,29 @@ def _execute_command(self, step: InstallationStep) -> bool:
             self._log(f"Command blocked: {step.command} - {error}")
             return False
 
-        try:
+        from cortex.utils.retry import SmartRetry
+
+        def run_cmd():
             # Use shell=True carefully - commands are validated first
             # For complex shell commands (pipes, redirects), shell=True is needed
             # Simple commands could use shlex.split() with shell=False
-            result = subprocess.run(
+            return subprocess.run(
                 step.command, shell=True, capture_output=True, text=True, timeout=self.timeout
             )
 
+        def status_callback(msg: str):
+            self._log(msg)
+            # Optionally update UI here if needed, but for now logging is sufficient
+            # The CLI progress callback handles the main step status updates
+
+        retry_handler = SmartRetry(
+            max_retries=self.max_retries,
+            status_callback=status_callback,
+        )
+
+        try:
+            result = retry_handler.run(run_cmd)
+
             step.return_code = result.returncode
             step.output = result.stdout
             step.error = result.stderr
diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py
new file mode 100644
index 000000000..15d55c1a3
--- /dev/null
+++ b/cortex/utils/retry.py
@@ -0,0 +1,116 @@
+import logging
+import time
+from collections.abc import Callable
+from typing import Any
+
+from cortex.error_parser import ErrorCategory, ErrorParser
+
+logger = logging.getLogger(__name__)
+
+
+class SmartRetry:
+    """
+    Implements smart retry logic with exponential backoff.
+    Uses ErrorParser to distinguish between transient and permanent errors.
+    """
+
+    def __init__(
+        self,
+        max_retries: int = 5,
+        backoff_factor: float = 1.0,
+        status_callback: Callable[[str], None] | None = None,
+    ):
+        self.max_retries = max_retries
+        self.backoff_factor = backoff_factor
+        self.status_callback = status_callback
+        self.error_parser = ErrorParser()
+
+    def run(self, func: Callable[[], Any]) -> Any:
+        """
+        Run a function with smart retry logic.
+
+        Args:
+            func: The function to execute. Expected to return a result object
+                  that has `returncode`, `stdout`, and `stderr` attributes
+                  (like subprocess.CompletedProcess), or raise an exception.
+
+        Returns:
+            The result of the function call.
+        """
+        attempt = 0
+        last_exception = None
+        last_result = None
+
+        while attempt <= self.max_retries:
+            try:
+                result = func()
+                last_result = result
+
+                # If result indicates success (returncode 0), return immediately
+                if hasattr(result, "returncode") and result.returncode == 0:
+                    return result
+
+                # If result indicates failure, analyze it
+                error_msg = ""
+                if hasattr(result, "stderr") and result.stderr:
+                    error_msg = result.stderr
+                elif hasattr(result, "stdout") and result.stdout:
+                    error_msg = result.stdout
+
+                if not self._should_retry(error_msg):
+                    return result
+
+            except Exception as e:
+                last_exception = e
+                if not self._should_retry(str(e)):
+                    raise e
+
+            # If we are here, we need to retry (unless max retries reached)
+            if attempt == self.max_retries:
+                break
+
+            attempt += 1
+            sleep_time = self.backoff_factor * (2 ** (attempt - 1))
+
+            msg = f"⚠️ Transient error detected. Retrying in {sleep_time}s... (Attempt {attempt}/{self.max_retries})"
+            logger.warning(msg)
+            if self.status_callback:
+                self.status_callback(msg)
+
+            time.sleep(sleep_time)
+
+        if last_exception:
+            raise last_exception
+        return last_result
+
+    def _should_retry(self, error_message: str) -> bool:
+        """
+        Determine if we should retry based on the error message.
+        """
+        if not error_message:
+            # If no error message, assume it's a generic failure that might be transient
+            return True
+
+        analysis = self.error_parser.parse_error(error_message)
+        category = analysis.primary_category
+
+        # Retry on network errors, lock errors, or unknown errors (conservative)
+        if category in [
+            ErrorCategory.NETWORK_ERROR,
+            ErrorCategory.LOCK_ERROR,
+            ErrorCategory.UNKNOWN,
+        ]:
+            return True
+
+        # Fail fast on permanent errors
+        if category in [
+            ErrorCategory.PERMISSION_DENIED,
+            ErrorCategory.PACKAGE_NOT_FOUND,
+            ErrorCategory.CONFIGURATION_ERROR,
+            ErrorCategory.DEPENDENCY_MISSING,
+            ErrorCategory.CONFLICT,
+        ]:
+            return False
+
+        # Default to retry for safety if not explicitly categorized as permanent
+        return True
diff --git a/tests/test_coordinator.py b/tests/test_coordinator.py
index a0ad03d42..bf858de10 100644
--- a/tests/test_coordinator.py
+++ b/tests/test_coordinator.py
@@ -31,8 +31,9 @@ def test_step_duration(self):
         self.assertEqual(step.duration(), 5.5)
 
 
+@patch("time.sleep")
 class TestInstallationCoordinator(unittest.TestCase):
-    def test_initialization(self):
+    def test_initialization(self, mock_sleep):
         commands = ["echo 1", "echo 2"]
         coordinator = InstallationCoordinator(commands)
 
@@ -40,7 +41,7 @@ def test_initialization(self):
         self.assertEqual(coordinator.steps[0].command, "echo 1")
         self.assertEqual(coordinator.steps[1].command, "echo 2")
 
-    def test_from_plan_initialization(self):
+    def test_from_plan_initialization(self, mock_sleep):
         plan = [
             {"command": "echo 1", "description": "First step"},
             {"command": "echo 2", "rollback": "echo rollback"},
@@ -54,7 +55,7 @@ def test_from_plan_initialization(self):
         self.assertTrue(coordinator.enable_rollback)
         self.assertEqual(coordinator.rollback_commands, ["echo rollback"])
 
-    def test_initialization_with_descriptions(self):
+    def test_initialization_with_descriptions(self, mock_sleep):
         commands = ["echo 1", "echo 2"]
         descriptions = ["First", "Second"]
         coordinator = InstallationCoordinator(commands, descriptions)
@@ -62,7 +63,7 @@ def test_initialization_with_descriptions(self):
         self.assertEqual(coordinator.steps[0].description, "First")
         self.assertEqual(coordinator.steps[1].description, "Second")
 
-    def test_initialization_mismatched_descriptions(self):
+    def test_initialization_mismatched_descriptions(self, mock_sleep):
         commands = ["echo 1", "echo 2"]
         descriptions = ["First"]
 
@@ -70,7 +71,7 @@ def test_initialization_mismatched_descriptions(self):
             InstallationCoordinator(commands, descriptions)
 
     @patch("subprocess.run")
-    def test_execute_single_success(self, mock_run):
+    def test_execute_single_success(self, mock_run, mock_sleep):
         mock_result = Mock()
         mock_result.returncode = 0
         mock_result.stdout = "success"
@@ -85,7 +86,7 @@ def test_execute_single_success(self, mock_run):
         self.assertEqual(result.steps[0].status, StepStatus.SUCCESS)
 
     @patch("subprocess.run")
-    def test_execute_single_failure(self, mock_run):
+    def test_execute_single_failure(self, mock_run, mock_sleep):
         mock_result = Mock()
         mock_result.returncode = 1
         mock_result.stdout = ""
@@ -100,7 +101,7 @@ def test_execute_single_failure(self, mock_run):
         self.assertEqual(result.steps[0].status, StepStatus.FAILED)
 
     @patch("subprocess.run")
-    def test_execute_multiple_success(self, mock_run):
+    def test_execute_multiple_success(self, mock_run, mock_sleep):
         mock_result = Mock()
         mock_result.returncode = 0
         mock_result.stdout = "success"
@@ -115,7 +116,7 @@ def test_execute_multiple_success(self, mock_run):
         self.assertTrue(all(s.status == StepStatus.SUCCESS for s in result.steps))
 
     @patch("subprocess.run")
-    def test_execute_stop_on_error(self, mock_run):
+    def test_execute_stop_on_error(self, mock_run, mock_sleep):
         def side_effect(*args, **kwargs):
             cmd = args[0] if args else kwargs.get("shell")
             if "fail" in str(cmd):
@@ -143,7 +144,7 @@ def side_effect(*args, **kwargs):
         self.assertEqual(result.steps[2].status, StepStatus.SKIPPED)
 
     @patch("subprocess.run")
-    def test_execute_continue_on_error(self, mock_run):
+    def test_execute_continue_on_error(self, mock_run, mock_sleep):
         def side_effect(*args, **kwargs):
             cmd = args[0] if args else kwargs.get("shell")
             if "fail" in str(cmd):
@@ -170,7 +171,7 @@ def side_effect(*args, **kwargs):
         self.assertEqual(result.steps[2].status, StepStatus.SUCCESS)
 
     @patch("subprocess.run")
-    def test_timeout_handling(self, mock_run):
+    def test_timeout_handling(self, mock_run, mock_sleep):
         mock_run.side_effect = Exception("Timeout")
 
         coordinator = InstallationCoordinator(["sleep 1000"], timeout=1)
@@ -179,7 +180,7 @@ def test_timeout_handling(self, mock_run):
         self.assertFalse(result.success)
         self.assertEqual(result.steps[0].status, StepStatus.FAILED)
 
-    def test_progress_callback(self):
+    def test_progress_callback(self, mock_sleep):
         callback_calls = []
 
         def callback(current, total, step):
@@ -199,7 +200,7 @@ def callback(current, total, step):
         self.assertEqual(callback_calls[0], (1, 2, "echo 1"))
         self.assertEqual(callback_calls[1], (2, 2, "echo 2"))
 
-    def test_log_file(self):
+    def test_log_file(self, mock_sleep):
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".log") as f:
             log_file = f.name
 
@@ -223,7 +224,7 @@ def test_log_file(self):
                 os.unlink(log_file)
 
     @patch("subprocess.run")
-    def test_rollback(self, mock_run):
+    def test_rollback(self, mock_run, mock_sleep):
         mock_result = Mock()
         mock_result.returncode = 1
         mock_result.stdout = ""
@@ -238,7 +239,7 @@ def test_rollback(self, mock_run):
         self.assertGreaterEqual(mock_run.call_count, 2)
 
     @patch("subprocess.run")
-    def test_verify_installation(self, mock_run):
+    def test_verify_installation(self, mock_run, mock_sleep):
         mock_result = Mock()
         mock_result.returncode = 0
         mock_result.stdout = "Docker version 20.10.0"
@@ -252,7 +253,7 @@ def test_verify_installation(self, mock_run):
 
         self.assertTrue(verify_results["docker --version"])
 
-    def test_get_summary(self):
+    def test_get_summary(self, mock_sleep):
         with patch("subprocess.run") as mock_run:
             mock_result = Mock()
             mock_result.returncode = 0
@@ -270,7 +271,7 @@ def test_get_summary(self):
             self.assertEqual(summary["failed"], 0)
             self.assertEqual(summary["skipped"], 0)
 
-    def test_export_log(self):
+    def test_export_log(self, mock_sleep):
         with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f:
             export_file = f.name
 
@@ -299,7 +300,7 @@ def test_export_log(self):
                 os.unlink(export_file)
 
     @patch("subprocess.run")
-    def test_step_timing(self, mock_run):
+    def test_step_timing(self, mock_run, mock_sleep):
         mock_result = Mock()
         mock_result.returncode = 0
         mock_result.stdout = "success"
diff --git a/tests/test_retry.py b/tests/test_retry.py
new file mode 100644
index 000000000..8714cc94d
--- /dev/null
+++ b/tests/test_retry.py
@@ -0,0 +1,105 @@
+import unittest
+from unittest.mock import Mock, patch
+import time
+from cortex.utils.retry import SmartRetry
+from cortex.error_parser import ErrorCategory
+
+class TestSmartRetry(unittest.TestCase):
+    def setUp(self):
+        self.retry = SmartRetry(max_retries=3, backoff_factor=0.01)
+
+    def test_success_first_try(self):
+        mock_func = Mock()
+        mock_result = Mock()
+        mock_result.returncode = 0
+        mock_func.return_value = mock_result
+
+        result = self.retry.run(mock_func)
+
+        self.assertEqual(result, mock_result)
+        self.assertEqual(mock_func.call_count, 1)
+
+    @patch("time.sleep")
+    def test_retry_on_transient_error(self, mock_sleep):
+        mock_func = Mock()
+        
+        # Fail twice with network error, then succeed
+        fail_result = Mock()
+        fail_result.returncode = 1
+        fail_result.stderr = "Connection timed out"
+        
+        success_result = Mock()
+        success_result.returncode = 0
+        
+        mock_func.side_effect = [fail_result, fail_result, success_result]
+
+        result = self.retry.run(mock_func)
+
+        self.assertEqual(result, success_result)
+        self.assertEqual(mock_func.call_count, 3)
+        self.assertEqual(mock_sleep.call_count, 2)
+
+    @patch("time.sleep")
+    def test_fail_fast_on_permanent_error(self, mock_sleep):
+        mock_func = Mock()
+        
+        fail_result = Mock()
+        fail_result.returncode = 1
+        fail_result.stderr = "Permission denied"
+        
+        mock_func.return_value = fail_result
+
+        result = self.retry.run(mock_func)
+
+        self.assertEqual(result, fail_result)
+        self.assertEqual(mock_func.call_count, 1)
+        mock_sleep.assert_not_called()
+
+    @patch("time.sleep")
+    def test_max_retries_exceeded(self, mock_sleep):
+        mock_func = Mock()
+        
+        fail_result = Mock()
+        fail_result.returncode = 1
+        fail_result.stderr = "Connection timed out"
+        
+        mock_func.return_value = fail_result
+
+        result = self.retry.run(mock_func)
+
+        self.assertEqual(result, fail_result)
+        self.assertEqual(mock_func.call_count, 4) # Initial + 3 retries
+        self.assertEqual(mock_sleep.call_count, 3)
+
+    @patch("time.sleep")
+    def test_exception_retry(self, mock_sleep):
+        mock_func = Mock()
+        mock_func.side_effect = [Exception("Network error"), Mock(returncode=0)]
+
+        # We need to mock ErrorParser to classify "Network error" as transient if it's not standard
+        # But SmartRetry defaults to retry on unknown errors, so generic Exception should trigger retry
+        
+        result = self.retry.run(mock_func)
+        
+        self.assertEqual(result.returncode, 0)
+        self.assertEqual(mock_func.call_count, 2)
+
+    @patch("time.sleep")
+    def test_callback_notification(self, mock_sleep):
+        callback = Mock()
+        retry = SmartRetry(max_retries=1, backoff_factor=0.01, status_callback=callback)
+        
+        mock_func = Mock()
+        fail_result = Mock()
+        fail_result.returncode = 1
+        fail_result.stderr = "Connection timed out"
+        
+        mock_func.side_effect = [fail_result, Mock(returncode=0)]
+
+        retry.run(mock_func)
+
+        callback.assert_called_once()
+        self.assertIn("Retrying", callback.call_args[0][0])
+
+if __name__ == "__main__":
+    unittest.main()

From 769bedcc6e8dced1c86f9c4ebbe7f5eb57807d7f Mon Sep 17 00:00:00 2001
From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com>
Date: Tue, 20 Jan 2026 06:40:48 +0000
Subject: [PATCH 02/11] [autofix.ci] apply automated fixes

---
 tests/test_retry.py | 31 +++++++++++++++++--------------
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/tests/test_retry.py b/tests/test_retry.py
index 8714cc94d..bb2fc053a 100644
--- a/tests/test_retry.py
+++ b/tests/test_retry.py
@@ -1,8 +1,10 @@
+import time
 import unittest
 from unittest.mock import Mock, patch
-import time
-from cortex.utils.retry import SmartRetry
+
 from cortex.error_parser import ErrorCategory
+from cortex.utils.retry import SmartRetry
+
 
 class TestSmartRetry(unittest.TestCase):
     def setUp(self):
@@ -22,15 +24,15 @@ def test_success_first_try(self):
     @patch("time.sleep")
     def test_retry_on_transient_error(self, mock_sleep):
         mock_func = Mock()
-        
+
         # Fail twice with network error, then succeed
         fail_result = Mock()
         fail_result.returncode = 1
         fail_result.stderr = "Connection timed out"
-        
+
         success_result = Mock()
         success_result.returncode = 0
-        
+
         mock_func.side_effect = [fail_result, fail_result, success_result]
 
         result = self.retry.run(mock_func)
@@ -42,11 +44,11 @@ def test_retry_on_transient_error(self, mock_sleep):
     @patch("time.sleep")
     def test_fail_fast_on_permanent_error(self, mock_sleep):
         mock_func = Mock()
-        
+
         fail_result = Mock()
         fail_result.returncode = 1
         fail_result.stderr = "Permission denied"
-        
+
         mock_func.return_value = fail_result
 
         result = self.retry.run(mock_func)
@@ -58,17 +60,17 @@ def test_fail_fast_on_permanent_error(self, mock_sleep):
     @patch("time.sleep")
     def test_max_retries_exceeded(self, mock_sleep):
         mock_func = Mock()
-        
+
         fail_result = Mock()
         fail_result.returncode = 1
         fail_result.stderr = "Connection timed out"
-        
+
         mock_func.return_value = fail_result
 
         result = self.retry.run(mock_func)
 
         self.assertEqual(result, fail_result)
-        self.assertEqual(mock_func.call_count, 4) # Initial + 3 retries
+        self.assertEqual(mock_func.call_count, 4)  # Initial + 3 retries
         self.assertEqual(mock_sleep.call_count, 3)
 
     @patch("time.sleep")
@@ -78,9 +80,9 @@ def test_exception_retry(self, mock_sleep):
 
         # We need to mock ErrorParser to classify "Network error" as transient if it's not standard
         # But SmartRetry defaults to retry on unknown errors, so generic Exception should trigger retry
-        
+
         result = self.retry.run(mock_func)
-        
+
         self.assertEqual(result.returncode, 0)
         self.assertEqual(mock_func.call_count, 2)
 
@@ -88,12 +90,12 @@ def test_exception_retry(self, mock_sleep):
     def test_callback_notification(self, mock_sleep):
         callback = Mock()
         retry = SmartRetry(max_retries=1, backoff_factor=0.01, status_callback=callback)
-        
+
         mock_func = Mock()
         fail_result = Mock()
         fail_result.returncode = 1
         fail_result.stderr = "Connection timed out"
-        
+
         mock_func.side_effect = [fail_result, Mock(returncode=0)]
 
         retry.run(mock_func)
@@ -101,5 +103,6 @@ def test_callback_notification(self, mock_sleep):
         callback.assert_called_once()
         self.assertIn("Retrying", callback.call_args[0][0])
 
+
 if __name__ == "__main__":
     unittest.main()

From 840f3efd0ea278723699ed992027c4d99100f42f Mon Sep 17 00:00:00 2001
From: krishnaShuk <shuklakrishna.kris@gmail.com>
Date: Thu, 22 Jan 2026 12:16:14 +0530
Subject: [PATCH 03/11] added documentation

---
 docs/COMMANDS.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md
index a194d67fb..cf7400641 100644
--- a/docs/COMMANDS.md
+++ b/docs/COMMANDS.md
@@ -73,6 +73,7 @@ cortex install "python3 with pip and virtualenv" --execute
 - Without `--execute`, Cortex only shows the commands it would run
 - The `--dry-run` flag is recommended for first-time use to verify commands
 - Installation is recorded in history for potential rollback
+- **Smart Retry Logic**: Cortex automatically detects transient failures (like network timeouts) and retries commands with exponential backoff (up to 5 attempts). Permanent errors (like permission denied) fail immediately.
 
 ---
 

From ae299f1c9fed77149cd16089564fa33c23014a47 Mon Sep 17 00:00:00 2001
From: krishnaShuk <shuklakrishna.kris@gmail.com>
Date: Thu, 22 Jan 2026 12:42:32 +0530
Subject: [PATCH 04/11] minor fixes

---
 cortex/coordinator.py |  7 +++----
 cortex/utils/retry.py | 14 ++++++++++++--
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/cortex/coordinator.py b/cortex/coordinator.py
index d14e4cf3c..ead65fd24 100644
--- a/cortex/coordinator.py
+++ b/cortex/coordinator.py
@@ -8,6 +8,7 @@
 from datetime import datetime
 from enum import Enum
 from typing import Any
+from cortex.utils.retry import SmartRetry
 
 from cortex.validators import DANGEROUS_PATTERNS
 
@@ -178,8 +179,6 @@ def _execute_command(self, step: InstallationStep) -> bool:
             self._log(f"Command blocked: {step.command} - {error}")
             return False
 
-        from cortex.utils.retry import SmartRetry
-
         def run_cmd():
             # Use shell=True carefully - commands are validated first
             # For complex shell commands (pipes, redirects), shell=True is needed
@@ -190,8 +189,8 @@ def run_cmd():
 
         def status_callback(msg: str):
             self._log(msg)
-            # Optionally update UI here if needed, but for now logging is sufficient
-            # The CLI progress callback handles the main step status updates
+            # Also print to stdout so the user sees the retry happening
+            print(msg)
 
         retry_handler = SmartRetry(
             max_retries=self.max_retries,
diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py
index 15d55c1a3..958298ca6 100644
--- a/cortex/utils/retry.py
+++ b/cortex/utils/retry.py
@@ -20,8 +20,13 @@ def __init__(
         backoff_factor: float = 1.0,
         status_callback: Callable[[str], None] | None = None,
     ):
+        if not isinstance(max_retries, int) or max_retries < 0:
+            raise ValueError("max_retries must be a non-negative integer")
+        if not isinstance(backoff_factor, (int, float)) or backoff_factor < 0:
+            raise ValueError("backoff_factor must be a non-negative number")
+
         self.max_retries = max_retries
-        self.backoff_factor = backoff_factor
+        self.backoff_factor = float(backoff_factor)
         self.status_callback = status_callback
         self.error_parser = ErrorParser()
 
@@ -63,7 +68,7 @@ def run(self, func: Callable[[], Any]) -> Any:
             except Exception as e:
                 last_exception = e
                 if not self._should_retry(str(e)):
-                    raise e
+                    raise
 
             # If we are here, we need to retry (unless max retries reached)
             if attempt == self.max_retries:
@@ -94,6 +99,10 @@ def _should_retry(self, error_message: str) -> bool:
         analysis = self.error_parser.parse_error(error_message)
         category = analysis.primary_category
 
+        # If the error is explicitly marked as not fixable, fail fast
+        if not analysis.is_fixable:
+            return False
+
         # Retry on network errors, lock errors, or unknown errors (conservative)
         if category in [
             ErrorCategory.NETWORK_ERROR,
@@ -109,6 +118,7 @@ def _should_retry(self, error_message: str) -> bool:
             ErrorCategory.CONFIGURATION_ERROR,
             ErrorCategory.DEPENDENCY_MISSING,
             ErrorCategory.CONFLICT,
+            ErrorCategory.DISK_SPACE,
         ]:
             return False
 

From 90460571d2f49329ffcc92ff1bc85de51008f321 Mon Sep 17 00:00:00 2001
From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com>
Date: Thu, 22 Jan 2026 07:13:13 +0000
Subject: [PATCH 05/11] [autofix.ci] apply automated fixes

---
 cortex/coordinator.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cortex/coordinator.py b/cortex/coordinator.py
index ead65fd24..6d077c143 100644
--- a/cortex/coordinator.py
+++ b/cortex/coordinator.py
@@ -8,8 +8,8 @@
 from datetime import datetime
 from enum import Enum
 from typing import Any
-from cortex.utils.retry import SmartRetry
 
+from cortex.utils.retry import SmartRetry
 from cortex.validators import DANGEROUS_PATTERNS
 
 logger = logging.getLogger(__name__)

From af526dc2f4a2f228d85bbec460208d81e89c0ba3 Mon Sep 17 00:00:00 2001
From: krishnaShuk <shuklakrishna.kris@gmail.com>
Date: Thu, 22 Jan 2026 13:04:17 +0530
Subject: [PATCH 06/11] minor fix

---
 cortex/coordinator.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cortex/coordinator.py b/cortex/coordinator.py
index 6d077c143..28ce026f3 100644
--- a/cortex/coordinator.py
+++ b/cortex/coordinator.py
@@ -179,7 +179,7 @@ def _execute_command(self, step: InstallationStep) -> bool:
             self._log(f"Command blocked: {step.command} - {error}")
             return False
 
-        def run_cmd():
+        def run_cmd() -> subprocess.CompletedProcess[str]:
             # Use shell=True carefully - commands are validated first
             # For complex shell commands (pipes, redirects), shell=True is needed
             # Simple commands could use shlex.split() with shell=False
@@ -187,7 +187,7 @@ def run_cmd():
                 step.command, shell=True, capture_output=True, text=True, timeout=self.timeout
             )
 
-        def status_callback(msg: str):
+        def status_callback(msg: str) -> None:
             self._log(msg)
             # Also print to stdout so the user sees the retry happening
             print(msg)

From c0107a5e5622c055e2694fac82205e5489d2db84 Mon Sep 17 00:00:00 2001
From: krishnaShuk <shuklakrishna.kris@gmail.com>
Date: Fri, 23 Jan 2026 23:10:53 +0530
Subject: [PATCH 07/11] added different strategies and documentation

---
 cortex/cli.py               |   4 +-
 cortex/coordinator.py       |  16 +++-
 cortex/utils/retry.py       | 181 +++++++++++++++++++++++++++---------
 docs/RETRY_CONFIGURATION.md | 173 ++++++++++++++++++++++++++++++++++
 tests/test_retry.py         | 144 ++++++++++++++++++++++++----
 5 files changed, 449 insertions(+), 69 deletions(-)
 create mode 100644 docs/RETRY_CONFIGURATION.md

diff --git a/cortex/cli.py b/cortex/cli.py
index 84deb0d74..693410efa 100644
--- a/cortex/cli.py
+++ b/cortex/cli.py
@@ -20,6 +20,7 @@
 from cortex.ask import AskHandler
 from cortex.branding import VERSION, console, cx_header, cx_print, show_banner
 from cortex.coordinator import InstallationCoordinator, InstallationStep, StepStatus
+from cortex.utils.retry import DEFAULT_MAX_RETRIES
 from cortex.demo import run_demo
 from cortex.dependency_importer import DependencyImporter, PackageEcosystem, ParseResult
 from cortex.env_manager import EnvironmentManager, get_env_manager
@@ -1445,6 +1446,7 @@ def install(
         dry_run: bool = False,
         parallel: bool = False,
         json_output: bool = False,
+        max_retries: int = DEFAULT_MAX_RETRIES,
     ) -> int:
         """Install software using the LLM-powered package manager."""
         # Initialize installation history
@@ -1670,7 +1672,7 @@ def parallel_log_callback(message: str, level: str = "info"):
                     timeout=300,
                     stop_on_error=True,
                     progress_callback=progress_callback,
-                    max_retries=5,
+                    max_retries=max_retries,
                 )
 
                 result = coordinator.execute()
diff --git a/cortex/coordinator.py b/cortex/coordinator.py
index 28ce026f3..08122c1ba 100644
--- a/cortex/coordinator.py
+++ b/cortex/coordinator.py
@@ -9,7 +9,12 @@
 from enum import Enum
 from typing import Any
 
-from cortex.utils.retry import SmartRetry
+from cortex.utils.retry import (
+    DEFAULT_MAX_RETRIES,
+    ErrorCategory,
+    SmartRetry,
+    load_strategies_from_env,
+)
 from cortex.validators import DANGEROUS_PATTERNS
 
 logger = logging.getLogger(__name__)
@@ -61,7 +66,7 @@ def __init__(
         enable_rollback: bool = False,
         log_file: str | None = None,
         progress_callback: Callable[[int, int, InstallationStep], None] | None = None,
-        max_retries: int = 5,
+        max_retries: int = DEFAULT_MAX_RETRIES,
     ):
         """Initialize an installation run with optional logging and rollback."""
         self.timeout = timeout
@@ -192,8 +197,13 @@ def status_callback(msg: str) -> None:
             # Also print to stdout so the user sees the retry happening
             print(msg)
 
+        # Load strategies and apply CLI override for network errors
+        strategies = load_strategies_from_env()
+        if ErrorCategory.NETWORK_ERROR in strategies:
+            strategies[ErrorCategory.NETWORK_ERROR].max_retries = self.max_retries
+
         retry_handler = SmartRetry(
-            max_retries=self.max_retries,
+            strategies=strategies,
             status_callback=status_callback,
         )
 
diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py
index 958298ca6..3fa6f12c7 100644
--- a/cortex/utils/retry.py
+++ b/cortex/utils/retry.py
@@ -1,32 +1,119 @@
 import logging
+import os
 import time
 from collections.abc import Callable
+from dataclasses import dataclass
 from typing import Any
 
 from cortex.error_parser import ErrorCategory, ErrorParser
 
 logger = logging.getLogger(__name__)
 
+# Default maximum number of retries for the global retry setting
+DEFAULT_MAX_RETRIES = 5
+
+
+@dataclass
+class RetryStrategy:
+    """Configuration for how to retry a specific error type."""
+
+    max_retries: int
+    backoff_factor: float
+    description: str
+
+
+# Default strategies for each retryable error category
+DEFAULT_STRATEGIES: dict[ErrorCategory, RetryStrategy] = {
+    ErrorCategory.NETWORK_ERROR: RetryStrategy(
+        max_retries=DEFAULT_MAX_RETRIES,
+        backoff_factor=1.0,
+        description="Network issues - retry aggressively with short backoff",
+    ),
+    ErrorCategory.LOCK_ERROR: RetryStrategy(
+        max_retries=3,
+        backoff_factor=5.0,
+        description="Lock contention - wait longer between retries",
+    ),
+    ErrorCategory.UNKNOWN: RetryStrategy(
+        max_retries=2,
+        backoff_factor=2.0,
+        description="Unknown errors - conservative retry",
+    ),
+}
+
+# Permanent error categories that should never be retried
+PERMANENT_ERRORS: set[ErrorCategory] = {
+    ErrorCategory.PERMISSION_DENIED,
+    ErrorCategory.PACKAGE_NOT_FOUND,
+    ErrorCategory.CONFIGURATION_ERROR,
+    ErrorCategory.DEPENDENCY_MISSING,
+    ErrorCategory.CONFLICT,
+    ErrorCategory.DISK_SPACE,
+}
+
+
+def load_strategies_from_env() -> dict[ErrorCategory, RetryStrategy]:
+    """
+    Load retry strategies from environment variables, falling back to defaults.
+
+    Environment variables:
+        CORTEX_RETRY_NETWORK_MAX: Max retries for network errors (default: 5)
+        CORTEX_RETRY_NETWORK_BACKOFF: Backoff factor for network errors (default: 1.0)
+        CORTEX_RETRY_LOCK_MAX: Max retries for lock errors (default: 3)
+        CORTEX_RETRY_LOCK_BACKOFF: Backoff factor for lock errors (default: 5.0)
+        CORTEX_RETRY_UNKNOWN_MAX: Max retries for unknown errors (default: 2)
+        CORTEX_RETRY_UNKNOWN_BACKOFF: Backoff factor for unknown errors (default: 2.0)
+    """
+    strategies = dict(DEFAULT_STRATEGIES)
+
+    # Network error overrides
+    if os.getenv("CORTEX_RETRY_NETWORK_MAX") or os.getenv("CORTEX_RETRY_NETWORK_BACKOFF"):
+        strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy(
+            max_retries=int(os.getenv("CORTEX_RETRY_NETWORK_MAX", "5")),
+            backoff_factor=float(os.getenv("CORTEX_RETRY_NETWORK_BACKOFF", "1.0")),
+            description="Network issues (user-configured)",
+        )
+
+    # Lock error overrides
+    if os.getenv("CORTEX_RETRY_LOCK_MAX") or os.getenv("CORTEX_RETRY_LOCK_BACKOFF"):
+        strategies[ErrorCategory.LOCK_ERROR] = RetryStrategy(
+            max_retries=int(os.getenv("CORTEX_RETRY_LOCK_MAX", "3")),
+            backoff_factor=float(os.getenv("CORTEX_RETRY_LOCK_BACKOFF", "5.0")),
+            description="Lock contention (user-configured)",
+        )
+
+    # Unknown error overrides
+    if os.getenv("CORTEX_RETRY_UNKNOWN_MAX") or os.getenv("CORTEX_RETRY_UNKNOWN_BACKOFF"):
+        strategies[ErrorCategory.UNKNOWN] = RetryStrategy(
+            max_retries=int(os.getenv("CORTEX_RETRY_UNKNOWN_MAX", "2")),
+            backoff_factor=float(os.getenv("CORTEX_RETRY_UNKNOWN_BACKOFF", "2.0")),
+            description="Unknown errors (user-configured)",
+        )
+
+    return strategies
+
 
 class SmartRetry:
     """
     Implements smart retry logic with exponential backoff.
     Uses ErrorParser to distinguish between transient and permanent errors.
+    Supports different retry strategies per error category.
     """
 
     def __init__(
         self,
-        max_retries: int = 5,
-        backoff_factor: float = 1.0,
+        strategies: dict[ErrorCategory, RetryStrategy] | None = None,
         status_callback: Callable[[str], None] | None = None,
     ):
-        if not isinstance(max_retries, int) or max_retries < 0:
-            raise ValueError("max_retries must be a non-negative integer")
-        if not isinstance(backoff_factor, (int, float)) or backoff_factor < 0:
-            raise ValueError("backoff_factor must be a non-negative number")
+        """
+        Initialize SmartRetry with optional custom strategies.
 
-        self.max_retries = max_retries
-        self.backoff_factor = float(backoff_factor)
+        Args:
+            strategies: Custom retry strategies per error category.
+                        If None, loads from environment or uses defaults.
+            status_callback: Optional callback for status messages.
+        """
+        self.strategies = strategies if strategies is not None else load_strategies_from_env()
         self.status_callback = status_callback
         self.error_parser = ErrorParser()
 
@@ -45,8 +132,9 @@ def run(self, func: Callable[[], Any]) -> Any:
         attempt = 0
         last_exception = None
         last_result = None
+        current_strategy: RetryStrategy | None = None
 
-        while attempt <= self.max_retries:
+        while True:
             try:
                 result = func()
                 last_result = result
@@ -62,22 +150,34 @@ def run(self, func: Callable[[], Any]) -> Any:
                 elif hasattr(result, "stdout") and result.stdout:
                     error_msg = result.stdout
 
-                if not self._should_retry(error_msg):
+                category = self._get_error_category(error_msg)
+                current_strategy = self._get_strategy(category)
+
+                if current_strategy is None:
+                    # Permanent error - fail fast
                     return result
 
             except Exception as e:
                 last_exception = e
-                if not self._should_retry(str(e)):
+                category = self._get_error_category(str(e))
+                current_strategy = self._get_strategy(category)
+
+                if current_strategy is None:
+                    # Permanent error - fail fast
                     raise
 
-            # If we are here, we need to retry (unless max retries reached)
-            if attempt == self.max_retries:
+            # Check if we've exhausted retries for this strategy
+            if current_strategy is None or attempt >= current_strategy.max_retries:
                 break
 
             attempt += 1
-            sleep_time = self.backoff_factor * (2 ** (attempt - 1))
+            sleep_time = current_strategy.backoff_factor * (2 ** (attempt - 1))
 
-            msg = f"⚠️ Transient error detected. Retrying in {sleep_time}s... (Attempt {attempt}/{self.max_retries})"
+            category_name = category.name if category else "UNKNOWN"
+            msg = (
+                f"⚠️ {category_name} detected. "
+                f"Retrying in {sleep_time}s... (Attempt {attempt}/{current_strategy.max_retries})"
+            )
             logger.warning(msg)
             if self.status_callback:
                 self.status_callback(msg)
@@ -88,39 +188,28 @@ def run(self, func: Callable[[], Any]) -> Any:
             raise last_exception
         return last_result
 
-    def _should_retry(self, error_message: str) -> bool:
-        """
-        Determine if we should retry based on the error message.
-        """
+    def _get_error_category(self, error_message: str) -> ErrorCategory | None:
+        """Classify the error message into a category."""
         if not error_message:
-            # If no error message, assume it's a generic failure that might be transient
-            return True
+            return ErrorCategory.UNKNOWN
 
         analysis = self.error_parser.parse_error(error_message)
-        category = analysis.primary_category
 
-        # If the error is explicitly marked as not fixable, fail fast
+        # If the error is explicitly marked as not fixable, treat as permanent
         if not analysis.is_fixable:
-            return False
-
-        # Retry on network errors, lock errors, or unknown errors (conservative)
-        if category in [
-            ErrorCategory.NETWORK_ERROR,
-            ErrorCategory.LOCK_ERROR,
-            ErrorCategory.UNKNOWN,
-        ]:
-            return True
-
-        # Fail fast on permanent errors
-        if category in [
-            ErrorCategory.PERMISSION_DENIED,
-            ErrorCategory.PACKAGE_NOT_FOUND,
-            ErrorCategory.CONFIGURATION_ERROR,
-            ErrorCategory.DEPENDENCY_MISSING,
-            ErrorCategory.CONFLICT,
-            ErrorCategory.DISK_SPACE,
-        ]:
-            return False
-
-        # Default to retry for safety if not explicitly categorized as permanent
-        return True
+            return None
+
+        return analysis.primary_category
+
+    def _get_strategy(self, category: ErrorCategory | None) -> RetryStrategy | None:
+        """
+        Get the retry strategy for a given error category.
+        Returns None for permanent errors (should not retry).
+        """
+        if category is None:
+            return None
+
+        if category in PERMANENT_ERRORS:
+            return None
+
+        return self.strategies.get(category)
diff --git a/docs/RETRY_CONFIGURATION.md b/docs/RETRY_CONFIGURATION.md
new file mode 100644
index 000000000..0b8bad7b3
--- /dev/null
+++ b/docs/RETRY_CONFIGURATION.md
@@ -0,0 +1,173 @@
+# Retry Configuration Guide
+
+Cortex CLI includes a **Smart Retry** mechanism that automatically recovers from transient failures during package installations. This guide explains how retry logic works and how to configure it.
+
+## How It Works
+
+When an installation command fails, Cortex analyzes the error to determine if it's:
+
+1. **Transient** (temporary, likely to resolve): Network timeouts, lock contention, etc.
+2. **Permanent** (unlikely to resolve): Permission denied, package not found, disk full, etc.
+
+For transient errors, Cortex retries the command with **exponential backoff**—waiting progressively longer between attempts (1s, 2s, 4s, etc.) to allow the issue to resolve.
+
+## Default Retry Strategies
+
+Each error type has its own retry strategy:
+
+| Error Type | Max Retries | Base Backoff | Rationale |
+|------------|-------------|--------------|-----------|
+| **Network Error** | 5 | 1.0s | Short blips resolve quickly; retry aggressively |
+| **Lock Error** | 3 | 5.0s | Locks take time to release; wait longer |
+| **Unknown Error** | 2 | 2.0s | Conservative approach for unclassified errors |
+
+**Permanent errors** (Permission Denied, Package Not Found, Disk Space, Dependency Missing, Configuration Error, Conflict) **never retry**—they fail immediately.
+
+## Backoff Calculation
+
+The wait time before each retry uses exponential backoff:
+
+```
+wait_time = backoff_factor × 2^(attempt - 1)
+```
+
+Example for Network Error (backoff_factor = 1.0):
+- Attempt 1: 1.0s wait
+- Attempt 2: 2.0s wait
+- Attempt 3: 4.0s wait
+- Attempt 4: 8.0s wait
+- Attempt 5: 16.0s wait
+
+## Configuration via Environment Variables
+
+Override default strategies using environment variables:
+
+### Network Error Configuration
+```bash
+export CORTEX_RETRY_NETWORK_MAX=10        # Max retry attempts (default: 5)
+export CORTEX_RETRY_NETWORK_BACKOFF=0.5   # Base backoff in seconds (default: 1.0)
+```
+
+### Lock Error Configuration
+```bash
+export CORTEX_RETRY_LOCK_MAX=5            # Max retry attempts (default: 3)
+export CORTEX_RETRY_LOCK_BACKOFF=10.0     # Base backoff in seconds (default: 5.0)
+```
+
+### Unknown Error Configuration
+```bash
+export CORTEX_RETRY_UNKNOWN_MAX=3         # Max retry attempts (default: 2)
+export CORTEX_RETRY_UNKNOWN_BACKOFF=1.0   # Base backoff in seconds (default: 2.0)
+```
+
+## Examples
+
+### Aggressive Retry for Unstable Networks
+
+If you're on an unstable connection and want more retries with shorter waits:
+
+```bash
+export CORTEX_RETRY_NETWORK_MAX=10
+export CORTEX_RETRY_NETWORK_BACKOFF=0.5
+cortex install docker --execute
+```
+
+This gives 10 attempts with waits: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s.
+
+### Patient Retry for Shared Systems
+
+If you're on a shared server where `apt` locks are common:
+
+```bash
+export CORTEX_RETRY_LOCK_MAX=5
+export CORTEX_RETRY_LOCK_BACKOFF=30.0
+cortex install nginx --execute
+```
+
+This gives 5 attempts with waits: 30s, 60s, 120s, 240s, 480s (up to 8 minutes total wait).
+
+### Disable All Retries
+
+For CI/CD pipelines where you want fast failure:
+
+```bash
+export CORTEX_RETRY_NETWORK_MAX=0
+export CORTEX_RETRY_LOCK_MAX=0
+export CORTEX_RETRY_UNKNOWN_MAX=0
+cortex install package --execute
+```
+
+## User Feedback
+
+During retries, Cortex displays messages like:
+
+```
+⚠️ NETWORK_ERROR detected. Retrying in 2.0s... (Attempt 2/5)
+```
+
+This shows:
+- The error type that was detected
+- How long until the next attempt
+- The current attempt number and maximum attempts
+
+## Error Categories Reference
+
+### Transient (Retried)
+
+| Category | Example Errors |
+|----------|----------------|
+| `NETWORK_ERROR` | "Connection timed out", "Temporary failure resolving" |
+| `LOCK_ERROR` | "Could not get lock", "dpkg was interrupted" |
+| `UNKNOWN` | Unclassified errors that might be transient |
+
+### Permanent (Never Retried)
+
+| Category | Example Errors |
+|----------|----------------|
+| `PERMISSION_DENIED` | "Permission denied", "Operation not permitted" |
+| `PACKAGE_NOT_FOUND` | "Unable to locate package", "No such package" |
+| `DISK_SPACE` | "No space left on device" |
+| `DEPENDENCY_MISSING` | "Depends: X but it is not installable" |
+| `CONFIGURATION_ERROR` | "Configuration file syntax error" |
+| `CONFLICT` | "Conflicts with package X" |
+
+## Programmatic Usage
+
+For advanced use cases, you can customize strategies in code:
+
+```python
+from cortex.utils.retry import SmartRetry, RetryStrategy, DEFAULT_STRATEGIES
+from cortex.error_parser import ErrorCategory
+
+# Custom strategies
+custom_strategies = dict(DEFAULT_STRATEGIES)
+custom_strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy(
+    max_retries=10,
+    backoff_factor=0.5,
+    description="Custom network retry"
+)
+
+retry = SmartRetry(strategies=custom_strategies)
+result = retry.run(my_function)
+```
+
+## Troubleshooting
+
+### Retries Not Happening
+
+1. Check if the error is classified as permanent (see table above)
+2. Verify environment variables are set correctly
+3. Run with `--verbose` to see detailed error classification
+
+### Retries Taking Too Long
+
+Reduce `backoff_factor` or `max_retries` via environment variables.
+
+### Need More Aggressive Retries
+
+Increase `max_retries` and decrease `backoff_factor`.
+
+---
+
+**Version**: 0.9.0  
+**Last Updated**: January 2026
diff --git a/tests/test_retry.py b/tests/test_retry.py
index bb2fc053a..6e7660a25 100644
--- a/tests/test_retry.py
+++ b/tests/test_retry.py
@@ -1,14 +1,89 @@
-import time
 import unittest
 from unittest.mock import Mock, patch
 
 from cortex.error_parser import ErrorCategory
-from cortex.utils.retry import SmartRetry
+from cortex.utils.retry import (
+    DEFAULT_STRATEGIES,
+    PERMANENT_ERRORS,
+    RetryStrategy,
+    SmartRetry,
+    load_strategies_from_env,
+)
+
+
+class TestRetryStrategy(unittest.TestCase):
+    """Tests for the RetryStrategy dataclass."""
+
+    def test_strategy_creation(self):
+        strategy = RetryStrategy(max_retries=5, backoff_factor=1.0, description="Test")
+        self.assertEqual(strategy.max_retries, 5)
+        self.assertEqual(strategy.backoff_factor, 1.0)
+        self.assertEqual(strategy.description, "Test")
+
+
+class TestDefaultStrategies(unittest.TestCase):
+    """Tests for default strategy configurations."""
+
+    def test_network_error_strategy(self):
+        strategy = DEFAULT_STRATEGIES[ErrorCategory.NETWORK_ERROR]
+        self.assertEqual(strategy.max_retries, 5)
+        self.assertEqual(strategy.backoff_factor, 1.0)
+
+    def test_lock_error_strategy(self):
+        strategy = DEFAULT_STRATEGIES[ErrorCategory.LOCK_ERROR]
+        self.assertEqual(strategy.max_retries, 3)
+        self.assertEqual(strategy.backoff_factor, 5.0)
+
+    def test_unknown_error_strategy(self):
+        strategy = DEFAULT_STRATEGIES[ErrorCategory.UNKNOWN]
+        self.assertEqual(strategy.max_retries, 2)
+        self.assertEqual(strategy.backoff_factor, 2.0)
+
+    def test_permanent_errors_not_in_strategies(self):
+        for error in PERMANENT_ERRORS:
+            self.assertNotIn(error, DEFAULT_STRATEGIES)
+
+
+class TestLoadStrategiesFromEnv(unittest.TestCase):
+    """Tests for environment variable configuration."""
+
+    def test_default_strategies_when_no_env_vars(self):
+        with patch.dict("os.environ", {}, clear=True):
+            strategies = load_strategies_from_env()
+            self.assertEqual(
+                strategies[ErrorCategory.NETWORK_ERROR].max_retries, 5
+            )
+
+    def test_network_override_from_env(self):
+        with patch.dict(
+            "os.environ",
+            {"CORTEX_RETRY_NETWORK_MAX": "10", "CORTEX_RETRY_NETWORK_BACKOFF": "0.5"},
+        ):
+            strategies = load_strategies_from_env()
+            self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 10)
+            self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].backoff_factor, 0.5)
+
+    def test_lock_override_from_env(self):
+        with patch.dict(
+            "os.environ",
+            {"CORTEX_RETRY_LOCK_MAX": "6", "CORTEX_RETRY_LOCK_BACKOFF": "10.0"},
+        ):
+            strategies = load_strategies_from_env()
+            self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].max_retries, 6)
+            self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].backoff_factor, 10.0)
 
 
 class TestSmartRetry(unittest.TestCase):
+    """Tests for SmartRetry class."""
+
     def setUp(self):
-        self.retry = SmartRetry(max_retries=3, backoff_factor=0.01)
+        # Use custom strategies with short backoff for fast tests
+        self.fast_strategies = {
+            ErrorCategory.NETWORK_ERROR: RetryStrategy(3, 0.01, "Test network"),
+            ErrorCategory.LOCK_ERROR: RetryStrategy(2, 0.01, "Test lock"),
+            ErrorCategory.UNKNOWN: RetryStrategy(2, 0.01, "Test unknown"),
+        }
+        self.retry = SmartRetry(strategies=self.fast_strategies)
 
     def test_success_first_try(self):
         mock_func = Mock()
@@ -21,11 +96,10 @@ def test_success_first_try(self):
         self.assertEqual(result, mock_result)
         self.assertEqual(mock_func.call_count, 1)
 
-    @patch("time.sleep")
-    def test_retry_on_transient_error(self, mock_sleep):
+    @patch("cortex.utils.retry.time.sleep")
+    def test_retry_on_network_error(self, mock_sleep):
         mock_func = Mock()
 
-        # Fail twice with network error, then succeed
         fail_result = Mock()
         fail_result.returncode = 1
         fail_result.stderr = "Connection timed out"
@@ -41,8 +115,8 @@ def test_retry_on_transient_error(self, mock_sleep):
         self.assertEqual(mock_func.call_count, 3)
         self.assertEqual(mock_sleep.call_count, 2)
 
-    @patch("time.sleep")
-    def test_fail_fast_on_permanent_error(self, mock_sleep):
+    @patch("cortex.utils.retry.time.sleep")
+    def test_fail_fast_on_permission_denied(self, mock_sleep):
         mock_func = Mock()
 
         fail_result = Mock()
@@ -57,7 +131,23 @@ def test_fail_fast_on_permanent_error(self, mock_sleep):
         self.assertEqual(mock_func.call_count, 1)
         mock_sleep.assert_not_called()
 
-    @patch("time.sleep")
+    @patch("cortex.utils.retry.time.sleep")
+    def test_fail_fast_on_disk_space(self, mock_sleep):
+        mock_func = Mock()
+
+        fail_result = Mock()
+        fail_result.returncode = 1
+        fail_result.stderr = "No space left on device"
+
+        mock_func.return_value = fail_result
+
+        result = self.retry.run(mock_func)
+
+        self.assertEqual(result, fail_result)
+        self.assertEqual(mock_func.call_count, 1)
+        mock_sleep.assert_not_called()
+
+    @patch("cortex.utils.retry.time.sleep")
     def test_max_retries_exceeded(self, mock_sleep):
         mock_func = Mock()
 
@@ -70,26 +160,31 @@ def test_max_retries_exceeded(self, mock_sleep):
         result = self.retry.run(mock_func)
 
         self.assertEqual(result, fail_result)
-        self.assertEqual(mock_func.call_count, 4)  # Initial + 3 retries
+        # 1 initial + 3 retries for network error strategy
+        self.assertEqual(mock_func.call_count, 4)
         self.assertEqual(mock_sleep.call_count, 3)
 
-    @patch("time.sleep")
-    def test_exception_retry(self, mock_sleep):
+    @patch("cortex.utils.retry.time.sleep")
+    def test_different_strategy_for_lock_error(self, mock_sleep):
         mock_func = Mock()
-        mock_func.side_effect = [Exception("Network error"), Mock(returncode=0)]
 
-        # We need to mock ErrorParser to classify "Network error" as transient if it's not standard
-        # But SmartRetry defaults to retry on unknown errors, so generic Exception should trigger retry
+        fail_result = Mock()
+        fail_result.returncode = 1
+        fail_result.stderr = "Could not get lock /var/lib/apt/lists/lock"
+
+        mock_func.return_value = fail_result
 
         result = self.retry.run(mock_func)
 
-        self.assertEqual(result.returncode, 0)
-        self.assertEqual(mock_func.call_count, 2)
+        self.assertEqual(result, fail_result)
+        # Lock error strategy has max_retries=2, so 1 initial + 2 retries = 3
+        self.assertEqual(mock_func.call_count, 3)
+        self.assertEqual(mock_sleep.call_count, 2)
 
-    @patch("time.sleep")
+    @patch("cortex.utils.retry.time.sleep")
     def test_callback_notification(self, mock_sleep):
         callback = Mock()
-        retry = SmartRetry(max_retries=1, backoff_factor=0.01, status_callback=callback)
+        retry = SmartRetry(strategies=self.fast_strategies, status_callback=callback)
 
         mock_func = Mock()
         fail_result = Mock()
@@ -101,8 +196,19 @@ def test_callback_notification(self, mock_sleep):
         retry.run(mock_func)
 
         callback.assert_called_once()
+        self.assertIn("NETWORK_ERROR", callback.call_args[0][0])
         self.assertIn("Retrying", callback.call_args[0][0])
 
+    @patch("cortex.utils.retry.time.sleep")
+    def test_exception_retry(self, mock_sleep):
+        mock_func = Mock()
+        mock_func.side_effect = [Exception("Network error"), Mock(returncode=0)]
+
+        result = self.retry.run(mock_func)
+
+        self.assertEqual(result.returncode, 0)
+        self.assertEqual(mock_func.call_count, 2)
+
 
 if __name__ == "__main__":
     unittest.main()

From af3e8825ad7109234541c46d13fcea13d212e2fb Mon Sep 17 00:00:00 2001
From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com>
Date: Fri, 23 Jan 2026 17:42:22 +0000
Subject: [PATCH 08/11] [autofix.ci] apply automated fixes

---
 cortex/cli.py       | 2 +-
 tests/test_retry.py | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/cortex/cli.py b/cortex/cli.py
index 693410efa..6efa9504d 100644
--- a/cortex/cli.py
+++ b/cortex/cli.py
@@ -20,7 +20,6 @@
 from cortex.ask import AskHandler
 from cortex.branding import VERSION, console, cx_header, cx_print, show_banner
 from cortex.coordinator import InstallationCoordinator, InstallationStep, StepStatus
-from cortex.utils.retry import DEFAULT_MAX_RETRIES
 from cortex.demo import run_demo
 from cortex.dependency_importer import DependencyImporter, PackageEcosystem, ParseResult
 from cortex.env_manager import EnvironmentManager, get_env_manager
@@ -41,6 +40,7 @@
 )
 from cortex.update_checker import UpdateChannel, should_notify_update
 from cortex.updater import Updater, UpdateStatus
+from cortex.utils.retry import DEFAULT_MAX_RETRIES
 from cortex.validators import validate_api_key, validate_install_request
 from cortex.version_manager import get_version_string
 
diff --git a/tests/test_retry.py b/tests/test_retry.py
index 6e7660a25..c6d8a659e 100644
--- a/tests/test_retry.py
+++ b/tests/test_retry.py
@@ -50,9 +50,7 @@ class TestLoadStrategiesFromEnv(unittest.TestCase):
     def test_default_strategies_when_no_env_vars(self):
         with patch.dict("os.environ", {}, clear=True):
             strategies = load_strategies_from_env()
-            self.assertEqual(
-                strategies[ErrorCategory.NETWORK_ERROR].max_retries, 5
-            )
+            self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 5)
 
     def test_network_override_from_env(self):
         with patch.dict(

From f2bd1e8a2ac284704826132d9e65dd025ad1a17d Mon Sep 17 00:00:00 2001
From: krishnaShuk <shuklakrishna.kris@gmail.com>
Date: Fri, 23 Jan 2026 23:31:01 +0530
Subject: [PATCH 09/11] resolved suggestions

---
 cortex/coordinator.py |  5 +++--
 cortex/utils/retry.py | 13 ++++++++++---
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/cortex/coordinator.py b/cortex/coordinator.py
index 08122c1ba..2183ee815 100644
--- a/cortex/coordinator.py
+++ b/cortex/coordinator.py
@@ -194,8 +194,9 @@ def run_cmd() -> subprocess.CompletedProcess[str]:
 
         def status_callback(msg: str) -> None:
             self._log(msg)
-            # Also print to stdout so the user sees the retry happening
-            print(msg)
+            # Only print to stdout if no progress callback is configured to avoid duplicates
+            if self.progress_callback is None:
+                print(msg)
 
         # Load strategies and apply CLI override for network errors
         strategies = load_strategies_from_env()
diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py
index 3fa6f12c7..598f73313 100644
--- a/cortex/utils/retry.py
+++ b/cortex/utils/retry.py
@@ -114,6 +114,14 @@ def __init__(
             status_callback: Optional callback for status messages.
         """
         self.strategies = strategies if strategies is not None else load_strategies_from_env()
+        
+        # Validate strategies
+        for category, strategy in self.strategies.items():
+            if strategy.max_retries < 0:
+                raise ValueError(f"Strategy for {category.name}: max_retries must be non-negative")
+            if strategy.backoff_factor <= 0:
+                raise ValueError(f"Strategy for {category.name}: backoff_factor must be positive")
+
         self.status_callback = status_callback
         self.error_parser = ErrorParser()
 
@@ -147,8 +155,6 @@ def run(self, func: Callable[[], Any]) -> Any:
                 error_msg = ""
                 if hasattr(result, "stderr") and result.stderr:
                     error_msg = result.stderr
-                elif hasattr(result, "stdout") and result.stdout:
-                    error_msg = result.stdout
 
                 category = self._get_error_category(error_msg)
                 current_strategy = self._get_strategy(category)
@@ -176,7 +182,7 @@ def run(self, func: Callable[[], Any]) -> Any:
             category_name = category.name if category else "UNKNOWN"
             msg = (
                 f"⚠️ {category_name} detected. "
-                f"Retrying in {sleep_time}s... (Attempt {attempt}/{current_strategy.max_retries})"
+                f"Retrying in {sleep_time}s... (Retry {attempt}/{current_strategy.max_retries})"
             )
             logger.warning(msg)
             if self.status_callback:
@@ -191,6 +197,7 @@ def run(self, func: Callable[[], Any]) -> Any:
     def _get_error_category(self, error_message: str) -> ErrorCategory | None:
         """Classify the error message into a category."""
         if not error_message:
+            logger.warning("Retry: Empty error message detected. Assuming UNKNOWN (transient).")
             return ErrorCategory.UNKNOWN
 
         analysis = self.error_parser.parse_error(error_message)

From 4d0162997c14939de7468287618f6591122e8d08 Mon Sep 17 00:00:00 2001
From: "autofix-ci[bot]" <114827586+autofix-ci[bot]@users.noreply.github.com>
Date: Fri, 23 Jan 2026 18:01:59 +0000
Subject: [PATCH 10/11] [autofix.ci] apply automated fixes

---
 cortex/utils/retry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py
index 598f73313..ed5903615 100644
--- a/cortex/utils/retry.py
+++ b/cortex/utils/retry.py
@@ -114,7 +114,7 @@ def __init__(
             status_callback: Optional callback for status messages.
         """
         self.strategies = strategies if strategies is not None else load_strategies_from_env()
-        
+
         # Validate strategies
         for category, strategy in self.strategies.items():
             if strategy.max_retries < 0:

From 3f2aa2717b7141820a1e2df2f8ccec066078df05 Mon Sep 17 00:00:00 2001
From: krishnaShuk <shuklakrishna.kris@gmail.com>
Date: Sat, 24 Jan 2026 00:33:36 +0530
Subject: [PATCH 11/11] minor changes

---
 cortex/coordinator.py       | 9 ++++++++-
 docs/RETRY_CONFIGURATION.md | 4 ++--
 tests/test_retry.py         | 2 ++
 3 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/cortex/coordinator.py b/cortex/coordinator.py
index 2183ee815..1f982c963 100644
--- a/cortex/coordinator.py
+++ b/cortex/coordinator.py
@@ -12,6 +12,7 @@
 from cortex.utils.retry import (
     DEFAULT_MAX_RETRIES,
     ErrorCategory,
+    RetryStrategy,
     SmartRetry,
     load_strategies_from_env,
 )
@@ -201,7 +202,13 @@ def status_callback(msg: str) -> None:
         # Load strategies and apply CLI override for network errors
         strategies = load_strategies_from_env()
         if ErrorCategory.NETWORK_ERROR in strategies:
-            strategies[ErrorCategory.NETWORK_ERROR].max_retries = self.max_retries
+            # Create a new instance to avoid mutating the shared default object
+            original_strategy = strategies[ErrorCategory.NETWORK_ERROR]
+            strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy(
+                max_retries=self.max_retries,
+                backoff_factor=original_strategy.backoff_factor,
+                description=original_strategy.description,
+            )
 
         retry_handler = SmartRetry(
             strategies=strategies,
diff --git a/docs/RETRY_CONFIGURATION.md b/docs/RETRY_CONFIGURATION.md
index 0b8bad7b3..e1acf3c5e 100644
--- a/docs/RETRY_CONFIGURATION.md
+++ b/docs/RETRY_CONFIGURATION.md
@@ -27,7 +27,7 @@ Each error type has its own retry strategy:
 
 The wait time before each retry uses exponential backoff:
 
-```
+```text
 wait_time = backoff_factor × 2^(attempt - 1)
 ```
 
@@ -101,7 +101,7 @@ cortex install package --execute
 
 During retries, Cortex displays messages like:
 
-```
+```text
 ⚠️ NETWORK_ERROR detected. Retrying in 2.0s... (Attempt 2/5)
 ```
 
diff --git a/tests/test_retry.py b/tests/test_retry.py
index c6d8a659e..9061b6a70 100644
--- a/tests/test_retry.py
+++ b/tests/test_retry.py
@@ -56,6 +56,7 @@ def test_network_override_from_env(self):
         with patch.dict(
             "os.environ",
             {"CORTEX_RETRY_NETWORK_MAX": "10", "CORTEX_RETRY_NETWORK_BACKOFF": "0.5"},
+            clear=True,
         ):
             strategies = load_strategies_from_env()
             self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 10)
@@ -65,6 +66,7 @@ def test_lock_override_from_env(self):
         with patch.dict(
             "os.environ",
             {"CORTEX_RETRY_LOCK_MAX": "6", "CORTEX_RETRY_LOCK_BACKOFF": "10.0"},
+            clear=True,
         ):
             strategies = load_strategies_from_env()
             self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].max_retries, 6)