diff --git a/cortex/cli.py b/cortex/cli.py index 267228b0..6efa9504 100644 --- a/cortex/cli.py +++ b/cortex/cli.py @@ -40,6 +40,7 @@ ) from cortex.update_checker import UpdateChannel, should_notify_update from cortex.updater import Updater, UpdateStatus +from cortex.utils.retry import DEFAULT_MAX_RETRIES from cortex.validators import validate_api_key, validate_install_request from cortex.version_manager import get_version_string @@ -1445,6 +1446,7 @@ def install( dry_run: bool = False, parallel: bool = False, json_output: bool = False, + max_retries: int = DEFAULT_MAX_RETRIES, ) -> int: """Install software using the LLM-powered package manager.""" # Initialize installation history @@ -1670,6 +1672,7 @@ def parallel_log_callback(message: str, level: str = "info"): timeout=300, stop_on_error=True, progress_callback=progress_callback, + max_retries=max_retries, ) result = coordinator.execute() diff --git a/cortex/coordinator.py b/cortex/coordinator.py index ac19bf80..1f982c96 100644 --- a/cortex/coordinator.py +++ b/cortex/coordinator.py @@ -9,6 +9,13 @@ from enum import Enum from typing import Any +from cortex.utils.retry import ( + DEFAULT_MAX_RETRIES, + ErrorCategory, + RetryStrategy, + SmartRetry, + load_strategies_from_env, +) from cortex.validators import DANGEROUS_PATTERNS logger = logging.getLogger(__name__) @@ -60,6 +67,7 @@ def __init__( enable_rollback: bool = False, log_file: str | None = None, progress_callback: Callable[[int, int, InstallationStep], None] | None = None, + max_retries: int = DEFAULT_MAX_RETRIES, ): """Initialize an installation run with optional logging and rollback.""" self.timeout = timeout @@ -67,6 +75,7 @@ def __init__( self.enable_rollback = enable_rollback self.log_file = log_file self.progress_callback = progress_callback + self.max_retries = max_retries if descriptions and len(descriptions) != len(commands): raise ValueError("Number of descriptions must match number of commands") @@ -90,6 +99,7 @@ def from_plan( enable_rollback: 
bool | None = None, log_file: str | None = None, progress_callback: Callable[[int, int, InstallationStep], None] | None = None, + max_retries: int = DEFAULT_MAX_RETRIES, ) -> "InstallationCoordinator": """Create a coordinator from a structured plan produced by an LLM. @@ -124,6 +134,7 @@ def from_plan( ), log_file=log_file, progress_callback=progress_callback, + max_retries=max_retries, ) for rollback_cmd in rollback_commands: @@ -174,14 +185,39 @@ def _execute_command(self, step: InstallationStep) -> bool: self._log(f"Command blocked: {step.command} - {error}") return False - try: + def run_cmd() -> subprocess.CompletedProcess[str]: # Use shell=True carefully - commands are validated first # For complex shell commands (pipes, redirects), shell=True is needed # Simple commands could use shlex.split() with shell=False - result = subprocess.run( + return subprocess.run( step.command, shell=True, capture_output=True, text=True, timeout=self.timeout ) + def status_callback(msg: str) -> None: + self._log(msg) + # Only print to stdout if no progress callback is configured to avoid duplicates + if self.progress_callback is None: + print(msg) + + # Load strategies and apply CLI override for network errors + strategies = load_strategies_from_env() + if ErrorCategory.NETWORK_ERROR in strategies: + # Create a new instance to avoid mutating the shared default object + original_strategy = strategies[ErrorCategory.NETWORK_ERROR] + strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=self.max_retries, + backoff_factor=original_strategy.backoff_factor, + description=original_strategy.description, + ) + + retry_handler = SmartRetry( + strategies=strategies, + status_callback=status_callback, + ) + + try: + result = retry_handler.run(run_cmd) + step.return_code = result.returncode step.output = result.stdout step.error = result.stderr diff --git a/cortex/utils/retry.py b/cortex/utils/retry.py new file mode 100644 index 00000000..ed590361 --- /dev/null +++ b/cortex/utils/retry.py @@
-0,0 +1,222 @@ +import logging +import os +import time +from collections.abc import Callable +from dataclasses import dataclass +from typing import Any + +from cortex.error_parser import ErrorCategory, ErrorParser + +logger = logging.getLogger(__name__) + +# Default maximum number of retries for the global retry setting +DEFAULT_MAX_RETRIES = 5 + + +@dataclass +class RetryStrategy: + """Configuration for how to retry a specific error type.""" + + max_retries: int + backoff_factor: float + description: str + + +# Default strategies for each retryable error category +DEFAULT_STRATEGIES: dict[ErrorCategory, RetryStrategy] = { + ErrorCategory.NETWORK_ERROR: RetryStrategy( + max_retries=DEFAULT_MAX_RETRIES, + backoff_factor=1.0, + description="Network issues - retry aggressively with short backoff", + ), + ErrorCategory.LOCK_ERROR: RetryStrategy( + max_retries=3, + backoff_factor=5.0, + description="Lock contention - wait longer between retries", + ), + ErrorCategory.UNKNOWN: RetryStrategy( + max_retries=2, + backoff_factor=2.0, + description="Unknown errors - conservative retry", + ), +} + +# Permanent error categories that should never be retried +PERMANENT_ERRORS: set[ErrorCategory] = { + ErrorCategory.PERMISSION_DENIED, + ErrorCategory.PACKAGE_NOT_FOUND, + ErrorCategory.CONFIGURATION_ERROR, + ErrorCategory.DEPENDENCY_MISSING, + ErrorCategory.CONFLICT, + ErrorCategory.DISK_SPACE, +} + + +def load_strategies_from_env() -> dict[ErrorCategory, RetryStrategy]: + """ + Load retry strategies from environment variables, falling back to defaults. 
+ + Environment variables: + CORTEX_RETRY_NETWORK_MAX: Max retries for network errors (default: 5) + CORTEX_RETRY_NETWORK_BACKOFF: Backoff factor for network errors (default: 1.0) + CORTEX_RETRY_LOCK_MAX: Max retries for lock errors (default: 3) + CORTEX_RETRY_LOCK_BACKOFF: Backoff factor for lock errors (default: 5.0) + CORTEX_RETRY_UNKNOWN_MAX: Max retries for unknown errors (default: 2) + CORTEX_RETRY_UNKNOWN_BACKOFF: Backoff factor for unknown errors (default: 2.0) + """ + strategies = dict(DEFAULT_STRATEGIES) + + # Network error overrides + if os.getenv("CORTEX_RETRY_NETWORK_MAX") or os.getenv("CORTEX_RETRY_NETWORK_BACKOFF"): + strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_NETWORK_MAX", "5")), + backoff_factor=float(os.getenv("CORTEX_RETRY_NETWORK_BACKOFF", "1.0")), + description="Network issues (user-configured)", + ) + + # Lock error overrides + if os.getenv("CORTEX_RETRY_LOCK_MAX") or os.getenv("CORTEX_RETRY_LOCK_BACKOFF"): + strategies[ErrorCategory.LOCK_ERROR] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_LOCK_MAX", "3")), + backoff_factor=float(os.getenv("CORTEX_RETRY_LOCK_BACKOFF", "5.0")), + description="Lock contention (user-configured)", + ) + + # Unknown error overrides + if os.getenv("CORTEX_RETRY_UNKNOWN_MAX") or os.getenv("CORTEX_RETRY_UNKNOWN_BACKOFF"): + strategies[ErrorCategory.UNKNOWN] = RetryStrategy( + max_retries=int(os.getenv("CORTEX_RETRY_UNKNOWN_MAX", "2")), + backoff_factor=float(os.getenv("CORTEX_RETRY_UNKNOWN_BACKOFF", "2.0")), + description="Unknown errors (user-configured)", + ) + + return strategies + + +class SmartRetry: + """ + Implements smart retry logic with exponential backoff. + Uses ErrorParser to distinguish between transient and permanent errors. + Supports different retry strategies per error category. 
+ """ + + def __init__( + self, + strategies: dict[ErrorCategory, RetryStrategy] | None = None, + status_callback: Callable[[str], None] | None = None, + ): + """ + Initialize SmartRetry with optional custom strategies. + + Args: + strategies: Custom retry strategies per error category. + If None, loads from environment or uses defaults. + status_callback: Optional callback for status messages. + """ + self.strategies = strategies if strategies is not None else load_strategies_from_env() + + # Validate strategies + for category, strategy in self.strategies.items(): + if strategy.max_retries < 0: + raise ValueError(f"Strategy for {category.name}: max_retries must be non-negative") + if strategy.backoff_factor <= 0: + raise ValueError(f"Strategy for {category.name}: backoff_factor must be positive") + + self.status_callback = status_callback + self.error_parser = ErrorParser() + + def run(self, func: Callable[[], Any]) -> Any: + """ + Run a function with smart retry logic. + + Args: + func: The function to execute. Expected to return a result object + that has `returncode`, `stdout`, and `stderr` attributes + (like subprocess.CompletedProcess), or raise an exception. + + Returns: + The result of the function call. 
+ """ + attempt = 0 + last_exception = None + last_result = None + current_strategy: RetryStrategy | None = None + + while True: + try: + result = func() + last_result = result + + # If result indicates success (returncode 0), return immediately + if hasattr(result, "returncode") and result.returncode == 0: + return result + + # If result indicates failure, analyze it + error_msg = "" + if hasattr(result, "stderr") and result.stderr: + error_msg = result.stderr + + category = self._get_error_category(error_msg) + current_strategy = self._get_strategy(category) + + if current_strategy is None: + # Permanent error - fail fast + return result + + except Exception as e: + last_exception = e + category = self._get_error_category(str(e)) + current_strategy = self._get_strategy(category) + + if current_strategy is None: + # Permanent error - fail fast + raise + + # Check if we've exhausted retries for this strategy + if current_strategy is None or attempt >= current_strategy.max_retries: + break + + attempt += 1 + sleep_time = current_strategy.backoff_factor * (2 ** (attempt - 1)) + + category_name = category.name if category else "UNKNOWN" + msg = ( + f"⚠️ {category_name} detected. " + f"Retrying in {sleep_time}s... (Retry {attempt}/{current_strategy.max_retries})" + ) + logger.warning(msg) + if self.status_callback: + self.status_callback(msg) + + time.sleep(sleep_time) + + if last_exception: + raise last_exception + return last_result + + def _get_error_category(self, error_message: str) -> ErrorCategory | None: + """Classify the error message into a category.""" + if not error_message: + logger.warning("Retry: Empty error message detected. 
Assuming UNKNOWN (transient).") + return ErrorCategory.UNKNOWN + + analysis = self.error_parser.parse_error(error_message) + + # If the error is explicitly marked as not fixable, treat as permanent + if not analysis.is_fixable: + return None + + return analysis.primary_category + + def _get_strategy(self, category: ErrorCategory | None) -> RetryStrategy | None: + """ + Get the retry strategy for a given error category. + Returns None for permanent errors (should not retry). + """ + if category is None: + return None + + if category in PERMANENT_ERRORS: + return None + + return self.strategies.get(category) diff --git a/docs/COMMANDS.md b/docs/COMMANDS.md index a194d67f..cf740064 100644 --- a/docs/COMMANDS.md +++ b/docs/COMMANDS.md @@ -73,6 +73,7 @@ cortex install "python3 with pip and virtualenv" --execute - Without `--execute`, Cortex only shows the commands it would run - The `--dry-run` flag is recommended for first-time use to verify commands - Installation is recorded in history for potential rollback +- **Smart Retry Logic**: Cortex automatically detects transient failures (like network timeouts) and retries commands with exponential backoff (up to 5 attempts). Permanent errors (like permission denied) fail immediately. --- diff --git a/docs/RETRY_CONFIGURATION.md b/docs/RETRY_CONFIGURATION.md new file mode 100644 index 00000000..e1acf3c5 --- /dev/null +++ b/docs/RETRY_CONFIGURATION.md @@ -0,0 +1,173 @@ +# Retry Configuration Guide + +Cortex CLI includes a **Smart Retry** mechanism that automatically recovers from transient failures during package installations. This guide explains how retry logic works and how to configure it. + +## How It Works + +When an installation command fails, Cortex analyzes the error to determine if it's: + +1. **Transient** (temporary, likely to resolve): Network timeouts, lock contention, etc. +2. **Permanent** (unlikely to resolve): Permission denied, package not found, disk full, etc. 
+ +For transient errors, Cortex retries the command with **exponential backoff**—waiting progressively longer between attempts (1s, 2s, 4s, etc.) to allow the issue to resolve. + +## Default Retry Strategies + +Each error type has its own retry strategy: + +| Error Type | Max Retries | Base Backoff | Rationale | +|------------|-------------|--------------|-----------| +| **Network Error** | 5 | 1.0s | Short blips resolve quickly; retry aggressively | +| **Lock Error** | 3 | 5.0s | Locks take time to release; wait longer | +| **Unknown Error** | 2 | 2.0s | Conservative approach for unclassified errors | + +**Permanent errors** (Permission Denied, Package Not Found, Disk Space, Dependency Missing, Configuration Error, Conflict) **never retry**—they fail immediately. + +## Backoff Calculation + +The wait time before each retry uses exponential backoff: + +```text +wait_time = backoff_factor × 2^(attempt - 1) +``` + +Example for Network Error (backoff_factor = 1.0): +- Attempt 1: 1.0s wait +- Attempt 2: 2.0s wait +- Attempt 3: 4.0s wait +- Attempt 4: 8.0s wait +- Attempt 5: 16.0s wait + +## Configuration via Environment Variables + +Override default strategies using environment variables: + +### Network Error Configuration +```bash +export CORTEX_RETRY_NETWORK_MAX=10 # Max retry attempts (default: 5) +export CORTEX_RETRY_NETWORK_BACKOFF=0.5 # Base backoff in seconds (default: 1.0) +``` + +### Lock Error Configuration +```bash +export CORTEX_RETRY_LOCK_MAX=5 # Max retry attempts (default: 3) +export CORTEX_RETRY_LOCK_BACKOFF=10.0 # Base backoff in seconds (default: 5.0) +``` + +### Unknown Error Configuration +```bash +export CORTEX_RETRY_UNKNOWN_MAX=3 # Max retry attempts (default: 2) +export CORTEX_RETRY_UNKNOWN_BACKOFF=1.0 # Base backoff in seconds (default: 2.0) +``` + +## Examples + +### Aggressive Retry for Unstable Networks + +If you're on an unstable connection and want more retries with shorter waits: + +```bash +export CORTEX_RETRY_NETWORK_MAX=10 +export 
CORTEX_RETRY_NETWORK_BACKOFF=0.5 +cortex install docker --execute +``` + +This gives up to 10 retries (11 total attempts) with waits: 0.5s, 1s, 2s, 4s, 8s, 16s, 32s, 64s, 128s, 256s. + +### Patient Retry for Shared Systems + +If you're on a shared server where `apt` locks are common: + +```bash +export CORTEX_RETRY_LOCK_MAX=5 +export CORTEX_RETRY_LOCK_BACKOFF=30.0 +cortex install nginx --execute +``` + +This gives up to 5 retries (6 total attempts) with waits: 30s, 60s, 120s, 240s, 480s (about 15.5 minutes of cumulative waiting). + +### Disable All Retries + +For CI/CD pipelines where you want fast failure: + +```bash +export CORTEX_RETRY_NETWORK_MAX=0 +export CORTEX_RETRY_LOCK_MAX=0 +export CORTEX_RETRY_UNKNOWN_MAX=0 +cortex install package --execute +``` + +## User Feedback + +During retries, Cortex displays messages like: + +```text +⚠️ NETWORK_ERROR detected. Retrying in 2.0s... (Retry 2/5) +``` + +This shows: +- The error type that was detected +- How long until the next attempt +- The current retry number and the maximum number of retries + +## Error Categories Reference + +### Transient (Retried) + +| Category | Example Errors | +|----------|----------------| +| `NETWORK_ERROR` | "Connection timed out", "Temporary failure resolving" | +| `LOCK_ERROR` | "Could not get lock", "dpkg was interrupted" | +| `UNKNOWN` | Unclassified errors that might be transient | + +### Permanent (Never Retried) + +| Category | Example Errors | +|----------|----------------| +| `PERMISSION_DENIED` | "Permission denied", "Operation not permitted" | +| `PACKAGE_NOT_FOUND` | "Unable to locate package", "No such package" | +| `DISK_SPACE` | "No space left on device" | +| `DEPENDENCY_MISSING` | "Depends: X but it is not installable" | +| `CONFIGURATION_ERROR` | "Configuration file syntax error" | +| `CONFLICT` | "Conflicts with package X" | + +## Programmatic Usage + +For advanced use cases, you can customize strategies in code: + +```python +from cortex.utils.retry import SmartRetry, RetryStrategy, DEFAULT_STRATEGIES +from cortex.error_parser import
ErrorCategory + +# Custom strategies +custom_strategies = dict(DEFAULT_STRATEGIES) +custom_strategies[ErrorCategory.NETWORK_ERROR] = RetryStrategy( + max_retries=10, + backoff_factor=0.5, + description="Custom network retry" +) + +retry = SmartRetry(strategies=custom_strategies) +result = retry.run(my_function) +``` + +## Troubleshooting + +### Retries Not Happening + +1. Check if the error is classified as permanent (see table above) +2. Verify environment variables are set correctly +3. Run with `--verbose` to see detailed error classification + +### Retries Taking Too Long + +Reduce `backoff_factor` or `max_retries` via environment variables. + +### Need More Aggressive Retries + +Increase `max_retries` and decrease `backoff_factor`. + +--- + +**Version**: 0.9.0 +**Last Updated**: January 2026 diff --git a/tests/test_coordinator.py b/tests/test_coordinator.py index a0ad03d4..bf858de1 100644 --- a/tests/test_coordinator.py +++ b/tests/test_coordinator.py @@ -31,8 +31,9 @@ def test_step_duration(self): self.assertEqual(step.duration(), 5.5) +@patch("time.sleep") class TestInstallationCoordinator(unittest.TestCase): - def test_initialization(self): + def test_initialization(self, mock_sleep): commands = ["echo 1", "echo 2"] coordinator = InstallationCoordinator(commands) @@ -40,7 +41,7 @@ def test_initialization(self): self.assertEqual(coordinator.steps[0].command, "echo 1") self.assertEqual(coordinator.steps[1].command, "echo 2") - def test_from_plan_initialization(self): + def test_from_plan_initialization(self, mock_sleep): plan = [ {"command": "echo 1", "description": "First step"}, {"command": "echo 2", "rollback": "echo rollback"}, @@ -54,7 +55,7 @@ def test_from_plan_initialization(self): self.assertTrue(coordinator.enable_rollback) self.assertEqual(coordinator.rollback_commands, ["echo rollback"]) - def test_initialization_with_descriptions(self): + def test_initialization_with_descriptions(self, mock_sleep): commands = ["echo 1", "echo 2"] descriptions = 
["First", "Second"] coordinator = InstallationCoordinator(commands, descriptions) @@ -62,7 +63,7 @@ def test_initialization_with_descriptions(self): self.assertEqual(coordinator.steps[0].description, "First") self.assertEqual(coordinator.steps[1].description, "Second") - def test_initialization_mismatched_descriptions(self): + def test_initialization_mismatched_descriptions(self, mock_sleep): commands = ["echo 1", "echo 2"] descriptions = ["First"] @@ -70,7 +71,7 @@ def test_initialization_mismatched_descriptions(self): InstallationCoordinator(commands, descriptions) @patch("subprocess.run") - def test_execute_single_success(self, mock_run): + def test_execute_single_success(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" @@ -85,7 +86,7 @@ def test_execute_single_success(self, mock_run): self.assertEqual(result.steps[0].status, StepStatus.SUCCESS) @patch("subprocess.run") - def test_execute_single_failure(self, mock_run): + def test_execute_single_failure(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 1 mock_result.stdout = "" @@ -100,7 +101,7 @@ def test_execute_single_failure(self, mock_run): self.assertEqual(result.steps[0].status, StepStatus.FAILED) @patch("subprocess.run") - def test_execute_multiple_success(self, mock_run): + def test_execute_multiple_success(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" @@ -115,7 +116,7 @@ def test_execute_multiple_success(self, mock_run): self.assertTrue(all(s.status == StepStatus.SUCCESS for s in result.steps)) @patch("subprocess.run") - def test_execute_stop_on_error(self, mock_run): + def test_execute_stop_on_error(self, mock_run, mock_sleep): def side_effect(*args, **kwargs): cmd = args[0] if args else kwargs.get("shell") if "fail" in str(cmd): @@ -143,7 +144,7 @@ def side_effect(*args, **kwargs): self.assertEqual(result.steps[2].status, StepStatus.SKIPPED) 
@patch("subprocess.run") - def test_execute_continue_on_error(self, mock_run): + def test_execute_continue_on_error(self, mock_run, mock_sleep): def side_effect(*args, **kwargs): cmd = args[0] if args else kwargs.get("shell") if "fail" in str(cmd): @@ -170,7 +171,7 @@ def side_effect(*args, **kwargs): self.assertEqual(result.steps[2].status, StepStatus.SUCCESS) @patch("subprocess.run") - def test_timeout_handling(self, mock_run): + def test_timeout_handling(self, mock_run, mock_sleep): mock_run.side_effect = Exception("Timeout") coordinator = InstallationCoordinator(["sleep 1000"], timeout=1) @@ -179,7 +180,7 @@ def test_timeout_handling(self, mock_run): self.assertFalse(result.success) self.assertEqual(result.steps[0].status, StepStatus.FAILED) - def test_progress_callback(self): + def test_progress_callback(self, mock_sleep): callback_calls = [] def callback(current, total, step): @@ -199,7 +200,7 @@ def callback(current, total, step): self.assertEqual(callback_calls[0], (1, 2, "echo 1")) self.assertEqual(callback_calls[1], (2, 2, "echo 2")) - def test_log_file(self): + def test_log_file(self, mock_sleep): with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".log") as f: log_file = f.name @@ -223,7 +224,7 @@ def test_log_file(self): os.unlink(log_file) @patch("subprocess.run") - def test_rollback(self, mock_run): + def test_rollback(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 1 mock_result.stdout = "" @@ -238,7 +239,7 @@ def test_rollback(self, mock_run): self.assertGreaterEqual(mock_run.call_count, 2) @patch("subprocess.run") - def test_verify_installation(self, mock_run): + def test_verify_installation(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "Docker version 20.10.0" @@ -252,7 +253,7 @@ def test_verify_installation(self, mock_run): self.assertTrue(verify_results["docker --version"]) - def test_get_summary(self): + def test_get_summary(self, mock_sleep): with 
patch("subprocess.run") as mock_run: mock_result = Mock() mock_result.returncode = 0 @@ -270,7 +271,7 @@ def test_get_summary(self): self.assertEqual(summary["failed"], 0) self.assertEqual(summary["skipped"], 0) - def test_export_log(self): + def test_export_log(self, mock_sleep): with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".json") as f: export_file = f.name @@ -299,7 +300,7 @@ def test_export_log(self): os.unlink(export_file) @patch("subprocess.run") - def test_step_timing(self, mock_run): + def test_step_timing(self, mock_run, mock_sleep): mock_result = Mock() mock_result.returncode = 0 mock_result.stdout = "success" diff --git a/tests/test_retry.py b/tests/test_retry.py new file mode 100644 index 00000000..9061b6a7 --- /dev/null +++ b/tests/test_retry.py @@ -0,0 +1,214 @@ +import unittest +from unittest.mock import Mock, patch + +from cortex.error_parser import ErrorCategory +from cortex.utils.retry import ( + DEFAULT_STRATEGIES, + PERMANENT_ERRORS, + RetryStrategy, + SmartRetry, + load_strategies_from_env, +) + + +class TestRetryStrategy(unittest.TestCase): + """Tests for the RetryStrategy dataclass.""" + + def test_strategy_creation(self): + strategy = RetryStrategy(max_retries=5, backoff_factor=1.0, description="Test") + self.assertEqual(strategy.max_retries, 5) + self.assertEqual(strategy.backoff_factor, 1.0) + self.assertEqual(strategy.description, "Test") + + +class TestDefaultStrategies(unittest.TestCase): + """Tests for default strategy configurations.""" + + def test_network_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.NETWORK_ERROR] + self.assertEqual(strategy.max_retries, 5) + self.assertEqual(strategy.backoff_factor, 1.0) + + def test_lock_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.LOCK_ERROR] + self.assertEqual(strategy.max_retries, 3) + self.assertEqual(strategy.backoff_factor, 5.0) + + def test_unknown_error_strategy(self): + strategy = DEFAULT_STRATEGIES[ErrorCategory.UNKNOWN] + 
self.assertEqual(strategy.max_retries, 2) + self.assertEqual(strategy.backoff_factor, 2.0) + + def test_permanent_errors_not_in_strategies(self): + for error in PERMANENT_ERRORS: + self.assertNotIn(error, DEFAULT_STRATEGIES) + + +class TestLoadStrategiesFromEnv(unittest.TestCase): + """Tests for environment variable configuration.""" + + def test_default_strategies_when_no_env_vars(self): + with patch.dict("os.environ", {}, clear=True): + strategies = load_strategies_from_env() + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 5) + + def test_network_override_from_env(self): + with patch.dict( + "os.environ", + {"CORTEX_RETRY_NETWORK_MAX": "10", "CORTEX_RETRY_NETWORK_BACKOFF": "0.5"}, + clear=True, + ): + strategies = load_strategies_from_env() + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].max_retries, 10) + self.assertEqual(strategies[ErrorCategory.NETWORK_ERROR].backoff_factor, 0.5) + + def test_lock_override_from_env(self): + with patch.dict( + "os.environ", + {"CORTEX_RETRY_LOCK_MAX": "6", "CORTEX_RETRY_LOCK_BACKOFF": "10.0"}, + clear=True, + ): + strategies = load_strategies_from_env() + self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].max_retries, 6) + self.assertEqual(strategies[ErrorCategory.LOCK_ERROR].backoff_factor, 10.0) + + +class TestSmartRetry(unittest.TestCase): + """Tests for SmartRetry class.""" + + def setUp(self): + # Use custom strategies with short backoff for fast tests + self.fast_strategies = { + ErrorCategory.NETWORK_ERROR: RetryStrategy(3, 0.01, "Test network"), + ErrorCategory.LOCK_ERROR: RetryStrategy(2, 0.01, "Test lock"), + ErrorCategory.UNKNOWN: RetryStrategy(2, 0.01, "Test unknown"), + } + self.retry = SmartRetry(strategies=self.fast_strategies) + + def test_success_first_try(self): + mock_func = Mock() + mock_result = Mock() + mock_result.returncode = 0 + mock_func.return_value = mock_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, mock_result) + 
self.assertEqual(mock_func.call_count, 1) + + @patch("cortex.utils.retry.time.sleep") + def test_retry_on_network_error(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + success_result = Mock() + success_result.returncode = 0 + + mock_func.side_effect = [fail_result, fail_result, success_result] + + result = self.retry.run(mock_func) + + self.assertEqual(result, success_result) + self.assertEqual(mock_func.call_count, 3) + self.assertEqual(mock_sleep.call_count, 2) + + @patch("cortex.utils.retry.time.sleep") + def test_fail_fast_on_permission_denied(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Permission denied" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + self.assertEqual(mock_func.call_count, 1) + mock_sleep.assert_not_called() + + @patch("cortex.utils.retry.time.sleep") + def test_fail_fast_on_disk_space(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "No space left on device" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + self.assertEqual(mock_func.call_count, 1) + mock_sleep.assert_not_called() + + @patch("cortex.utils.retry.time.sleep") + def test_max_retries_exceeded(self, mock_sleep): + mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + # 1 initial + 3 retries for network error strategy + self.assertEqual(mock_func.call_count, 4) + self.assertEqual(mock_sleep.call_count, 3) + + @patch("cortex.utils.retry.time.sleep") + def test_different_strategy_for_lock_error(self, mock_sleep): + 
mock_func = Mock() + + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Could not get lock /var/lib/apt/lists/lock" + + mock_func.return_value = fail_result + + result = self.retry.run(mock_func) + + self.assertEqual(result, fail_result) + # Lock error strategy has max_retries=2, so 1 initial + 2 retries = 3 + self.assertEqual(mock_func.call_count, 3) + self.assertEqual(mock_sleep.call_count, 2) + + @patch("cortex.utils.retry.time.sleep") + def test_callback_notification(self, mock_sleep): + callback = Mock() + retry = SmartRetry(strategies=self.fast_strategies, status_callback=callback) + + mock_func = Mock() + fail_result = Mock() + fail_result.returncode = 1 + fail_result.stderr = "Connection timed out" + + mock_func.side_effect = [fail_result, Mock(returncode=0)] + + retry.run(mock_func) + + callback.assert_called_once() + self.assertIn("NETWORK_ERROR", callback.call_args[0][0]) + self.assertIn("Retrying", callback.call_args[0][0]) + + @patch("cortex.utils.retry.time.sleep") + def test_exception_retry(self, mock_sleep): + mock_func = Mock() + mock_func.side_effect = [Exception("Network error"), Mock(returncode=0)] + + result = self.retry.run(mock_func) + + self.assertEqual(result.returncode, 0) + self.assertEqual(mock_func.call_count, 2) + + +if __name__ == "__main__": + unittest.main()