diff --git a/cmd/mcpproxy-tray/internal/api/client.go b/cmd/mcpproxy-tray/internal/api/client.go index 9fa17337..8c99c8bc 100644 --- a/cmd/mcpproxy-tray/internal/api/client.go +++ b/cmd/mcpproxy-tray/internal/api/client.go @@ -541,6 +541,25 @@ func (c *Client) RestartServer(serverName string) error { return nil } +// ForceReconnectAllServers triggers reconnection attempts for all upstream servers +func (c *Client) ForceReconnectAllServers(reason string) error { + endpoint := "/api/v1/servers/reconnect" + if reason != "" { + endpoint = endpoint + "?reason=" + url.QueryEscape(reason) + } + + resp, err := c.makeRequest("POST", endpoint, nil) + if err != nil { + return err + } + + if !resp.Success { + return fmt.Errorf("API error: %s", resp.Error) + } + + return nil +} + // TriggerOAuthLogin triggers OAuth login for a server func (c *Client) TriggerOAuthLogin(serverName string) error { endpoint := fmt.Sprintf("/api/v1/servers/%s/login", serverName) diff --git a/cmd/mcpproxy-tray/internal/state/machine.go b/cmd/mcpproxy-tray/internal/state/machine.go index 7a19c725..ea8670fc 100644 --- a/cmd/mcpproxy-tray/internal/state/machine.go +++ b/cmd/mcpproxy-tray/internal/state/machine.go @@ -205,6 +205,8 @@ func (m *Machine) determineNewState(currentState State, event Event) State { return StateCoreErrorPortConflict case EventDBLocked: return StateCoreErrorDBLocked + case EventDockerUnavailable: + return StateCoreErrorDocker case EventConfigError: return StateCoreErrorConfig case EventPermissionError: @@ -225,6 +227,8 @@ func (m *Machine) determineNewState(currentState State, event Event) State { return StateCoreErrorPortConflict case EventDBLocked: return StateCoreErrorDBLocked + case EventDockerUnavailable: + return StateCoreErrorDocker case EventConfigError: return StateCoreErrorConfig case EventPermissionError: @@ -252,6 +256,8 @@ func (m *Machine) determineNewState(currentState State, event Event) State { return StateCoreErrorPortConflict case EventDBLocked: return StateCoreErrorDBLocked + case EventDockerUnavailable: + return StateCoreErrorDocker case EventConfigError: return StateCoreErrorConfig case EventPermissionError: @@ -287,20 +293,28 @@ func (m *Machine) determineNewState(currentState State, event Event) State { return StateShuttingDown } - case StateCoreErrorPortConflict, StateCoreErrorDBLocked, StateCoreErrorGeneral: + case StateCoreErrorPortConflict, StateCoreErrorDBLocked, StateCoreErrorPermission, StateCoreErrorGeneral: switch event { case EventShutdown: return StateShuttingDown - // Error states persist - require user to fix issue manually - // No auto-retry or auto-transition to failed state + // Error states persist - require user to fix issue manually + // No auto-retry or auto-transition to failed state } case StateCoreErrorConfig: switch event { case EventShutdown: return StateShuttingDown - // Config errors persist - require user to fix config manually - // Stay in StateCoreErrorConfig for all other events + // Config errors persist - require user to fix config manually + // Stay in StateCoreErrorConfig for all other events + } + + case StateCoreErrorDocker: + switch event { + case EventRetry: + return StateLaunchingCore + case EventShutdown: + return StateShuttingDown } case StateFailed: diff --git a/cmd/mcpproxy-tray/internal/state/states.go b/cmd/mcpproxy-tray/internal/state/states.go index b6785e2f..898ab955 100644 --- a/cmd/mcpproxy-tray/internal/state/states.go +++ b/cmd/mcpproxy-tray/internal/state/states.go @@ -34,6 +34,12 @@ const ( // StateCoreErrorDBLocked 
represents core failed due to database lock StateCoreErrorDBLocked State = "core_error_db_locked" + // StateCoreErrorDocker represents core failed due to Docker being unavailable + StateCoreErrorDocker State = "core_error_docker" + + // StateCoreRecoveringDocker represents Docker recovery in progress + StateCoreRecoveringDocker State = "core_recovering_docker" + // StateCoreErrorConfig represents core failed due to configuration error StateCoreErrorConfig State = "core_error_config" @@ -84,6 +90,12 @@ const ( // EventPermissionError indicates core failed due to permission error EventPermissionError Event = "permission_error" + // EventDockerUnavailable indicates Docker engine is unavailable or paused + EventDockerUnavailable Event = "docker_unavailable" + + // EventDockerRecovered indicates Docker engine became available again + EventDockerRecovered Event = "docker_recovered" + // EventGeneralError indicates core failed with general error EventGeneralError Event = "general_error" @@ -112,7 +124,7 @@ type Info struct { // GetInfo returns metadata for a given state func GetInfo(state State) Info { - timeout90s := 90 * time.Second // Must exceed health monitor's readinessTimeout (60s) + timeout90s := 90 * time.Second // Must exceed health monitor's readinessTimeout (60s) timeout5s := 5 * time.Second timeout10s := 10 * time.Second @@ -133,7 +145,7 @@ func GetInfo(state State) Info { Name: StateWaitingForCore, Description: "Waiting for core to become ready", UserMessage: "Core starting up...", - Timeout: &timeout90s, // Increased to 90s to allow Docker isolation startup (health timeout is 60s) + Timeout: &timeout90s, // Increased to 90s to allow Docker isolation startup (health timeout is 60s) }, StateConnectingAPI: { Name: StateConnectingAPI, @@ -176,6 +188,20 @@ func GetInfo(state State) Info { CanRetry: false, // No timeout - config errors persist until user fixes the config }, + StateCoreErrorDocker: { + Name: StateCoreErrorDocker, + Description: "Docker engine unavailable or paused", + UserMessage: "Docker engine unavailable - resume Docker Desktop", + IsError: true, + CanRetry: true, + }, + StateCoreRecoveringDocker: { + Name: StateCoreRecoveringDocker, + Description: "Docker recovery in progress", + UserMessage: "Docker engine recovered - reconnecting servers...", + CanRetry: false, + Timeout: &timeout10s, + }, StateCoreErrorPermission: { Name: StateCoreErrorPermission, Description: "Core failed due to permission error", @@ -229,6 +255,7 @@ func CanTransition(from, to State) bool { StateWaitingForCore, StateCoreErrorPortConflict, StateCoreErrorDBLocked, + StateCoreErrorDocker, StateCoreErrorConfig, StateCoreErrorGeneral, StateShuttingDown, @@ -237,7 +264,8 @@ func CanTransition(from, to State) bool { StateConnectingAPI, StateCoreErrorPortConflict, // ADD: Handle port conflict StateCoreErrorDBLocked, // ADD: Handle DB lock - StateCoreErrorConfig, // ADD: Handle config error + StateCoreErrorDocker, + StateCoreErrorConfig, // ADD: Handle config error StateCoreErrorGeneral, StateLaunchingCore, // Retry StateShuttingDown, @@ -247,7 +275,8 @@ func CanTransition(from, to State) bool { StateReconnecting, StateCoreErrorPortConflict, // ADD: Handle port conflict during connection StateCoreErrorDBLocked, // ADD: Handle DB lock during connection - StateCoreErrorConfig, // ADD: Handle config error during connection + StateCoreErrorDocker, + StateCoreErrorConfig, // ADD: Handle config error during connection StateCoreErrorGeneral, StateShuttingDown, }, @@ -273,6 +302,15 @@ func CanTransition(from, to 
State) bool { // Error persists - only shutdown allowed StateShuttingDown, }, + StateCoreErrorDocker: { + StateCoreRecoveringDocker, // Transition to recovering when Docker comes back + StateShuttingDown, + }, + StateCoreRecoveringDocker: { + StateLaunchingCore, // Launch core after Docker recovery + StateCoreErrorDocker, // Back to error if Docker fails again + StateShuttingDown, + }, StateCoreErrorGeneral: { // Error persists - only shutdown allowed StateShuttingDown, diff --git a/cmd/mcpproxy-tray/main.go b/cmd/mcpproxy-tray/main.go index e0bd0534..6aaf80f4 100644 --- a/cmd/mcpproxy-tray/main.go +++ b/cmd/mcpproxy-tray/main.go @@ -3,6 +3,7 @@ package main import ( + "bytes" "context" "crypto/rand" "encoding/hex" @@ -20,6 +21,7 @@ import ( "runtime" "strconv" "strings" + "sync" "syscall" "time" @@ -41,8 +43,12 @@ var ( version = "development" // Set by build flags defaultCoreURL = "http://127.0.0.1:8080" errNoBundledCore = errors.New("no bundled core binary found") - trayAPIKey = "" // API key generated for core communication + trayAPIKey = "" // API key generated for core communication shutdownComplete = make(chan struct{}) // Signal when shutdown is complete + shutdownOnce sync.Once + + errDockerPaused = errors.New("docker engine paused") + errDockerUnavailable = errors.New("docker engine unavailable") ) // getLogDir returns the standard log directory for the current OS. @@ -153,38 +159,52 @@ func main() { // Create tray application early so icon appears shutdownFunc := func() { - defer close(shutdownComplete) // Signal when shutdown is done - - logger.Info("Tray shutdown requested") - - // Send shutdown event to state machine - stateMachine.SendEvent(state.EventShutdown) - - // Shutdown launcher (stops SSE, health monitor, kills core) - if launcher != nil { - launcher.handleShutdown() - } + firstCaller := false + shutdownOnce.Do(func() { + firstCaller = true + + go func() { + defer func() { + if r := recover(); r != nil { + logger.Error("Recovered from panic during shutdown", zap.Any("panic", r)) + } + close(shutdownComplete) + }() + + logger.Info("Tray shutdown requested") + + // Notify the state machine so it stops launching or reconnecting cores. + stateMachine.SendEvent(state.EventShutdown) + + // Cancel the shared context early so tray background workers stop emitting logs. + logger.Info("Cancelling tray context") + cancel() + + // Shutdown launcher (stops SSE, health monitor, kills core) + if launcher != nil { + launcher.handleShutdown() + } - // Shutdown state machine - stateMachine.Shutdown() + // Shutdown state machine (waits up to its internal timeout) + stateMachine.Shutdown() - // CRITICAL: Cancel context LAST, after all cleanup - // This prevents the tray.Run() goroutine from quitting prematurely - logger.Info("Cancelling context after cleanup complete") - cancel() + // Give tray.Run() goroutine a moment to notice cancellation before requesting explicit quit. 
+ time.Sleep(50 * time.Millisecond) - // Give tray.Run() goroutine a moment to see cancellation - time.Sleep(100 * time.Millisecond) + // Finally, request the tray UI to quit (safe even if already quitting) + if trayApp != nil { + logger.Info("Quitting system tray") + trayApp.Quit() + } - // Finally, quit the tray UI - logger.Info("Quitting system tray") - trayApp.Quit() + logger.Info("Shutdown sequence finished") + }() + }) - // CRITICAL: Explicitly exit the process after cleanup - // Without this, lingering goroutines may prevent process termination - logger.Info("Shutdown complete - exiting process") - time.Sleep(50 * time.Millisecond) // Brief delay to ensure log is written - os.Exit(0) + if !firstCaller { + // Wait until the first shutdown completes so callers don't proceed early. + <-shutdownComplete + } } trayApp = tray.NewWithAPIClient(api.NewServerAdapter(apiClient), apiClient, logger.Sugar(), version, shutdownFunc) @@ -605,11 +625,11 @@ func findMcpproxyBinary() (string, error) { } } - // 2. Working-directory relative binary (local dev workflow). - addCandidate(filepath.Join(".", "mcpproxy")) - if runtime.GOOS == platformWindows { - addCandidate(filepath.Join(".", "mcpproxy-windows-amd64")) - } + // 2. Working-directory relative binary (local dev workflow). + addCandidate(filepath.Join(".", "mcpproxy")) + if runtime.GOOS == platformWindows { + addCandidate(filepath.Join(".", "mcpproxy-windows-amd64")) + } // 3. Managed installation directories (Application Support on macOS). if homeDir, err := os.UserHomeDir(); err == nil { @@ -619,7 +639,7 @@ func findMcpproxyBinary() (string, error) { } } - // 4. Common package manager locations. + // 4. Common package manager locations. addCandidate("/opt/homebrew/bin/mcpproxy") addCandidate("/usr/local/bin/mcpproxy") @@ -876,6 +896,10 @@ type CoreProcessLauncher struct { processMonitor *monitor.ProcessMonitor healthMonitor *monitor.HealthMonitor + + dockerRetryMu sync.Mutex + dockerRetryCancel context.CancelFunc + dockerReconnectPending bool } // NewCoreProcessLauncher creates a new core process launcher @@ -952,6 +976,12 @@ func (cpl *CoreProcessLauncher) handleStateTransitions(ctx context.Context, tran case state.StateCoreErrorDBLocked: cpl.handleDBLockedError() + case state.StateCoreErrorDocker: + cpl.handleDockerUnavailable(ctx) + + case state.StateCoreRecoveringDocker: + cpl.handleDockerRecovering(ctx) + case state.StateCoreErrorConfig: cpl.handleConfigError() @@ -989,6 +1019,10 @@ func (cpl *CoreProcessLauncher) updateTrayConnectionState(machineState state.Sta trayState = tray.ConnectionStateErrorPortConflict case state.StateCoreErrorDBLocked: trayState = tray.ConnectionStateErrorDBLocked + case state.StateCoreErrorDocker: + trayState = tray.ConnectionStateErrorDocker + case state.StateCoreRecoveringDocker: + trayState = tray.ConnectionStateRecoveringDocker case state.StateCoreErrorConfig: trayState = tray.ConnectionStateErrorConfig case state.StateCoreErrorGeneral: @@ -1055,9 +1089,22 @@ func (cpl *CoreProcessLauncher) safeHandleReconnecting(ctx context.Context) { } // handleLaunchCore handles launching the core process -func (cpl *CoreProcessLauncher) handleLaunchCore(_ context.Context) { +func (cpl *CoreProcessLauncher) handleLaunchCore(ctx context.Context) { cpl.logger.Info("Launching mcpproxy core process") + // Stop any pending Docker retry loop before attempting a new launch + cpl.cancelDockerRetry() + + // Ensure Docker engine is available before launching core (most upstreams depend on it) + if runtime.GOOS == platformDarwin || 
runtime.GOOS == platformWindows { + if err := cpl.ensureDockerAvailable(ctx); err != nil { + cpl.logger.Error("Docker engine unavailable", zap.Error(err)) + cpl.stateMachine.SetError(err) + cpl.stateMachine.SendEvent(state.EventDockerUnavailable) + return + } + } + // Stop existing process monitor if running if cpl.processMonitor != nil { cpl.processMonitor.Shutdown() @@ -1242,6 +1289,10 @@ func (cpl *CoreProcessLauncher) monitorAPIConnection(ctx context.Context, alread // handleConnected handles the connected state func (cpl *CoreProcessLauncher) handleConnected() { cpl.logger.Info("Core process fully connected and operational") + + if cpl.consumeDockerReconnectPending() { + go cpl.triggerForceReconnect("docker_recovered") + } } // handleReconnecting handles reconnection attempts @@ -1252,63 +1303,254 @@ func (cpl *CoreProcessLauncher) handleReconnecting(_ context.Context) { // handlePortConflictError handles port conflict errors func (cpl *CoreProcessLauncher) handlePortConflictError() { - cpl.logger.Warn("Core failed due to port conflict") - // Attempt automatic port resolution on Windows/macOS - // 1) Parse current coreURL and extract port - u, err := url.Parse(cpl.coreURL) - if err != nil { - cpl.logger.Error("Failed to parse coreURL for port conflict handling", "core_url", cpl.coreURL, "error", err) - return - } - portStr := u.Port() - if portStr == "" { - portStr = "8080" - } - baseHost := u.Hostname() - if baseHost == "" { - baseHost = "127.0.0.1" - } - // 2) Find next available port - startPort, _ := strconv.Atoi(portStr) - newPort, err := findNextAvailablePort(startPort + 1, startPort+50) - if err != nil { - cpl.logger.Error("Failed to find available port after conflict", "start_port", startPort, "error", err) - return - } - // 3) Update coreURL and restart flow - u.Host = net.JoinHostPort(baseHost, strconv.Itoa(newPort)) - cpl.coreURL = u.String() - cpl.logger.Info("Auto-selected alternate port after conflict", "new_core_url", cpl.coreURL) - - // Stop monitors so they can be recreated with new URL - if cpl.healthMonitor != nil { - cpl.healthMonitor.Stop() - cpl.healthMonitor = nil - } - if cpl.processMonitor != nil { - cpl.processMonitor.Shutdown() - cpl.processMonitor = nil - } - // Trigger retry which will launch core with updated args based on coreURL - cpl.stateMachine.SendEvent(state.EventRetry) + cpl.logger.Warn("Core failed due to port conflict") + // Attempt automatic port resolution on Windows/macOS + // 1) Parse current coreURL and extract port + u, err := url.Parse(cpl.coreURL) + if err != nil { + cpl.logger.Error("Failed to parse coreURL for port conflict handling", "core_url", cpl.coreURL, "error", err) + return + } + portStr := u.Port() + if portStr == "" { + portStr = "8080" + } + baseHost := u.Hostname() + if baseHost == "" { + baseHost = "127.0.0.1" + } + // 2) Find next available port + startPort, _ := strconv.Atoi(portStr) + newPort, err := findNextAvailablePort(startPort+1, startPort+50) + if err != nil { + cpl.logger.Error("Failed to find available port after conflict", "start_port", startPort, "error", err) + return + } + // 3) Update coreURL and restart flow + u.Host = net.JoinHostPort(baseHost, strconv.Itoa(newPort)) + cpl.coreURL = u.String() + cpl.logger.Info("Auto-selected alternate port after conflict", "new_core_url", cpl.coreURL) + + // Stop monitors so they can be recreated with new URL + if cpl.healthMonitor != nil { + cpl.healthMonitor.Stop() + cpl.healthMonitor = nil + } + if cpl.processMonitor != nil { + cpl.processMonitor.Shutdown() + 
cpl.processMonitor = nil + } + // Trigger retry which will launch core with updated args based on coreURL + cpl.stateMachine.SendEvent(state.EventRetry) } // findNextAvailablePort scans a range and returns the first free port on localhost func findNextAvailablePort(start, end int) (int, error) { - if start < 1 { - start = 1 - } - if end <= start { - end = start + 50 - } - for p := start; p <= end; p++ { - ln, err := net.Listen("tcp", net.JoinHostPort("127.0.0.1", strconv.Itoa(p))) - if err == nil { - _ = ln.Close() - return p, nil - } - } - return 0, fmt.Errorf("no free port in range %d-%d", start, end) + if start < 1 { + start = 1 + } + if end <= start { + end = start + 50 + } + for p := start; p <= end; p++ { + ln, err := net.Listen("tcp", net.JoinHostPort("127.0.0.1", strconv.Itoa(p))) + if err == nil { + _ = ln.Close() + return p, nil + } + } + return 0, fmt.Errorf("no free port in range %d-%d", start, end) +} + +// handleDockerRecovering handles the Docker recovery state +// This state provides user feedback while we're restarting the core after Docker recovery +func (cpl *CoreProcessLauncher) handleDockerRecovering(ctx context.Context) { + cpl.logger.Info("Docker recovery state - preparing to launch core") + + // Small delay to ensure Docker is fully stable + time.Sleep(500 * time.Millisecond) + + // Transition to launching core + cpl.stateMachine.SendEvent(state.EventRetry) +} + +// handleDockerUnavailable handles scenarios where Docker Desktop is paused or unavailable. +// Uses exponential backoff to avoid wasting resources while waiting for Docker recovery. +func (cpl *CoreProcessLauncher) handleDockerUnavailable(ctx context.Context) { + lastErr := cpl.stateMachine.GetLastError() + if lastErr != nil { + cpl.logger.Warn("Docker engine unavailable - waiting for recovery", zap.Error(lastErr)) + } else { + cpl.logger.Warn("Docker engine unavailable - waiting for recovery") + } + + cpl.dockerRetryMu.Lock() + if cpl.dockerRetryCancel != nil { + cpl.dockerRetryCancel() + } + retryCtx, cancel := context.WithCancel(ctx) + cpl.dockerRetryCancel = cancel + cpl.dockerRetryMu.Unlock() + + go func() { + // Exponential backoff intervals: fast when Docker just paused, slower when off for longer + intervals := []time.Duration{ + 2 * time.Second, // Immediate retry (Docker just paused) + 5 * time.Second, // Quick retry + 10 * time.Second, // Normal retry + 30 * time.Second, // Slow retry + 60 * time.Second, // Very slow retry (max backoff) + } + + attempt := 0 + startTime := time.Now() + + for { + // Calculate current interval (stay at max after reaching it) + currentInterval := intervals[min(attempt, len(intervals)-1)] + + select { + case <-retryCtx.Done(): + return + case <-time.After(currentInterval): + if err := cpl.ensureDockerAvailable(retryCtx); err == nil { + elapsed := time.Since(startTime) + cpl.logger.Info("Docker engine available - transitioning to recovery state", + zap.Int("attempts", attempt+1), + zap.Duration("total_wait", elapsed)) + cpl.setDockerReconnectPending(true) + cpl.cancelDockerRetry() + // Transition to recovering state instead of directly retrying + cpl.stateMachine.SendEvent(state.EventDockerRecovered) + return + } else if err != nil { + cpl.logger.Debug("Docker still unavailable", + zap.Int("attempt", attempt+1), + zap.Duration("next_check_in", intervals[min(attempt+1, len(intervals)-1)]), + zap.Error(err)) + } + attempt++ + } + } + }() +} + +// min returns the minimum of two integers +func min(a, b int) int { + if a < b { + return a + } + return b +} + +// 
cancelDockerRetry stops any pending Docker retry loop. +func (cpl *CoreProcessLauncher) cancelDockerRetry() { + cpl.dockerRetryMu.Lock() + if cpl.dockerRetryCancel != nil { + cpl.dockerRetryCancel() + cpl.dockerRetryCancel = nil + } + cpl.dockerRetryMu.Unlock() +} + +func (cpl *CoreProcessLauncher) setDockerReconnectPending(pending bool) { + cpl.dockerRetryMu.Lock() + cpl.dockerReconnectPending = pending + cpl.dockerRetryMu.Unlock() +} + +func (cpl *CoreProcessLauncher) consumeDockerReconnectPending() bool { + cpl.dockerRetryMu.Lock() + pending := cpl.dockerReconnectPending + if pending { + cpl.dockerReconnectPending = false + } + cpl.dockerRetryMu.Unlock() + return pending +} + +// ensureDockerAvailable verifies Docker Desktop is running and responsive. +func (cpl *CoreProcessLauncher) ensureDockerAvailable(ctx context.Context) error { + checkCtx := ctx + if checkCtx == nil { + checkCtx = context.Background() + } + timeoutCtx, cancel := context.WithTimeout(checkCtx, 3*time.Second) + defer cancel() + + cmd := exec.CommandContext(timeoutCtx, "docker", "info", "--format", "{{json .ServerVersion}}") + var stderr bytes.Buffer + cmd.Stdout = &bytes.Buffer{} + cmd.Stderr = &stderr + + if err := cmd.Run(); err != nil { + stderrStr := strings.TrimSpace(stderr.String()) + lower := strings.ToLower(stderrStr) + + switch { + case strings.Contains(lower, "docker desktop is manually paused"): + if stderrStr == "" { + stderrStr = "Docker Desktop is manually paused" + } + return fmt.Errorf("%w: %s", errDockerPaused, stderrStr) + case strings.Contains(lower, "is the docker daemon running"), + strings.Contains(lower, "cannot connect to the docker daemon"), + strings.Contains(lower, "error during connect"), + strings.Contains(lower, "connectex"), + errors.Is(err, context.DeadlineExceeded): + if stderrStr == "" { + stderrStr = "Docker daemon is not responding" + } + return fmt.Errorf("%w: %s", errDockerUnavailable, stderrStr) + } + + var execErr *exec.Error + if errors.As(err, &execErr) { + return fmt.Errorf("%w: %v", errDockerUnavailable, execErr) + } + + if exitErr, ok := err.(*exec.ExitError); ok { + if stderrStr == "" { + stderrStr = exitErr.Error() + } + return fmt.Errorf("%w: %s", errDockerUnavailable, stderrStr) + } + + if stderrStr != "" { + return fmt.Errorf("%w: %s", errDockerUnavailable, stderrStr) + } + + return fmt.Errorf("%w: %v", errDockerUnavailable, err) + } + + return nil +} + +func (cpl *CoreProcessLauncher) triggerForceReconnect(reason string) { + if cpl.apiClient == nil { + return + } + + const maxAttempts = 3 + for attempt := 1; attempt <= maxAttempts; attempt++ { + if err := cpl.apiClient.ForceReconnectAllServers(reason); err != nil { + cpl.logger.Warn("Failed to trigger upstream reconnection after Docker recovery", + zap.String("reason", reason), + zap.Int("attempt", attempt), + zap.Error(err)) + time.Sleep(2 * time.Second) + continue + } + + cpl.logger.Info("Triggered upstream reconnection after Docker recovery", + zap.String("reason", reason), + zap.Int("attempt", attempt)) + return + } + + cpl.logger.Error("Exhausted attempts to trigger upstream reconnection after Docker recovery", + zap.String("reason", reason), + zap.Int("attempts", maxAttempts)) } // handleDBLockedError handles database locked errors @@ -1359,6 +1601,9 @@ func (cpl *CoreProcessLauncher) handleShutdown() { cpl.trayApp.SetConnectionState(tray.ConnectionStateDisconnected) } + // Stop any Docker retry loop + cpl.cancelDockerRetry() + // Stop SSE connection before killing core // This prevents SSE from detecting 
disconnection and trying to reconnect cpl.logger.Info("Stopping SSE connection") diff --git a/cmd/mcpproxy/main.go b/cmd/mcpproxy/main.go index bfa07740..420837e6 100644 --- a/cmd/mcpproxy/main.go +++ b/cmd/mcpproxy/main.go @@ -453,8 +453,10 @@ func runServer(cmd *cobra.Command, _ []string) error { // Wait for context to be cancelled <-ctx.Done() logger.Info("Shutting down server") - if err := srv.StopServer(); err != nil { - logger.Error("Error stopping server", zap.Error(err)) + // Use Shutdown() instead of StopServer() to ensure proper container cleanup + // Shutdown() calls runtime.Close() which triggers ShutdownAll() for Docker cleanup + if err := srv.Shutdown(); err != nil { + logger.Error("Error shutting down server", zap.Error(err)) } return nil diff --git a/docs/docker-recovery-improvements.md b/docs/docker-recovery-improvements.md new file mode 100644 index 00000000..fba20d01 --- /dev/null +++ b/docs/docker-recovery-improvements.md @@ -0,0 +1,1134 @@ +# Docker Resume Recovery - Critical Analysis & Improvement Recommendations + +## Executive Summary + +The current Docker resume recovery implementation (feature/docker-recovery branch) provides a solid foundation for detecting and recovering from Docker Desktop pause/resume events. However, critical analysis reveals **11 significant gaps and improvement opportunities** that could enhance reliability, user experience, and system observability. + +**🚨 CRITICAL:** Issue #11 (Duplicate Container Spawning) is the **most dangerous** problem - the system can spawn multiple containers for the same server during recovery, leading to resource exhaustion, port conflicts, and orphaned containers. + +**Severity Levels:** +- πŸ”΄ **Critical** - Can cause data loss, incorrect behavior, or poor user experience +- 🟑 **Important** - Impacts reliability or performance +- 🟒 **Nice-to-have** - Improves observability or maintainability + +--- + +## Current Implementation Review + +### βœ… What Works Well + +1. **Pre-launch Docker health probe** - Prevents repeated startup failures +2. **Polling-based recovery** - Detects when Docker becomes available again +3. **Explicit error state** - UX clearly shows Docker unavailability +4. **Force reconnect API** - Clean separation between tray and runtime +5. **Container cleanup timeout increase** - 30s prevents race conditions +6. **Safe config cloning** - Avoids mutating shared state + +### ❌ Critical Gaps Identified + +--- + +## πŸ”΄ Critical Issue #1: Reconnects ALL Servers Instead of Docker-Only + +**Problem:** +`ForceReconnectAll()` reconnects **every disconnected server**, regardless of whether they use Docker isolation. + +**Evidence:** +```go +// internal/upstream/manager.go:890-897 +for id, client := range clientMap { + if client.IsConnected() { + continue // Skip connected + } + // ❌ No check for IsDockerCommand() - reconnects HTTP, SSE, stdio servers too! + cfg := cloneServerConfig(client.GetConfig()) + // ... 
recreate client +} +``` + +**Impact:** +- Wastes resources reconnecting HTTP/SSE/stdio servers that weren't affected +- Unnecessary downtime for unaffected servers +- Confusing logs showing reconnections for non-Docker servers + +**Recommended Fix:** +```go +for id, client := range clientMap { + if client.IsConnected() { + continue + } + + // βœ… ADD: Filter for Docker-based servers only + if !client.IsDockerCommand() { + m.logger.Debug("Skipping force reconnect for non-Docker server", + zap.String("server", id), + zap.String("reason", reason)) + continue + } + + // Only reconnect Docker-isolated servers + cfg := cloneServerConfig(client.GetConfig()) + // ... +} +``` + +**Files to modify:** +- `internal/upstream/manager.go:890-897` (add Docker filter) + +--- + +## πŸ”΄ Critical Issue #2: No Container Health Verification + +**Problem:** +When Docker is paused, existing container sockets remain open but processes inside are frozen. When Docker resumes: +1. Tray detects Docker is available +2. Calls `ForceReconnectAll()` +3. Manager skips servers where `IsConnected() == true` +4. **But those containers are dead/paused!** + +**Evidence:** +```go +// internal/upstream/manager.go:895-897 +if client.IsConnected() { + continue // ❌ Connection alive β‰  container healthy! +} +``` + +**Impact:** +- Servers appear "connected" but are non-functional +- Tool calls timeout or hang indefinitely +- Users must manually restart servers + +**Recommended Fix:** + +Add container health verification before skipping reconnection: + +```go +if client.IsConnected() { + // βœ… For Docker servers, verify container is actually healthy + if client.IsDockerCommand() { + if !m.verifyDockerContainerHealthy(client) { + m.logger.Warn("Docker container unhealthy despite active connection", + zap.String("server", id), + zap.String("container_id", client.GetContainerID())) + // Force reconnect even though connection appears active + } else { + continue // Container is healthy, skip + } + } else { + continue // Non-Docker server, connection is sufficient + } +} +``` + +Add helper method: +```go +func (m *Manager) verifyDockerContainerHealthy(client *managed.Client) bool { + containerID := client.GetContainerID() + if containerID == "" { + return false + } + + // Quick health check: docker ps --filter id= --format {{.Status}} + ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + cmd := exec.CommandContext(ctx, "docker", "ps", + "--filter", fmt.Sprintf("id=%s", containerID), + "--format", "{{.Status}}") + + output, err := cmd.Output() + if err != nil || len(output) == 0 { + return false // Container not running + } + + status := strings.ToLower(string(output)) + return strings.Contains(status, "up") || strings.Contains(status, "running") +} +``` + +**Files to modify:** +- `internal/upstream/manager.go` (add health check logic) +- `internal/upstream/managed/client.go` (add `GetContainerID()` accessor if missing) + +--- + +## 🟑 Important Issue #3: Fixed 5-Second Polling is Suboptimal + +**Problem:** +Docker health polling uses a fixed 5-second interval: +- **Too frequent** when Docker Desktop is off (wastes CPU/battery on laptops) +- **Too slow** when Docker just resumed (users wait 5s before recovery starts) + +**Evidence:** +```go +// cmd/mcpproxy-tray/main.go:1378 +ticker := time.NewTicker(5 * time.Second) // ❌ Fixed interval +``` + +**Recommended Fix:** + +Implement **exponential backoff with jitter**: + +```go +func (cpl *CoreProcessLauncher) handleDockerUnavailable(ctx 
context.Context) { + // Initial fast retry (Docker just paused), then slow down + intervals := []time.Duration{ + 2 * time.Second, // Immediate retry + 5 * time.Second, // Quick retry + 10 * time.Second, // Normal retry + 30 * time.Second, // Slow retry + 60 * time.Second, // Very slow retry + } + + attempt := 0 + for { + select { + case <-retryCtx.Done(): + return + case <-time.After(intervals[min(attempt, len(intervals)-1)]): + if err := cpl.ensureDockerAvailable(retryCtx); err == nil { + cpl.logger.Info("Docker available after recovery", + zap.Int("attempts", attempt+1), + zap.Duration("total_wait", time.Since(startTime))) + // Recovery logic... + return + } + attempt++ + } + } +} +``` + +**Benefits:** +- Faster recovery when Docker quickly resumes (2s vs 5s) +- Lower CPU usage when Docker is off for extended periods +- Better battery life on laptops + +--- + +## πŸ”΄ Critical Issue #4: No "Recovering" State in Tray UI + +**Problem:** +Tray shows `error_docker` state, but doesn't differentiate between: +1. "Docker is currently unavailable" (user needs to resume Docker) +2. "Docker just came back, reconnecting servers..." (recovery in progress) + +**Impact:** +- Users don't know if recovery is happening +- Appears broken even when working correctly +- No visibility into recovery progress + +**Recommended Fix:** + +**Step 1:** Add new state in `cmd/mcpproxy-tray/internal/state/states.go`: +```go +const ( + // ... existing states ... + StateCoreErrorDocker State = "core_error_docker" + + // βœ… ADD: New state for Docker recovery in progress + StateCoreRecoveringDocker State = "core_recovering_docker" +) +``` + +**Step 2:** Add corresponding tray connection state in `internal/tray/connection_state.go`: +```go +const ( + // ... existing states ... + ConnectionStateErrorDocker ConnectionState = "error_docker" + + // βœ… ADD: New state for recovery + ConnectionStateRecoveringDocker ConnectionState = "recovering_docker" +) +``` + +**Step 3:** Update state mapping in `cmd/mcpproxy-tray/main.go:1019`: +```go +case state.StateCoreRecoveringDocker: + trayState = tray.ConnectionStateRecoveringDocker +``` + +**Step 4:** Transition to recovering state when Docker comes back: +```go +// cmd/mcpproxy-tray/main.go:1385-1390 +if err := cpl.ensureDockerAvailable(retryCtx); err == nil { + cpl.logger.Info("Docker engine available - starting recovery") + cpl.setDockerReconnectPending(true) + cpl.cancelDockerRetry() + + // βœ… Transition to recovering state (not retry) + cpl.stateMachine.SendEvent(state.EventDockerRecovered) + return +} +``` + +**Step 5:** Update tray menu to show recovery status: +```go +// internal/tray/menu.go +case ConnectionStateRecoveringDocker: + return "πŸ”„ Recovering from Docker outage..." 
+``` + +**Files to modify:** +- `cmd/mcpproxy-tray/internal/state/states.go` (add new state + event) +- `internal/tray/connection_state.go` (add ConnectionStateRecoveringDocker) +- `cmd/mcpproxy-tray/main.go` (add state mapping + event handling) +- `internal/tray/menu.go` (add menu text for recovering state) + +--- + +## 🟑 Important Issue #5: No Observability/Metrics + +**Problem:** +No tracking of: +- How often Docker outages occur +- How long recovery takes +- Success/failure rate of reconnections +- Which servers failed to reconnect + +**Recommended Fix:** + +Add metrics struct: + +```go +// cmd/mcpproxy-tray/main.go +type DockerRecoveryMetrics struct { + OutageCount int + LastOutageTime time.Time + LastRecoveryTime time.Time + RecoveryDuration time.Duration + SuccessfulRecoveries int + FailedRecoveries int +} + +func (cpl *CoreProcessLauncher) recordDockerOutage() { + cpl.dockerMetrics.OutageCount++ + cpl.dockerMetrics.LastOutageTime = time.Now() + cpl.logger.Info("Docker outage recorded", + zap.Int("total_outages", cpl.dockerMetrics.OutageCount)) +} + +func (cpl *CoreProcessLauncher) recordDockerRecovery(success bool, duration time.Duration) { + cpl.dockerMetrics.LastRecoveryTime = time.Now() + cpl.dockerMetrics.RecoveryDuration = duration + if success { + cpl.dockerMetrics.SuccessfulRecoveries++ + } else { + cpl.dockerMetrics.FailedRecoveries++ + } + + cpl.logger.Info("Docker recovery completed", + zap.Bool("success", success), + zap.Duration("duration", duration), + zap.Int("success_count", cpl.dockerMetrics.SuccessfulRecoveries), + zap.Int("failure_count", cpl.dockerMetrics.FailedRecoveries)) +} +``` + +--- + +## 🟑 Important Issue #6: Force Reconnect Retries Too Aggressively + +**Problem:** +Force reconnect API call has only 3 attempts with 2s delays: + +```go +// cmd/mcpproxy-tray/main.go:1488-1495 +const maxAttempts = 3 +for attempt := 1; attempt <= maxAttempts; attempt++ { + if err := cpl.apiClient.ForceReconnectAllServers(reason); err != nil { + time.Sleep(2 * time.Second) // ❌ Linear backoff + continue + } +} +``` + +**Issues:** +- Linear 2s delay is arbitrary +- Only 3 attempts may not be enough if core is still starting upstream connections +- No jitter (thundering herd if multiple servers) + +**Recommended Fix:** + +Use exponential backoff with jitter: + +```go +func (cpl *CoreProcessLauncher) triggerForceReconnect(reason string) { + if cpl.apiClient == nil { + return + } + + backoff := []time.Duration{ + 1 * time.Second, // Fast first retry + 3 * time.Second, // Medium retry + 5 * time.Second, // Slow retry + 10 * time.Second, // Very slow retry + } + + for attempt := 0; attempt < len(backoff); attempt++ { + if err := cpl.apiClient.ForceReconnectAllServers(reason); err != nil { + cpl.logger.Warn("Failed to trigger upstream reconnection", + zap.String("reason", reason), + zap.Int("attempt", attempt+1), + zap.Error(err)) + + if attempt < len(backoff)-1 { + // Add jitter Β±20% + jitter := time.Duration(float64(backoff[attempt]) * 0.2 * (rand.Float64()*2 - 1)) + time.Sleep(backoff[attempt] + jitter) + } + continue + } + + cpl.logger.Info("Triggered upstream reconnection successfully", + zap.String("reason", reason), + zap.Int("attempt", attempt+1)) + return + } +} +``` + +--- + +## 🟒 Nice-to-have Issue #7: Better Error Message Differentiation + +**Problem:** +`ensureDockerAvailable()` distinguishes between "paused" and "unavailable" but state machine doesn't preserve this distinction. 
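+
+For reference, both failure modes currently collapse into the same state machine event; the snippet below is condensed from `handleLaunchCore` in the tray changes above (current behaviour, not a proposed fix):
+
+```go
+// ensureDockerAvailable returns errors wrapping either errDockerPaused or
+// errDockerUnavailable, but both are reported with the same event, so
+// downstream states cannot tell "paused" apart from "daemon not running".
+if err := cpl.ensureDockerAvailable(ctx); err != nil {
+	cpl.logger.Error("Docker engine unavailable", zap.Error(err))
+	cpl.stateMachine.SetError(err)
+	cpl.stateMachine.SendEvent(state.EventDockerUnavailable)
+	return
+}
+```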
+ +**Recommended Fix:** + +Add two separate Docker error states: +- `StateCoreErrorDockerPaused` - Docker Desktop manually paused +- `StateCoreErrorDockerDown` - Docker daemon not running + +Update tray UI accordingly: +- Paused: "⏸️ Docker Desktop is paused - click Resume in Docker menu" +- Down: "⬇️ Docker Desktop is not running - start Docker Desktop" + +--- + +## 🟒 Nice-to-have Issue #8: Configurable Timeouts + +**Problem:** +Hardcoded timeouts may not suit all systems: +- 30s Docker cleanup timeout (line 22 in docs) +- 3s Docker info check (main.go:1431) +- 60s tool indexing interval (lifecycle.go:84) + +**Recommended Fix:** + +Add configuration options: +```json +{ + "docker_recovery": { + "health_check_timeout": "3s", + "cleanup_timeout": "30s", + "polling_intervals": [2, 5, 10, 30, 60], + "max_reconnect_attempts": 4 + } +} +``` + +--- + +## 🟑 Important Issue #9: No Partial Failure Handling + +**Problem:** +If `ForceReconnectAll()` fails for some servers but succeeds for others, there's no granular status reporting. + +**Recommended Fix:** + +Return structured result from `ForceReconnectAll()`: + +```go +type ReconnectResult struct { + TotalServers int + AttemptedServers int + SuccessfulServers []string + FailedServers map[string]error +} + +func (m *Manager) ForceReconnectAll(reason string) *ReconnectResult { + result := &ReconnectResult{ + SuccessfulServers: []string{}, + FailedServers: make(map[string]error), + } + + // ... reconnection logic ... + + for id, client := range clientMap { + result.TotalServers++ + + if !shouldReconnect(client) { + continue + } + + result.AttemptedServers++ + + if err := reconnectClient(id, client); err != nil { + result.FailedServers[id] = err + } else { + result.SuccessfulServers = append(result.SuccessfulServers, id) + } + } + + return result +} +``` + +--- + +## 🟒 Nice-to-have Issue #10: No User Notification + +**Problem:** +When recovery completes, users don't get explicit feedback that servers are operational again. + +**Recommended Fix:** + +Add system notification (macOS/Windows): +```go +// After successful recovery +notification := &tray.Notification{ + Title: "Docker Recovery Complete", + Message: fmt.Sprintf("%d servers reconnected successfully", len(successfulServers)), + Icon: tray.IconSuccess, +} +cpl.trayApp.ShowNotification(notification) +``` + +--- + +## πŸ”΄ Critical Issue #11: Duplicate Container Spawning + +**Problem:** +The system can spawn **duplicate containers** for the same server when the supervisor fails to detect container liveness. This is the **most dangerous issue** as it leads to: +- Resource exhaustion (multiple containers consuming memory/CPU) +- Port binding conflicts +- Data corruption if containers share state +- Confused system state (which container is "current"?) +- Orphaned containers that never get cleaned up + +**Root Causes:** + +1. **Race Condition During ForceReconnectAll()** + ```go + // internal/upstream/manager.go:890-903 + for id, client := range clientMap { + if client.IsConnected() { + continue // ❌ No container health check! + } + + // ❌ Multiple goroutines can enter here simultaneously + cfg := cloneServerConfig(client.GetConfig()) + // ... recreate client (spawns new container) + } + ``` + + **Scenario:** + - Docker paused, 3 servers disconnected + - `ForceReconnectAll()` called + - All 3 clients reconnect in parallel goroutines + - If same client gets reconnected twice β†’ **2 containers spawned** + +2. 
**Lost Container ID = Orphaned Containers** + ```go + // internal/upstream/core/docker.go:20-67 + for attempt := 0; attempt < 100; attempt++ { // Wait up to 10 seconds + cidBytes, err := os.ReadFile(cidFile) + if err == nil { + c.containerID = containerID // βœ… Tracked + return + } + } + // ❌ If cidfile read fails β†’ c.containerID stays empty β†’ orphan! + ``` + + **Scenario:** + - Container spawned with slow image pull (takes 15s) + - cidfile read timeout after 10s + - `c.containerID` remains empty + - Next reconnect spawns NEW container + - Old container never cleaned up β†’ **orphan** + +3. **Random Container Names Don't Prevent Duplicates** + ```go + // internal/upstream/core/isolation.go:403-408 + func generateContainerName(serverName string) string { + sanitized := sanitizeServerNameForContainer(serverName) + // Generate 4-character random suffix + return fmt.Sprintf("mcpproxy-%s-%s", sanitized, randomSuffix) + } + ``` + + **Result:** + - Server "github" β†’ `mcpproxy-github-a1b2` + - Reconnect β†’ `mcpproxy-github-c3d4` + - **Both containers exist simultaneously!** + +4. **No Pre-Creation Check** + ```go + // internal/upstream/core/connection.go:260-328 + // ❌ Directly spawns container, no check for existing containers! + dockerRunArgs, err := c.isolationManager.BuildDockerArgs(c.config, runtimeType) + ``` + + **Missing:** + - `docker ps --filter name=mcpproxy-github-*` check + - Cleanup of stale containers before creating new one + +5. **Supervisor Liveness Detection Failures** + + **Scenario A: Slow Container Startup** + - Container starts but takes 30s to respond (image pull + init) + - Supervisor checks health after 20s β†’ timeout + - Supervisor thinks container dead β†’ spawns new one + - **Result: 2 containers, one still starting** + + **Scenario B: Network Partition** + - Docker API unreachable due to network issue + - Supervisor can't verify container health + - Assumes container dead β†’ spawns new one + - Network recovers β†’ **2 containers running** + + **Scenario C: Transient Docker API Errors** + - Docker API returns 500 Internal Server Error + - Supervisor retries immediately + - Each retry spawns new container + - **Result: Multiple containers** + +**Evidence from Codebase:** + +```go +// internal/upstream/core/docker.go:274-360 +func (c *Client) killDockerContainersByNamePatternWithContext(ctx context.Context) bool { + namePattern := "mcpproxy-" + sanitized + "-" + + // This finds ALL containers matching pattern + listCmd := exec.CommandContext(ctx, "docker", "ps", "-a", + "--filter", "name="+namePattern, "--format", "{{.ID}}\t{{.Names}}") + + // ⚠️ The fact that this returns MULTIPLE containers proves duplicates can exist! + for _, containerID := range containersToKill { + // Kill each one... + } +} +``` + +The cleanup logic **already handles multiple containers** because it knows duplicates happen! 
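+
+To make the duplicate scenario easy to observe during manual testing, here is a small read-only diagnostic sketch. It assumes the `mcpproxy-<server>-<suffix>` naming convention shown above; the `github` server name is illustrative, and the program only lists containers without changing anything:
+
+```go
+// duplicate_check.go - diagnostic sketch only (not part of the proposed fix).
+// Lists containers whose names match the mcpproxy-<server>- prefix and warns
+// when more than one exists for the same server.
+package main
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+	"strings"
+)
+
+func main() {
+	server := "github" // illustrative server name - adjust to the server under test
+	pattern := "mcpproxy-" + server + "-"
+
+	out, err := exec.Command("docker", "ps", "-a",
+		"--filter", "name="+pattern,
+		"--format", "{{.ID}}\t{{.Names}}\t{{.Status}}").Output()
+	if err != nil {
+		fmt.Fprintln(os.Stderr, "docker ps failed:", err)
+		os.Exit(1)
+	}
+
+	trimmed := strings.TrimSpace(string(out))
+	if trimmed == "" {
+		fmt.Println("no containers found for", pattern)
+		return
+	}
+
+	lines := strings.Split(trimmed, "\n")
+	for _, l := range lines {
+		fmt.Println(l)
+	}
+	if len(lines) > 1 {
+		fmt.Printf("WARNING: %d containers match %q - duplicates are present\n", len(lines), pattern)
+	}
+}
+```
+
+Running it before and after a pause/resume cycle makes it obvious whether a forced reconnect spawned a second container for the same server.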
+ +--- + +### **Comprehensive Solution: 5-Layer Defense** + +#### **Layer 1: Idempotent Container Creation** πŸ”΄ Critical + +**Before creating any container, clean up ALL existing containers for that server:** + +```go +// internal/upstream/core/connection.go (add before BuildDockerArgs) + +func (c *Client) ensureNoExistingContainers(ctx context.Context) error { + sanitized := sanitizeServerNameForContainer(c.config.Name) + namePattern := "mcpproxy-" + sanitized + "-" + + c.logger.Info("Checking for existing containers before creation", + zap.String("server", c.config.Name), + zap.String("name_pattern", namePattern)) + + // Find ALL containers matching our server (running or stopped) + listCmd := exec.CommandContext(ctx, "docker", "ps", "-a", + "--filter", "name="+namePattern, + "--format", "{{.ID}}\t{{.Names}}\t{{.Status}}") + + output, err := listCmd.Output() + if err != nil { + return fmt.Errorf("failed to list existing containers: %w", err) + } + + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + if len(lines) == 0 || lines[0] == "" { + c.logger.Debug("No existing containers found - safe to create new one", + zap.String("server", c.config.Name)) + return nil + } + + // Found existing containers - clean them up first + c.logger.Warn("Found existing containers - cleaning up before creating new one", + zap.String("server", c.config.Name), + zap.Int("container_count", len(lines))) + + for _, line := range lines { + if line == "" { + continue + } + parts := strings.SplitN(line, "\t", 3) + if len(parts) >= 2 { + containerID := parts[0] + containerName := parts[1] + status := "" + if len(parts) >= 3 { + status = parts[2] + } + + c.logger.Info("Removing existing container", + zap.String("server", c.config.Name), + zap.String("container_id", containerID), + zap.String("container_name", containerName), + zap.String("status", status)) + + // Force remove (works for running and stopped containers) + rmCmd := exec.CommandContext(ctx, "docker", "rm", "-f", containerID) + if err := rmCmd.Run(); err != nil { + c.logger.Error("Failed to remove existing container", + zap.String("container_id", containerID), + zap.Error(err)) + // Continue anyway - try to remove others + } else { + c.logger.Info("Successfully removed existing container", + zap.String("container_id", containerID)) + } + } + } + + return nil +} +``` + +**Usage:** +```go +// internal/upstream/core/connection.go:260 (before BuildDockerArgs) +if willUseDocker { + // βœ… CRITICAL: Clean up any existing containers first + if err := c.ensureNoExistingContainers(connectCtx); err != nil { + c.logger.Error("Failed to ensure no existing containers", + zap.String("server", c.config.Name), + zap.Error(err)) + // Continue anyway - we'll try to create + } + + // Now safe to create new container + dockerRunArgs, err := c.isolationManager.BuildDockerArgs(c.config, runtimeType) + // ... 
+} +``` + +**Benefits:** +- βœ… Idempotent: Can call multiple times safely +- βœ… Prevents duplicates at creation time +- βœ… Cleans up orphaned containers automatically +- βœ… Works even if containerID was lost + +--- + +#### **Layer 2: Container Labels for Ownership Tracking** 🟑 Important + +**Label all containers with mcpproxy instance ID and server name:** + +```go +// internal/upstream/core/isolation.go:262 (add after --name) + +// Add labels for ownership tracking and cleanup +instanceID := getInstanceID() // Global instance UUID +labels := []string{ + "--label", "com.mcpproxy.managed=true", + "--label", "com.mcpproxy.instance=" + instanceID, + "--label", "com.mcpproxy.server=" + serverConfig.Name, + "--label", "com.mcpproxy.created=" + time.Now().UTC().Format(time.RFC3339), +} +args = append(args, labels...) +``` + +**Add instance ID tracking:** +```go +// internal/upstream/core/instance.go (new file) + +var ( + instanceID string + instanceIDOnce sync.Once +) + +func getInstanceID() string { + instanceIDOnce.Do(func() { + // Try to load from file first + if id, err := loadInstanceID(); err == nil { + instanceID = id + return + } + + // Generate new instance ID + instanceID = generateUUID() + saveInstanceID(instanceID) + }) + return instanceID +} + +func loadInstanceID() (string, error) { + data, err := os.ReadFile(filepath.Join(os.TempDir(), "mcpproxy-instance-id")) + if err != nil { + return "", err + } + return strings.TrimSpace(string(data)), nil +} + +func saveInstanceID(id string) { + os.WriteFile(filepath.Join(os.TempDir(), "mcpproxy-instance-id"), []byte(id), 0644) +} +``` + +**Improved cleanup using labels:** +```go +func (c *Client) cleanupOwnedContainers(ctx context.Context) error { + instanceID := getInstanceID() + serverName := c.config.Name + + // Find containers owned by THIS instance for THIS server + listCmd := exec.CommandContext(ctx, "docker", "ps", "-a", + "--filter", fmt.Sprintf("label=com.mcpproxy.instance=%s", instanceID), + "--filter", fmt.Sprintf("label=com.mcpproxy.server=%s", serverName), + "--format", "{{.ID}}\t{{.Label \"com.mcpproxy.created\"}}") + + // ... cleanup logic +} +``` + +**Benefits:** +- βœ… Accurate ownership tracking +- βœ… Can clean up containers from crashed instances +- βœ… Prevents conflicts with other mcpproxy instances +- βœ… Enables better debugging (docker ps --filter label=com.mcpproxy.managed=true) + +--- + +#### **Layer 3: Distributed Lock for Container Creation** 🟑 Important + +**Prevent race conditions during concurrent reconnection attempts:** + +```go +// internal/upstream/core/container_lock.go (new file) + +type ContainerLock struct { + locks sync.Map // serverName -> *sync.Mutex +} + +func (cl *ContainerLock) Lock(serverName string) *sync.Mutex { + mutex, _ := cl.locks.LoadOrStore(serverName, &sync.Mutex{}) + m := mutex.(*sync.Mutex) + m.Lock() + return m +} + +var globalContainerLock = &ContainerLock{} + +// Usage in connection.go: +func (c *Client) connectStdio(connectCtx context.Context) error { + if willUseDocker { + // βœ… Acquire lock for this server + lock := globalContainerLock.Lock(c.config.Name) + defer lock.Unlock() + + // Now only ONE goroutine can create container for this server + if err := c.ensureNoExistingContainers(connectCtx); err != nil { + // ... + } + + dockerRunArgs, err := c.isolationManager.BuildDockerArgs(c.config, runtimeType) + // ... 
+ } +} +``` + +**Benefits:** +- βœ… Prevents concurrent container creation for same server +- βœ… Eliminates race condition in ForceReconnectAll() +- βœ… Simple implementation using sync.Mutex + +--- + +#### **Layer 4: Enhanced Container Health Verification** πŸ”΄ Critical + +**Add comprehensive health check before skipping reconnection:** + +```go +// internal/upstream/manager.go (enhance ForceReconnectAll) + +func (m *Manager) verifyContainerHealthy(client *managed.Client) (bool, error) { + containerID := client.GetContainerID() + if containerID == "" { + return false, fmt.Errorf("no container ID") + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // Check 1: Container exists and is running + inspectCmd := exec.CommandContext(ctx, "docker", "inspect", + "--format", "{{.State.Running}},{{.State.Status}},{{.State.Health.Status}}", + containerID) + + output, err := inspectCmd.Output() + if err != nil { + return false, fmt.Errorf("container not found: %w", err) + } + + parts := strings.Split(strings.TrimSpace(string(output)), ",") + if len(parts) < 2 { + return false, fmt.Errorf("unexpected inspect output") + } + + running := parts[0] == "true" + status := parts[1] + + if !running { + return false, fmt.Errorf("container not running (status: %s)", status) + } + + // Check 2: Container is responsive (simple ping) + // This catches cases where Docker is paused but sockets remain open + pingCtx, pingCancel := context.WithTimeout(ctx, 2*time.Second) + defer pingCancel() + + if err := client.Ping(pingCtx); err != nil { + return false, fmt.Errorf("container not responsive: %w", err) + } + + return true, nil +} + +// Update ForceReconnectAll: +func (m *Manager) ForceReconnectAll(reason string) { + for id, client := range clientMap { + if client.IsConnected() { + // βœ… For Docker servers, verify container health + if client.IsDockerCommand() { + healthy, err := m.verifyContainerHealthy(client) + if !healthy { + m.logger.Warn("Container unhealthy despite active connection", + zap.String("server", id), + zap.Error(err)) + // Force reconnect + } else { + continue // Container healthy, skip + } + } else { + continue // Non-Docker, connection check sufficient + } + } + + // Reconnect logic... 
+ } +} +``` + +**Benefits:** +- βœ… Detects frozen containers (Docker paused) +- βœ… Verifies actual responsiveness, not just connection state +- βœ… Prevents skipping reconnection for dead containers + +--- + +#### **Layer 5: Graceful Degradation on cidfile Timeout** 🟑 Important + +**If cidfile read fails, use container name to track container ID:** + +```go +// internal/upstream/core/docker.go:67 (after cidfile timeout) + +c.logger.Warn("Failed to read container ID from cidfile, attempting recovery via container name", + zap.String("server", c.config.Name), + zap.String("container_name", c.containerName)) + +// βœ… Fallback: Find container by name +if c.containerName != "" { + listCmd := exec.CommandContext(ctx, "docker", "ps", + "--filter", fmt.Sprintf("name=^%s$", c.containerName), + "--format", "{{.ID}}") + + if output, err := listCmd.Output(); err == nil { + foundID := strings.TrimSpace(string(output)) + if foundID != "" { + c.mu.Lock() + c.containerID = foundID + c.mu.Unlock() + + c.logger.Info("Successfully recovered container ID via name lookup", + zap.String("server", c.config.Name), + zap.String("container_id", foundID), + zap.String("container_name", c.containerName)) + return + } + } +} + +c.logger.Error("Failed to recover container ID - container will be orphaned on disconnect", + zap.String("server", c.config.Name), + zap.String("container_name", c.containerName)) +``` + +**Benefits:** +- βœ… Prevents orphaned containers +- βœ… Graceful fallback when cidfile fails +- βœ… Uses container name as secondary tracking mechanism + +--- + +### **Impact Assessment** + +**Without these fixes:** +- πŸ”΄ **High probability** of duplicate containers during Docker recovery +- πŸ”΄ **Resource exhaustion** on servers with frequent Docker issues +- πŸ”΄ **Port conflicts** if containers bind to specific ports +- πŸ”΄ **Orphaned containers** accumulate over time + +**With these fixes:** +- βœ… **Idempotent creation** prevents duplicates +- βœ… **Container labels** enable reliable cleanup +- βœ… **Distributed locks** prevent race conditions +- βœ… **Health verification** catches stale containers +- βœ… **Graceful degradation** prevents orphans + +--- + +### **Testing Plan for Duplicate Prevention** + +1. **Concurrent Reconnection Test** + ```bash + # Start 5 Docker servers + # Pause Docker Desktop + # Trigger ForceReconnectAll() 3 times rapidly + # Resume Docker Desktop + # Expected: Each server has EXACTLY 1 container + # Actual before fix: Each server has 2-3 containers + ``` + +2. **cidfile Timeout Test** + ```bash + # Use very large Docker image (slow pull: 20s) + # Set cidfile timeout to 5s + # Trigger reconnection + # Expected: Container ID recovered via name lookup + # Verify: Container cleaned up on disconnect + ``` + +3. **Orphan Container Test** + ```bash + # Start server, kill mcpproxy (SIGKILL) + # Restart mcpproxy + # Expected: Old container cleaned up before new one created + # Verify: docker ps shows only 1 container per server + ``` + +4. **Network Partition Test** + ```bash + # Start server + # Block Docker API (iptables or firewall) + # Trigger health check + # Unblock Docker API + # Expected: Health check detects unreachable container + # Verify: New container created, old one removed + ``` + +5. 
**Label-based Cleanup Test** + ```bash + # Create containers with instance ID labels + # Crash mcpproxy (kill -9) + # Start new mcpproxy instance (different instance ID) + # Expected: Old containers cleaned up despite different instance ID + # Verify: No orphaned containers remain + ``` + +--- + +## Summary of Recommended Changes + +### High Priority (Critical & Important) + +| # | Issue | Files to Modify | Effort | Impact | +|---|-------|----------------|--------|--------| +| 1 | Filter Docker-only reconnections | `internal/upstream/manager.go` | Small | High | +| 2 | Add container health verification | `internal/upstream/manager.go` | Medium | High | +| 3 | Exponential backoff polling | `cmd/mcpproxy-tray/main.go` | Small | Medium | +| 4 | Add "Recovering" state | Multiple tray files | Medium | High | +| 6 | Better force reconnect retry logic | `cmd/mcpproxy-tray/main.go` | Small | Medium | +| 9 | Partial failure handling | `internal/upstream/manager.go` | Medium | Medium | +| **11** | **Duplicate container spawning** | **`internal/upstream/core/connection.go`, `manager.go`** | **Large** | **Critical** | + +### Medium Priority (Nice-to-have) + +| # | Issue | Files to Modify | Effort | Impact | +|---|-------|----------------|--------|--------| +| 5 | Add observability/metrics | `cmd/mcpproxy-tray/main.go` | Medium | Low | +| 7 | Better error differentiation | State machine files | Medium | Low | +| 8 | Configurable timeouts | Config + multiple files | Large | Low | +| 10 | User notifications | Tray app | Small | Low | + +--- + +## Testing Recommendations + +After implementing improvements, test the following scenarios: + +1. **Basic pause/resume** + - Pause Docker Desktop β†’ Tray shows error + - Resume Docker Desktop β†’ Tray shows "recovering" β†’ transitions to "connected" + - Verify only Docker servers reconnected (not HTTP/SSE servers) + +2. **Container health verification** + - Pause Docker while server is connected + - Resume Docker + - Verify stale container connections are detected and recreated + +3. **Exponential backoff** + - Pause Docker β†’ Verify polling starts at 2s, increases to 60s + - Resume Docker quickly β†’ Verify fast recovery (within 5s) + - Leave Docker off for 5 minutes β†’ Verify polling backs off to 60s intervals + +4. **Partial failures** + - Configure 3 Docker servers + 1 HTTP server + - Break 1 Docker server (bad image name) + - Pause/resume Docker + - Verify status shows 2/3 Docker servers reconnected, HTTP server unaffected + +5. **Metrics verification** + - Check logs for outage count, recovery duration, success/failure rates + - Verify metrics persist across tray restarts + +--- + +## Implementation Plan + +**Phase 1: Critical Fixes (Week 1)** - 17 hours +1. Issue #11: Duplicate container spawning (8 hours) + - Layer 1: Idempotent creation (3 hours) + - Layer 3: Distributed locks (2 hours) + - Layer 4: Container health verification (3 hours) +2. Issue #1: Docker-only filtering (2 hours) +3. Issue #2: Container health verification (4 hours) +4. Issue #4: Add "Recovering" state (3 hours) + +**Phase 2: Reliability Improvements (Week 2)** - 11 hours +1. Issue #11 (continued): Advanced container management (4 hours) + - Layer 2: Container labels (2 hours) + - Layer 5: cidfile timeout fallback (2 hours) +2. Issue #3: Exponential backoff (2 hours) +3. Issue #6: Better retry logic (2 hours) +4. Issue #9: Partial failure handling (3 hours) + +**Phase 3: Observability & Polish (Week 3)** - 8 hours +1. Issue #5: Metrics/observability (4 hours) +2. 
Issue #7: Error differentiation (2 hours)
+3. Issue #10: User notifications (2 hours)
+
+**Phase 4: Configuration (Optional)** - 4 hours
+1. Issue #8: Configurable timeouts (4 hours)
+
+**Total effort estimate: 40 hours** (up from 28-32 hours due to Issue #11)
+
+---
+
+## References
+
+- Original implementation: `docs/docker-resume-recovery.md`
+- Tray state machine: `cmd/mcpproxy-tray/internal/state/`
+- Upstream manager: `internal/upstream/manager.go`
+- Docker isolation: `internal/upstream/core/isolation.go`
diff --git a/docs/docker-resume-recovery.md b/docs/docker-resume-recovery.md
new file mode 100644
index 00000000..03c4ad36
--- /dev/null
+++ b/docs/docker-resume-recovery.md
@@ -0,0 +1,36 @@
+# Docker Resume Recovery Improvements
+
+This branch introduces a coordinated recovery path for the tray and runtime when Docker Desktop is paused and resumed.
+
+## Tray Enhancements
+- Added a pre-launch Docker health probe. The tray now waits until `docker info` succeeds before launching the core process, avoiding repeated failures when Docker is down.
+- Introduced a polling loop (5s cadence) that detects when Docker becomes available again. Once recovered, the tray sets a pending reconnect flag and continues startup.
+- Added a new `error_docker` connection state with explicit UX messaging so the tray menu and tooltip explain that Docker Desktop is unavailable.
+- When the core transitions back to `Connected` and a Docker recovery was detected, the tray calls a new HTTP API endpoint (`POST /api/v1/servers/reconnect`) to trigger fast upstream reconnection.
+
+## HTTP API and Runtime Changes
+- Added `ForceReconnectAllServers(reason string)` to the runtime and exposed it through the HTTP controller to support the new `/servers/reconnect` route.
+- The runtime delegates to `upstream.Manager.ForceReconnectAll`, which now rebuilds any disconnected, enabled managed client. The manager clones each server configuration, removes the old client, waits briefly for cleanup, and recreates the client so container state is refreshed.
+- Existing, already connected HTTP/SSE/stdio clients are left untouched, so only affected Docker-backed servers restart.
+
+## Upstream Manager & Client Updates
+- Added `Client.ForceReconnect` so the manager can bypass exponential backoff when forced.
+- Implemented safe configuration cloning in the manager to avoid mutating shared configuration when recreating clients.
+- Ensured managed clients skip force reconnect if they are already connected or currently connecting.
+
+## Docker Cleanup Reliability
+- Increased the timeout for container shutdown/kill operations from 5-10 seconds to a shared 30-second budget. This prevents the manager from launching a new container while the previous cleanup is still in progress.
+
+## API Client Support
+- The tray's API adapter gained `ForceReconnectAllServers`, which works over both TCP and Unix socket transports, ensuring the new recovery path functions regardless of how the tray connects to the core.
+
+## Testing
+- `go test ./internal/upstream/...`
+- `go test ./internal/httpapi`
+- `go test ./internal/runtime/...`
+- `go test ./cmd/mcpproxy-tray/...` *(fails on this machine due to linker disk-space error; code compiles otherwise)*
+
+## Expected Behaviour
+- When Docker Desktop is paused, the tray surfaces an explicit error state instead of hanging.
+- After Docker resumes, the tray immediately triggers upstream reconnection; Docker-backed servers begin launching within a couple of seconds.
+- HTTP-only or stdio servers continue operating throughout the outage and are not needlessly restarted. diff --git a/internal/httpapi/contracts_test.go b/internal/httpapi/contracts_test.go index 19ea98c2..9705eedc 100644 --- a/internal/httpapi/contracts_test.go +++ b/internal/httpapi/contracts_test.go @@ -75,7 +75,10 @@ func (m *MockServerController) GetAllServers() ([]map[string]interface{}, error) } func (m *MockServerController) EnableServer(_ string, _ bool) error { return nil } -func (m *MockServerController) RestartServer(_ string) error { return nil } +func (m *MockServerController) RestartServer(_ string) error { return nil } +func (m *MockServerController) ForceReconnectAllServers(_ string) error { + return nil +} func (m *MockServerController) QuarantineServer(_ string, _ bool) error { return nil } diff --git a/internal/httpapi/server.go b/internal/httpapi/server.go index c37cb5e9..319623fc 100644 --- a/internal/httpapi/server.go +++ b/internal/httpapi/server.go @@ -43,6 +43,7 @@ type ServerController interface { GetAllServers() ([]map[string]interface{}, error) EnableServer(serverName string, enabled bool) error RestartServer(serverName string) error + ForceReconnectAllServers(reason string) error QuarantineServer(serverName string, quarantined bool) error GetQuarantinedServers() ([]map[string]interface{}, error) UnquarantineServer(serverName string) error @@ -278,6 +279,7 @@ func (s *Server) setupRoutes() { // Server management r.Get("/servers", s.handleGetServers) + r.Post("/servers/reconnect", s.handleForceReconnectServers) r.Route("/servers/{id}", func(r chi.Router) { r.Post("/enable", s.handleEnableServer) r.Post("/disable", s.handleDisableServer) @@ -573,6 +575,26 @@ func (s *Server) handleDisableServer(w http.ResponseWriter, r *http.Request) { s.writeSuccess(w, response) } +func (s *Server) handleForceReconnectServers(w http.ResponseWriter, r *http.Request) { + reason := r.URL.Query().Get("reason") + + if err := s.controller.ForceReconnectAllServers(reason); err != nil { + s.logger.Error("Failed to trigger force reconnect for servers", + "reason", reason, + "error", err) + s.writeError(w, http.StatusInternalServerError, fmt.Sprintf("Failed to reconnect servers: %v", err)) + return + } + + response := contracts.ServerActionResponse{ + Server: "*", + Action: "reconnect_all", + Success: true, + } + + s.writeSuccess(w, response) +} + func (s *Server) handleRestartServer(w http.ResponseWriter, r *http.Request) { serverID := chi.URLParam(r, "id") if serverID == "" { diff --git a/internal/runtime/lifecycle.go b/internal/runtime/lifecycle.go index 325a0c64..ac6241a6 100644 --- a/internal/runtime/lifecycle.go +++ b/internal/runtime/lifecycle.go @@ -751,6 +751,31 @@ func (r *Runtime) RestartServer(serverName string) error { return nil } +// ForceReconnectAllServers triggers reconnection attempts for all managed servers. 
+func (r *Runtime) ForceReconnectAllServers(reason string) error { + if r.upstreamManager == nil { + return fmt.Errorf("upstream manager not initialized") + } + + if r.logger != nil { + r.logger.Info("Force reconnect requested for all upstream servers", + zap.String("reason", reason)) + } + + result := r.upstreamManager.ForceReconnectAll(reason) + + if r.logger != nil { + r.logger.Info("Force reconnect completed", + zap.Int("total_servers", result.TotalServers), + zap.Int("attempted", result.AttemptedServers), + zap.Int("successful", len(result.SuccessfulServers)), + zap.Int("failed", len(result.FailedServers)), + zap.Int("skipped", len(result.SkippedServers))) + } + + return nil +} + // HandleUpstreamServerChange should be called when upstream servers change. func (r *Runtime) HandleUpstreamServerChange(ctx context.Context) { if ctx == nil { diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go index 7c2a4ade..54e349f8 100644 --- a/internal/runtime/runtime.go +++ b/internal/runtime/runtime.go @@ -452,10 +452,15 @@ func (r *Runtime) Close() error { } if r.upstreamManager != nil { - if err := r.upstreamManager.DisconnectAll(); err != nil { - errs = append(errs, fmt.Errorf("disconnect upstream servers: %w", err)) + // Use ShutdownAll instead of DisconnectAll to ensure proper container cleanup + // ShutdownAll handles both graceful disconnection and Docker container cleanup + shutdownCtx, shutdownCancel := context.WithTimeout(context.Background(), 15*time.Second) + defer shutdownCancel() + + if err := r.upstreamManager.ShutdownAll(shutdownCtx); err != nil { + errs = append(errs, fmt.Errorf("shutdown upstream servers: %w", err)) if r.logger != nil { - r.logger.Error("Failed to disconnect upstream servers", zap.Error(err)) + r.logger.Error("Failed to shutdown upstream servers", zap.Error(err)) } } } diff --git a/internal/server/server.go b/internal/server/server.go index 54420879..d6cce552 100644 --- a/internal/server/server.go +++ b/internal/server/server.go @@ -709,6 +709,13 @@ func (s *Server) RestartServer(serverName string) error { return s.runtime.RestartServer(serverName) } +// ForceReconnectAllServers triggers reconnection attempts for all managed servers. 
+func (s *Server) ForceReconnectAllServers(reason string) error { + s.logger.Info("HTTP API requested force reconnect for all upstream servers", + zap.String("reason", reason)) + return s.runtime.ForceReconnectAllServers(reason) +} + // QuarantineServer quarantines/unquarantines a server func (s *Server) QuarantineServer(serverName string, quarantined bool) error { return s.runtime.QuarantineServer(serverName, quarantined) diff --git a/internal/tray/connection_state.go b/internal/tray/connection_state.go index 0ca9026e..1d01c5a6 100644 --- a/internal/tray/connection_state.go +++ b/internal/tray/connection_state.go @@ -6,15 +6,17 @@ package tray type ConnectionState string const ( - ConnectionStateInitializing ConnectionState = "initializing" - ConnectionStateStartingCore ConnectionState = "starting_core" - ConnectionStateConnecting ConnectionState = "connecting" - ConnectionStateConnected ConnectionState = "connected" - ConnectionStateReconnecting ConnectionState = "reconnecting" - ConnectionStateDisconnected ConnectionState = "disconnected" - ConnectionStateAuthError ConnectionState = "auth_error" + ConnectionStateInitializing ConnectionState = "initializing" + ConnectionStateStartingCore ConnectionState = "starting_core" + ConnectionStateConnecting ConnectionState = "connecting" + ConnectionStateConnected ConnectionState = "connected" + ConnectionStateReconnecting ConnectionState = "reconnecting" + ConnectionStateDisconnected ConnectionState = "disconnected" + ConnectionStateAuthError ConnectionState = "auth_error" ConnectionStateErrorPortConflict ConnectionState = "error_port_conflict" // ADD: Specific error states ConnectionStateErrorDBLocked ConnectionState = "error_db_locked" + ConnectionStateErrorDocker ConnectionState = "error_docker" + ConnectionStateRecoveringDocker ConnectionState = "recovering_docker" // Docker recovery in progress ConnectionStateErrorConfig ConnectionState = "error_config" ConnectionStateErrorGeneral ConnectionState = "error_general" ConnectionStateFailed ConnectionState = "failed" diff --git a/internal/tray/connection_state_stub.go b/internal/tray/connection_state_stub.go index db6b4fee..b6b05f4a 100644 --- a/internal/tray/connection_state_stub.go +++ b/internal/tray/connection_state_stub.go @@ -6,15 +6,17 @@ package tray type ConnectionState string const ( - ConnectionStateInitializing ConnectionState = "initializing" - ConnectionStateStartingCore ConnectionState = "starting_core" - ConnectionStateConnecting ConnectionState = "connecting" - ConnectionStateConnected ConnectionState = "connected" - ConnectionStateReconnecting ConnectionState = "reconnecting" - ConnectionStateDisconnected ConnectionState = "disconnected" - ConnectionStateAuthError ConnectionState = "auth_error" + ConnectionStateInitializing ConnectionState = "initializing" + ConnectionStateStartingCore ConnectionState = "starting_core" + ConnectionStateConnecting ConnectionState = "connecting" + ConnectionStateConnected ConnectionState = "connected" + ConnectionStateReconnecting ConnectionState = "reconnecting" + ConnectionStateDisconnected ConnectionState = "disconnected" + ConnectionStateAuthError ConnectionState = "auth_error" ConnectionStateErrorPortConflict ConnectionState = "error_port_conflict" // ADD: Specific error states ConnectionStateErrorDBLocked ConnectionState = "error_db_locked" + ConnectionStateErrorDocker ConnectionState = "error_docker" + ConnectionStateRecoveringDocker ConnectionState = "recovering_docker" // Docker recovery in progress ConnectionStateErrorConfig 
ConnectionState = "error_config" ConnectionStateErrorGeneral ConnectionState = "error_general" ConnectionStateFailed ConnectionState = "failed" diff --git a/internal/tray/tray.go b/internal/tray/tray.go index 0ec4bdf3..1f70c89b 100644 --- a/internal/tray/tray.go +++ b/internal/tray/tray.go @@ -269,6 +269,9 @@ func (a *App) applyConnectionStateToUI(state ConnectionState) { case ConnectionStateErrorDBLocked: statusText = "Status: Database locked" tooltip = "Database locked by another mcpproxy instance. Kill other instance with: pkill mcpproxy" + case ConnectionStateErrorDocker: + statusText = "Status: Docker unavailable" + tooltip = "Docker Desktop is paused or unavailable. Resume Docker and retry." case ConnectionStateErrorConfig: statusText = "Status: Configuration error" tooltip = "Invalid configuration file. Fix ~/.mcpproxy/mcp_config.json and restart." diff --git a/internal/upstream/core/client.go b/internal/upstream/core/client.go index d5595f0c..91b00604 100644 --- a/internal/upstream/core/client.go +++ b/internal/upstream/core/client.go @@ -447,6 +447,13 @@ func (c *Client) GetServerInfo() *mcp.InitializeResult { return c.serverInfo } +// GetContainerID returns the Docker container ID if this is a Docker-based server +func (c *Client) GetContainerID() string { + c.mu.RLock() + defer c.mu.RUnlock() + return c.containerID +} + // GetTransportType returns the transport type being used func (c *Client) GetTransportType() string { return c.transportType diff --git a/internal/upstream/core/connection.go b/internal/upstream/core/connection.go index 873b414d..f27f37bf 100644 --- a/internal/upstream/core/connection.go +++ b/internal/upstream/core/connection.go @@ -25,6 +25,8 @@ const ( osDarwin = "darwin" osWindows = "windows" + dockerCleanupTimeout = 30 * time.Second + // Transport types transportHTTP = "http" transportHTTPStreamable = "streamable-http" @@ -135,7 +137,7 @@ func (c *Client) Connect(ctx context.Context) error { zap.String("container_id", c.containerID), zap.Error(err)) - cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 10*time.Second) + cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), dockerCleanupTimeout) defer cleanupCancel() // Try to cleanup using container name first, then ID, then pattern matching @@ -267,11 +269,25 @@ func (c *Client) connectStdio(ctx context.Context) error { } if willUseDocker { + // CRITICAL: Acquire per-server lock to prevent concurrent container creation + // This prevents race conditions when multiple goroutines try to reconnect the same server + lock := globalContainerLock.Lock(c.config.Name) + defer lock.Unlock() + c.logger.Debug("Docker command detected, setting up container ID tracking", zap.String("server", c.config.Name), zap.String("command", c.config.Command), zap.Strings("original_args", args)) + // CRITICAL: Clean up any existing containers first to prevent duplicates + // This makes container creation idempotent and safe to call multiple times + if err := c.ensureNoExistingContainers(ctx); err != nil { + c.logger.Error("Failed to ensure no existing containers", + zap.String("server", c.config.Name), + zap.Error(err)) + // Continue anyway - we'll try to create the container + } + // Create temp file for container ID tmpFile, err := os.CreateTemp("", "mcpproxy-cid-*.txt") if err == nil { @@ -382,7 +398,7 @@ func (c *Client) connectStdio(ctx context.Context) error { zap.String("container_id", c.containerID), zap.Error(err)) - cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 
10*time.Second) + cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), dockerCleanupTimeout) defer cleanupCancel() // Try to cleanup using container name first, then ID, then pattern matching @@ -1534,7 +1550,7 @@ func (c *Client) DisconnectWithContext(_ context.Context) error { // Create a fresh context for Docker cleanup with its own timeout // This ensures cleanup can complete even if the main context expires - cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), 5*time.Second) + cleanupCtx, cleanupCancel := context.WithTimeout(context.Background(), dockerCleanupTimeout) defer cleanupCancel() if c.containerID != "" { diff --git a/internal/upstream/core/container_lock.go b/internal/upstream/core/container_lock.go new file mode 100644 index 00000000..21846265 --- /dev/null +++ b/internal/upstream/core/container_lock.go @@ -0,0 +1,24 @@ +package core + +import ( + "sync" +) + +// ContainerLock provides per-server locking for container creation +// This prevents race conditions where multiple goroutines try to create +// containers for the same server simultaneously (e.g., during ForceReconnectAll) +type ContainerLock struct { + locks sync.Map // serverName -> *sync.Mutex +} + +// Lock acquires a lock for the specified server name +// Returns a mutex that the caller MUST unlock when done +func (cl *ContainerLock) Lock(serverName string) *sync.Mutex { + mutex, _ := cl.locks.LoadOrStore(serverName, &sync.Mutex{}) + m := mutex.(*sync.Mutex) + m.Lock() + return m +} + +// globalContainerLock is the global instance used by all clients +var globalContainerLock = &ContainerLock{} diff --git a/internal/upstream/core/docker.go b/internal/upstream/core/docker.go index 9a515715..ff1f1518 100644 --- a/internal/upstream/core/docker.go +++ b/internal/upstream/core/docker.go @@ -2,6 +2,7 @@ package core import ( "context" + "fmt" "os" "os/exec" "strings" @@ -61,9 +62,53 @@ func (c *Client) readContainerIDWithContext(ctx context.Context, cidFile string) } } - c.logger.Warn("Failed to read Docker container ID from cidfile after 10 seconds", + c.logger.Warn("Failed to read container ID from cidfile, attempting recovery via container name", zap.String("server", c.config.Name), - zap.String("cid_file", cidFile)) + zap.String("cid_file", cidFile), + zap.String("container_name", c.containerName)) + + if c.upstreamLogger != nil { + c.upstreamLogger.Warn("cidfile read timeout - attempting name lookup recovery") + } + + // Fallback: Find container by name + if c.containerName != "" { + listCmd := exec.CommandContext(ctx, "docker", "ps", + "--filter", fmt.Sprintf("name=^%s$", c.containerName), + "--format", "{{.ID}}") + + if output, err := listCmd.Output(); err == nil { + foundID := strings.TrimSpace(string(output)) + if foundID != "" { + c.mu.Lock() + c.containerID = foundID + c.mu.Unlock() + + c.logger.Info("Successfully recovered container ID via name lookup", + zap.String("server", c.config.Name), + zap.String("container_id", foundID[:12]), + zap.String("full_container_id", foundID), + zap.String("container_name", c.containerName)) + + if c.upstreamLogger != nil { + c.upstreamLogger.Info("Container ID recovered via name lookup", + zap.String("container_id", foundID)) + } + + // Clean up the cidfile since we got the ID + os.Remove(cidFile) + return + } + } + } + + c.logger.Error("Failed to recover container ID - container will be orphaned on disconnect", + zap.String("server", c.config.Name), + zap.String("container_name", c.containerName)) + + if c.upstreamLogger != nil { + 
c.upstreamLogger.Error("Failed to recover container ID - may be orphaned") + } } // killDockerContainerWithContext kills the Docker container if one is running with context timeout @@ -422,6 +467,90 @@ func (c *Client) killDockerContainerByNameWithContext(ctx context.Context, conta return true } +// ensureNoExistingContainers removes all existing containers for this server before creating a new one +// This makes container creation idempotent and prevents duplicate container spawning +func (c *Client) ensureNoExistingContainers(ctx context.Context) error { + sanitized := sanitizeServerNameForContainer(c.config.Name) + namePattern := "mcpproxy-" + sanitized + "-" + + c.logger.Info("Checking for existing containers before creation", + zap.String("server", c.config.Name), + zap.String("name_pattern", namePattern)) + + // Find ALL containers matching our server (running or stopped) + listCmd := exec.CommandContext(ctx, "docker", "ps", "-a", + "--filter", "name="+namePattern, + "--format", "{{.ID}}\t{{.Names}}\t{{.Status}}") + + output, err := listCmd.Output() + if err != nil { + return fmt.Errorf("failed to list existing containers: %w", err) + } + + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + if len(lines) == 0 || lines[0] == "" { + c.logger.Debug("No existing containers found - safe to create new one", + zap.String("server", c.config.Name)) + return nil + } + + // Found existing containers - clean them up first + c.logger.Warn("Found existing containers - cleaning up before creating new one", + zap.String("server", c.config.Name), + zap.Int("container_count", len(lines))) + + if c.upstreamLogger != nil { + c.upstreamLogger.Warn("Cleaning up existing containers before creating new one", + zap.Int("container_count", len(lines))) + } + + for _, line := range lines { + if line == "" { + continue + } + parts := strings.SplitN(line, "\t", 3) + if len(parts) >= 2 { + containerID := parts[0] + containerName := parts[1] + status := "" + if len(parts) >= 3 { + status = parts[2] + } + + c.logger.Info("Removing existing container", + zap.String("server", c.config.Name), + zap.String("container_id", containerID), + zap.String("container_name", containerName), + zap.String("status", status)) + + if c.upstreamLogger != nil { + c.upstreamLogger.Info("Removing existing container", + zap.String("container_id", containerID), + zap.String("container_name", containerName)) + } + + // Force remove (works for running and stopped containers) + rmCmd := exec.CommandContext(ctx, "docker", "rm", "-f", containerID) + if err := rmCmd.Run(); err != nil { + c.logger.Error("Failed to remove existing container", + zap.String("container_id", containerID), + zap.Error(err)) + // Continue anyway - try to remove others + } else { + c.logger.Info("Successfully removed existing container", + zap.String("container_id", containerID)) + + if c.upstreamLogger != nil { + c.upstreamLogger.Info("Successfully removed existing container", + zap.String("container_id", containerID)) + } + } + } + } + + return nil +} + // checkDockerContainerHealth checks if Docker containers are still running func (c *Client) checkDockerContainerHealth() { // For Docker commands, we can check if containers are still running diff --git a/internal/upstream/core/instance.go b/internal/upstream/core/instance.go new file mode 100644 index 00000000..c20b92fd --- /dev/null +++ b/internal/upstream/core/instance.go @@ -0,0 +1,60 @@ +package core + +import ( + "fmt" + "os" + "path/filepath" + "strings" + "sync" + + "github.com/google/uuid" +) + 
+var ( + instanceID string + instanceIDOnce sync.Once +) + +// getInstanceID returns a unique identifier for this mcpproxy instance +// The ID is persisted across restarts and used to label Docker containers +func getInstanceID() string { + instanceIDOnce.Do(func() { + // Try to load from file first + if id, err := loadInstanceID(); err == nil && id != "" { + instanceID = id + return + } + + // Generate new instance ID + instanceID = uuid.New().String() + _ = saveInstanceID(instanceID) // Best effort save + }) + return instanceID +} + +// loadInstanceID attempts to load the instance ID from disk +func loadInstanceID() (string, error) { + instanceFile := filepath.Join(os.TempDir(), "mcpproxy-instance-id") + data, err := os.ReadFile(instanceFile) + if err != nil { + return "", err + } + return strings.TrimSpace(string(data)), nil +} + +// saveInstanceID persists the instance ID to disk +func saveInstanceID(id string) error { + instanceFile := filepath.Join(os.TempDir(), "mcpproxy-instance-id") + return os.WriteFile(instanceFile, []byte(id), 0644) +} + +// formatContainerLabels returns Docker labels for container ownership tracking +func formatContainerLabels(serverName string) []string { + instanceID := getInstanceID() + return []string{ + "--label", "com.mcpproxy.managed=true", + "--label", fmt.Sprintf("com.mcpproxy.instance=%s", instanceID), + "--label", fmt.Sprintf("com.mcpproxy.server=%s", serverName), + "--label", fmt.Sprintf("com.mcpproxy.created=%d", os.Getpid()), + } +} diff --git a/internal/upstream/core/isolation.go b/internal/upstream/core/isolation.go index 8de27ccb..e5b2bfe8 100644 --- a/internal/upstream/core/isolation.go +++ b/internal/upstream/core/isolation.go @@ -263,6 +263,10 @@ func (im *IsolationManager) BuildDockerArgs(serverConfig *config.ServerConfig, r containerName := generateContainerName(serverConfig.Name) args = append(args, "--name", containerName) + // Add labels for ownership tracking and cleanup + labels := formatContainerLabels(serverConfig.Name) + args = append(args, labels...) + // Add log driver only if explicitly configured logDriver := "" if serverConfig.Isolation != nil && serverConfig.Isolation.LogDriver != "" { diff --git a/internal/upstream/managed/client.go b/internal/upstream/managed/client.go index 8a91f34a..0d14271f 100644 --- a/internal/upstream/managed/client.go +++ b/internal/upstream/managed/client.go @@ -597,6 +597,39 @@ func (mc *Client) performHealthCheck() { zap.String("server", mc.Config.Name)) } +// ForceReconnect triggers an immediate reconnection attempt regardless of backoff state. 
+func (mc *Client) ForceReconnect(reason string) { + if mc == nil { + return + } + + serverName := "" + if mc.Config != nil { + serverName = mc.Config.Name + } + + if mc.IsConnected() { + mc.logger.Debug("Force reconnect skipped - client already connected", + zap.String("server", serverName), + zap.String("reason", reason)) + return + } + + if mc.IsConnecting() { + mc.logger.Debug("Force reconnect skipped - client currently connecting", + zap.String("server", serverName), + zap.String("reason", reason)) + return + } + + mc.logger.Info("Force reconnect requested", + zap.String("server", serverName), + zap.String("reason", reason), + zap.String("state", mc.StateManager.GetState().String())) + + go mc.tryReconnect() +} + // tryReconnect attempts to reconnect the client with proper error handling func (mc *Client) tryReconnect() { // CRITICAL FIX: Prevent concurrent reconnection attempts to avoid duplicate containers @@ -900,6 +933,14 @@ func (mc *Client) IsDockerCommand() bool { return mc.isDockerServer() } +// GetContainerID returns the Docker container ID if this is a Docker-based server +func (mc *Client) GetContainerID() string { + if mc.coreClient == nil { + return "" + } + return mc.coreClient.GetContainerID() +} + // setToolCountCache records the latest tool count and timestamp for non-blocking consumers. func (mc *Client) setToolCountCache(count int) { mc.toolCountMu.Lock() diff --git a/internal/upstream/manager.go b/internal/upstream/manager.go index cd0192ac..2362bd2b 100644 --- a/internal/upstream/manager.go +++ b/internal/upstream/manager.go @@ -3,6 +3,9 @@ package upstream import ( "context" "fmt" + "maps" + "os/exec" + "slices" "strings" "sync" "time" @@ -40,6 +43,44 @@ type Manager struct { shutdownCancel context.CancelFunc } +func cloneServerConfig(cfg *config.ServerConfig) *config.ServerConfig { + if cfg == nil { + return nil + } + + clone := *cfg + + if cfg.Args != nil { + clone.Args = slices.Clone(cfg.Args) + } + + if cfg.Env != nil { + clone.Env = maps.Clone(cfg.Env) + } + + if cfg.Headers != nil { + clone.Headers = maps.Clone(cfg.Headers) + } + + if cfg.OAuth != nil { + o := *cfg.OAuth + if cfg.OAuth.Scopes != nil { + o.Scopes = slices.Clone(cfg.OAuth.Scopes) + } + clone.OAuth = &o + } + + if cfg.Isolation != nil { + iso := *cfg.Isolation + if cfg.Isolation.ExtraArgs != nil { + iso.ExtraArgs = slices.Clone(cfg.Isolation.ExtraArgs) + } + clone.Isolation = &iso + } + + return &clone +} + // NewManager creates a new upstream manager func NewManager(logger *zap.Logger, globalConfig *config.Config, storage *storage.BoltDB, secretResolver *secret.Resolver) *Manager { shutdownCtx, shutdownCancel := context.WithCancel(context.Background()) @@ -272,6 +313,177 @@ func (m *Manager) RemoveServer(id string) { } } +// ShutdownAll disconnects all clients and ensures all Docker containers are stopped +// This should be called during application shutdown to ensure clean exit +func (m *Manager) ShutdownAll(ctx context.Context) error { + m.logger.Info("Shutting down all upstream servers") + + m.mu.RLock() + clientMap := make(map[string]*managed.Client, len(m.clients)) + for id, client := range m.clients { + clientMap[id] = client + } + m.mu.RUnlock() + + if len(clientMap) == 0 { + m.logger.Debug("No upstream servers to shutdown") + return nil + } + + m.logger.Info("Disconnecting all upstream servers (in parallel)", + zap.Int("count", len(clientMap))) + + // Disconnect all clients in parallel using goroutines + // This ensures shutdown is fast even with many servers + var wg sync.WaitGroup + 
for id, client := range clientMap { + if client == nil { + continue + } + + wg.Add(1) + go func(clientID string, c *managed.Client) { + defer wg.Done() + + serverName := c.GetConfig().Name + m.logger.Debug("Disconnecting server", + zap.String("id", clientID), + zap.String("server", serverName)) + + if err := c.Disconnect(); err != nil { + m.logger.Warn("Error disconnecting server", + zap.String("id", clientID), + zap.String("server", serverName), + zap.Error(err)) + } else { + m.logger.Debug("Successfully disconnected server", + zap.String("id", clientID), + zap.String("server", serverName)) + } + }(id, client) + } + + // Wait for all disconnections to complete (with timeout from context) + done := make(chan struct{}) + go func() { + wg.Wait() + close(done) + }() + + select { + case <-done: + m.logger.Info("All upstream servers disconnected successfully") + case <-ctx.Done(): + m.logger.Warn("Shutdown context cancelled, some servers may not have disconnected cleanly") + } + + // Additional cleanup: Find and stop ALL mcpproxy-managed containers + // This catches any orphaned containers from previous crashes + m.cleanupAllManagedContainers(ctx) + + m.logger.Info("All upstream servers shut down successfully") + return nil +} + +// cleanupAllManagedContainers finds and stops all Docker containers managed by mcpproxy +// Uses labels to identify containers across all instances +func (m *Manager) cleanupAllManagedContainers(ctx context.Context) { + m.logger.Info("Cleaning up all mcpproxy-managed Docker containers") + + // Find all containers with our management label + listCmd := exec.CommandContext(ctx, "docker", "ps", "-a", + "--filter", "label=com.mcpproxy.managed=true", + "--format", "{{.ID}}\t{{.Names}}\t{{.Label \"com.mcpproxy.server\"}}") + + output, err := listCmd.Output() + if err != nil { + m.logger.Debug("No Docker containers found or Docker unavailable", zap.Error(err)) + return + } + + lines := strings.Split(strings.TrimSpace(string(output)), "\n") + if len(lines) == 0 || lines[0] == "" { + m.logger.Debug("No mcpproxy-managed containers found") + return + } + + m.logger.Info("Found mcpproxy-managed containers to cleanup", + zap.Int("count", len(lines))) + + // Grace period for graceful shutdown + gracePeriod := 10 * time.Second + graceCtx, graceCancel := context.WithTimeout(ctx, gracePeriod) + defer graceCancel() + + containerIDs := []string{} + for _, line := range lines { + if line == "" { + continue + } + parts := strings.SplitN(line, "\t", 3) + if len(parts) >= 1 { + containerID := parts[0] + containerName := "" + serverName := "" + if len(parts) >= 2 { + containerName = parts[1] + } + if len(parts) >= 3 { + serverName = parts[2] + } + + m.logger.Info("Stopping container", + zap.String("container_id", containerID), + zap.String("container_name", containerName), + zap.String("server", serverName)) + + containerIDs = append(containerIDs, containerID) + + // Try graceful stop first + stopCmd := exec.CommandContext(graceCtx, "docker", "stop", containerID) + if err := stopCmd.Run(); err != nil { + m.logger.Warn("Graceful stop failed, will force kill", + zap.String("container_id", containerID), + zap.Error(err)) + } else { + m.logger.Info("Container stopped gracefully", + zap.String("container_id", containerID)) + } + } + } + + // Force kill any remaining containers after grace period + if graceCtx.Err() != nil || len(containerIDs) > 0 { + m.logger.Info("Force killing any remaining containers") + + killCtx, killCancel := context.WithTimeout(ctx, 5*time.Second) + defer killCancel() + + 
for _, containerID := range containerIDs { + // Check if container is still running + psCmd := exec.CommandContext(killCtx, "docker", "ps", "-q", + "--filter", "id="+containerID) + if output, err := psCmd.Output(); err == nil && len(strings.TrimSpace(string(output))) > 0 { + // Still running, force kill + m.logger.Info("Force killing container", + zap.String("container_id", containerID)) + + killCmd := exec.CommandContext(killCtx, "docker", "kill", containerID) + if err := killCmd.Run(); err != nil { + m.logger.Error("Failed to force kill container", + zap.String("container_id", containerID), + zap.Error(err)) + } else { + m.logger.Info("Container force killed", + zap.String("container_id", containerID)) + } + } + } + } + + m.logger.Info("Container cleanup completed") +} + // GetClient returns a client by ID func (m *Manager) GetClient(id string) (*managed.Client, bool) { m.mu.RLock() @@ -829,6 +1041,176 @@ func (m *Manager) RetryConnection(serverName string) error { return nil } +// verifyContainerHealthy checks if a Docker container is actually running and responsive +// This is critical for detecting "zombie" containers where the socket is open but the process is frozen +func (m *Manager) verifyContainerHealthy(client *managed.Client) (bool, error) { + // Only check Docker-based servers + if !client.IsDockerCommand() { + return true, nil // Non-Docker servers don't need container health check + } + + containerID := client.GetContainerID() + if containerID == "" { + return false, fmt.Errorf("no container ID available") + } + + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + // Check 1: Container exists and is running + inspectCmd := exec.CommandContext(ctx, "docker", "inspect", + "--format", "{{.State.Running}},{{.State.Status}}", + containerID) + + output, err := inspectCmd.Output() + if err != nil { + return false, fmt.Errorf("container not found or unreachable: %w", err) + } + + parts := strings.Split(strings.TrimSpace(string(output)), ",") + if len(parts) < 2 { + return false, fmt.Errorf("unexpected docker inspect output: %s", string(output)) + } + + running := parts[0] == "true" + status := parts[1] + + if !running { + return false, fmt.Errorf("container not running (status: %s)", status) + } + + m.logger.Debug("Container health check passed", + zap.String("container_id", containerID[:12]), + zap.String("status", status)) + + return true, nil +} + +// ReconnectResult tracks the results of a ForceReconnectAll operation +type ReconnectResult struct { + TotalServers int + AttemptedServers int + SuccessfulServers []string + FailedServers map[string]error + SkippedServers map[string]string // server name -> skip reason +} + +// ForceReconnectAll triggers reconnection attempts for all managed clients. +// For Docker-based servers, this includes container health verification to catch frozen containers. +// Returns detailed results about which servers were reconnected, skipped, or failed. 
+func (m *Manager) ForceReconnectAll(reason string) *ReconnectResult { + result := &ReconnectResult{ + SuccessfulServers: []string{}, + FailedServers: make(map[string]error), + SkippedServers: make(map[string]string), + } + m.mu.RLock() + clientMap := make(map[string]*managed.Client, len(m.clients)) + for id, client := range m.clients { + clientMap[id] = client + } + m.mu.RUnlock() + + result.TotalServers = len(clientMap) + + if len(clientMap) == 0 { + m.logger.Debug("ForceReconnectAll: no managed clients registered") + return result + } + + m.logger.Info("ForceReconnectAll: processing managed clients", + zap.Int("client_count", len(clientMap)), + zap.String("reason", reason)) + + for id, client := range clientMap { + if client == nil { + result.SkippedServers[id] = "nil client" + continue + } + + // CRITICAL: For Docker servers, verify container health even if connected + // This catches frozen containers when Docker is paused/resumed + if client.IsConnected() { + if client.IsDockerCommand() { + healthy, err := m.verifyContainerHealthy(client) + if !healthy { + m.logger.Warn("Container unhealthy despite active connection - forcing reconnect", + zap.String("server", id), + zap.Error(err)) + // Fall through to reconnect logic + } else { + result.SkippedServers[id] = "container healthy" + m.logger.Debug("Skipping reconnect - container healthy", + zap.String("server", id)) + continue + } + } else { + // Non-Docker server, connection state is sufficient + result.SkippedServers[id] = "already connected" + m.logger.Debug("Skipping reconnect - already connected", + zap.String("server", id)) + continue + } + } + + // Filter: Only reconnect Docker-based servers (skip HTTP/SSE/non-Docker stdio) + if !client.IsDockerCommand() { + result.SkippedServers[id] = "not a Docker server" + m.logger.Debug("Skipping reconnect - not a Docker server", + zap.String("server", id), + zap.String("reason", reason)) + continue + } + + cfg := cloneServerConfig(client.GetConfig()) + if cfg == nil { + result.SkippedServers[id] = "failed to clone config" + continue + } + + if !cfg.Enabled { + result.SkippedServers[id] = "server disabled" + m.logger.Debug("Skipping reconnect - server disabled", + zap.String("server", id)) + continue + } + + result.AttemptedServers++ + + m.logger.Info("ForceReconnectAll: rebuilding Docker client", + zap.String("server", cfg.Name), + zap.String("id", id), + zap.String("reason", reason)) + + m.RemoveServer(id) + + // Small delay to allow container/process cleanup before restart + time.Sleep(200 * time.Millisecond) + + if err := m.AddServer(id, cfg); err != nil { + result.FailedServers[id] = err + m.logger.Error("ForceReconnectAll: failed to rebuild client", + zap.String("server", cfg.Name), + zap.String("id", id), + zap.Error(err)) + } else { + result.SuccessfulServers = append(result.SuccessfulServers, id) + m.logger.Info("ForceReconnectAll: successfully rebuilt client", + zap.String("server", cfg.Name), + zap.String("id", id)) + } + } + + m.logger.Info("ForceReconnectAll completed", + zap.Int("total", result.TotalServers), + zap.Int("attempted", result.AttemptedServers), + zap.Int("successful", len(result.SuccessfulServers)), + zap.Int("failed", len(result.FailedServers)), + zap.Int("skipped", len(result.SkippedServers))) + + return result +} + // startOAuthEventMonitor monitors the database for OAuth completion events from CLI processes func (m *Manager) startOAuthEventMonitor(ctx context.Context) { m.logger.Info("Starting OAuth event monitor for cross-process notifications")