From 27b83a04022d5043349addad76adb7a0eb0ca9af Mon Sep 17 00:00:00 2001 From: engineer Date: Sat, 14 Feb 2026 11:37:06 -0800 Subject: [PATCH] fix: replace permanent watchdog halt with self-healing cooldown (#73) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The tunnel watchdog circuit breaker permanently halted after 5 restarts, leaving the tunnel disconnected forever. Replace with exponential backoff cooldown (5min → 10min → 20min → 30min cap) that automatically resumes monitoring. Add process liveness fast-path to detect dead cloudflared immediately. Reset cooldown count on successful recovery. --- backend/src/services/tunnel-service.ts | 58 ++++++++++++++++++++------ bin/cli.ts | 22 ++++++++++ 2 files changed, 68 insertions(+), 12 deletions(-) diff --git a/backend/src/services/tunnel-service.ts b/backend/src/services/tunnel-service.ts index ade60f20..01a285f4 100644 --- a/backend/src/services/tunnel-service.ts +++ b/backend/src/services/tunnel-service.ts @@ -17,6 +17,8 @@ const WATCHDOG_INTERVAL_MS = 30_000 const WATCHDOG_FAIL_THRESHOLD = 3 const MAX_RESTARTS = 5 const MAX_RESTART_WINDOW_MS = 10 * 60 * 1000 +const COOLDOWN_BASE_MS = 5 * 60 * 1000 +const COOLDOWN_MAX_MS = 30 * 60 * 1000 const MAX_LOG_SIZE_BYTES = 10 * 1024 * 1024 const MAX_LOG_BACKUPS = 2 const URL_CAPTURE_TIMEOUT_MS = 30_000 @@ -35,6 +37,8 @@ interface TunnelStatus { consecutiveFailures: number restartsInWindow: number halted: boolean + cooldownUntil: number | null + cooldownCount: number } error: string | null } @@ -60,6 +64,8 @@ class TunnelService { restarting: false, restartTimestamps: [] as number[], halted: false, + cooldownUntil: null as number | null, + cooldownCount: 0, } isRunning(): boolean { @@ -88,6 +94,8 @@ class TunnelService { consecutiveFailures: this.watchdogState.consecutiveFailures, restartsInWindow: this.watchdogState.restartTimestamps.length, halted: this.watchdogState.halted, + cooldownUntil: this.watchdogState.cooldownUntil, + cooldownCount: this.watchdogState.cooldownCount, }, error: this.error, } @@ -290,31 +298,57 @@ class TunnelService { restarting: false, restartTimestamps: [], halted: false, + cooldownUntil: null, + cooldownCount: 0, } logger.info('Starting tunnel watchdog') this.watchdogTimer = setInterval(async () => { - if (this.watchdogState.restarting || this.watchdogState.halted) return - - const connected = await this.checkConnected() - if (connected) { - if (this.watchdogState.consecutiveFailures > 0) { - logger.info(`Tunnel recovered after ${this.watchdogState.consecutiveFailures} failed check(s)`) - } + if (this.watchdogState.restarting) return + + if (this.watchdogState.cooldownUntil) { + const remaining = this.watchdogState.cooldownUntil - Date.now() + if (remaining > 0) return + logger.info('Watchdog cooldown expired, resuming monitoring') + this.watchdogState.cooldownUntil = null + this.watchdogState.halted = false + this.watchdogState.restartTimestamps = [] this.watchdogState.consecutiveFailures = 0 - return + this.error = null } - this.watchdogState.consecutiveFailures++ - logger.warn(`Tunnel disconnected (${this.watchdogState.consecutiveFailures}/${WATCHDOG_FAIL_THRESHOLD})`) + if (!this.process || this.process.killed) { + logger.warn('Tunnel process is dead, attempting restart') + this.watchdogState.consecutiveFailures = WATCHDOG_FAIL_THRESHOLD + } else { + const connected = await this.checkConnected() + if (connected) { + if (this.watchdogState.consecutiveFailures > 0) { + logger.info(`Tunnel recovered after ${this.watchdogState.consecutiveFailures} failed check(s)`) + } + this.watchdogState.consecutiveFailures = 0 + if (this.watchdogState.cooldownCount > 0) { + this.watchdogState.cooldownCount = 0 + } + return + } + this.watchdogState.consecutiveFailures++ + logger.warn(`Tunnel disconnected (${this.watchdogState.consecutiveFailures}/${WATCHDOG_FAIL_THRESHOLD})`) + } if (this.watchdogState.consecutiveFailures < WATCHDOG_FAIL_THRESHOLD) return if (this.isCircuitBroken()) { - logger.error(`Circuit breaker triggered: ${MAX_RESTARTS} restarts in ${MAX_RESTART_WINDOW_MS / 60000} min window. Halting watchdog.`) + this.watchdogState.cooldownCount++ + const cooldownMs = Math.min( + COOLDOWN_BASE_MS * Math.pow(2, this.watchdogState.cooldownCount - 1), + COOLDOWN_MAX_MS + ) + this.watchdogState.cooldownUntil = Date.now() + cooldownMs this.watchdogState.halted = true - this.error = 'Watchdog halted: too many restarts' + this.error = `Watchdog cooling down: ${MAX_RESTARTS} restarts in ${MAX_RESTART_WINDOW_MS / 60000} min. Resuming in ${Math.round(cooldownMs / 60000)} min` + logger.warn(this.error) return } diff --git a/bin/cli.ts b/bin/cli.ts index c684469d..d1aac742 100755 --- a/bin/cli.ts +++ b/bin/cli.ts @@ -1079,6 +1079,14 @@ interface TunnelStatusResponse { edgeLocationFormatted?: string; haConnections?: number; error?: string; + watchdog?: { + enabled: boolean; + consecutiveFailures: number; + restartsInWindow: number; + halted: boolean; + cooldownUntil: number | null; + cooldownCount: number; + }; } async function commandHealth(args: string[]): Promise { @@ -1294,6 +1302,20 @@ async function commandHealth(args: string[]): Promise { " warning: tunnel metrics not reachable; showing last known URL", ); } + if (results.tunnel.data?.watchdog) { + const wd = results.tunnel.data.watchdog; + if (wd.halted && wd.cooldownUntil) { + const remainMs = wd.cooldownUntil - Date.now(); + const remainMin = Math.max(0, Math.ceil(remainMs / 60000)); + console.log( + ` watchdog: cooling down (attempt ${wd.cooldownCount}), resumes in ${remainMin} min`, + ); + } else if (wd.consecutiveFailures > 0) { + console.log( + ` watchdog: ${wd.consecutiveFailures} consecutive failure(s)`, + ); + } + } if (results.tunnel.error) { console.log(` error: ${results.tunnel.error}`); }