Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
214 changes: 151 additions & 63 deletions backend/src/services/tunnel-service.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ const AUTH_FILE = path.join(CONFIG_DIR, 'auth.json')

const METRICS_PORTS = [20241, 20242, 20243, 20244, 20245]
const WATCHDOG_INTERVAL_MS = 30_000
const WATCHDOG_JITTER_MS = 5_000
const WATCHDOG_FAIL_THRESHOLD = 3
const MAX_RESTARTS = 5
const MAX_RESTART_WINDOW_MS = 10 * 60 * 1000
Expand All @@ -24,6 +25,16 @@ const MAX_LOG_BACKUPS = 2
const URL_CAPTURE_TIMEOUT_MS = 30_000
const PROCESS_KILL_TIMEOUT_MS = 5_000

// Patterns matched (case-insensitively) against cloudflared process output.
// A hit is treated as unrecoverable: the output handler records the line as
// watchdogState.fatalError, sets halted, and the watchdog then skips all
// further restart attempts until the tunnel is started fresh.
const FATAL_ERROR_PATTERNS = [
  /unauthorized/i,
  /tunnel not found/i,
  /failed to connect to an ideally located cfd server/i,
  /connection refused/i,
  /failed to unmarshal tunnel credentials/i,
  /invalid tunnel credentials/i,
  /err_tunnel_id/i,
]

interface TunnelStatus {
running: boolean
url: string | null
Expand All @@ -39,6 +50,7 @@ interface TunnelStatus {
halted: boolean
cooldownUntil: number | null
cooldownCount: number
fatalError: string | null
}
error: string | null
}
Expand All @@ -51,7 +63,7 @@ interface AuthConfig {
class TunnelService {
private process: ChildProcess | null = null
private logStream: fs.WriteStream | null = null
private watchdogTimer: ReturnType<typeof setInterval> | null = null
private watchdogTimer: ReturnType<typeof setTimeout> | null = null
private localPort: number = 5001
private url: string | null = null
private urlWithAuth: string | null = null
Expand All @@ -66,6 +78,7 @@ class TunnelService {
halted: false,
cooldownUntil: null as number | null,
cooldownCount: 0,
fatalError: null as string | null,
}

isRunning(): boolean {
Expand Down Expand Up @@ -96,6 +109,7 @@ class TunnelService {
halted: this.watchdogState.halted,
cooldownUntil: this.watchdogState.cooldownUntil,
cooldownCount: this.watchdogState.cooldownCount,
fatalError: this.watchdogState.fatalError,
},
error: this.error,
}
Expand Down Expand Up @@ -228,6 +242,17 @@ class TunnelService {
this.logStream?.write(`[${ts()}] ${line}\n`)
}

for (const pattern of FATAL_ERROR_PATTERNS) {
if (pattern.test(output)) {
const fatalMsg = `Fatal cloudflared error: ${output.trim().slice(0, 200)}`
logger.error(fatalMsg)
this.watchdogState.fatalError = fatalMsg
this.watchdogState.halted = true
this.error = fatalMsg
break
}
}

const urlMatch = output.match(/https:\/\/[a-z0-9-]+\.trycloudflare\.com/)
if (urlMatch && !capturedUrl) {
capturedUrl = urlMatch[0]
Expand Down Expand Up @@ -276,15 +301,17 @@ class TunnelService {

this.startedAt = Date.now()
this.writeTunnelState()
this.updateEndpoints()

logger.info(`Tunnel established: ${this.url}`)
this.logStream?.write(`[${ts()}] Tunnel established: ${this.url}\n`)

const reachable = await this.verifyReachable()
if (!reachable) {
logger.warn('Tunnel URL obtained but not yet reachable')
this.logStream?.write(`[${ts()}] WARNING: Tunnel URL not immediately reachable\n`)
if (reachable) {
this.updateEndpoints()
} else {
logger.warn('Tunnel URL obtained but not yet reachable — deferring endpoints.json update')
this.logStream?.write(`[${ts()}] WARNING: Tunnel URL not yet reachable, will retry\n`)
this.deferEndpointUpdate()
}

this.startWatchdog()
Expand All @@ -300,82 +327,115 @@ class TunnelService {
halted: false,
cooldownUntil: null,
cooldownCount: 0,
fatalError: null,
}

logger.info('Starting tunnel watchdog')
this.scheduleWatchdogTick()
}

this.watchdogTimer = setInterval(async () => {
if (this.watchdogState.restarting) return
/** Arm the next watchdog check after the base interval plus random jitter. */
private scheduleWatchdogTick(): void {
  const jitterMs = Math.floor(Math.random() * WATCHDOG_JITTER_MS)
  const delayMs = WATCHDOG_INTERVAL_MS + jitterMs
  this.watchdogTimer = setTimeout(() => this.watchdogTick(), delayMs)
}

if (this.watchdogState.cooldownUntil) {
const remaining = this.watchdogState.cooldownUntil - Date.now()
if (remaining > 0) return
logger.info('Watchdog cooldown expired, resuming monitoring')
this.watchdogState.cooldownUntil = null
this.watchdogState.halted = false
this.watchdogState.restartTimestamps = []
this.watchdogState.consecutiveFailures = 0
this.error = null
}
/**
 * One watchdog pass: verify the tunnel is alive, restart it after
 * WATCHDOG_FAIL_THRESHOLD consecutive failed checks, and back off with an
 * exponential cooldown when restarts churn. Re-arms itself via
 * scheduleWatchdogTick() on every exit path.
 *
 * NOTE(review): this span of the paste interleaved the pre-change
 * setInterval body with the post-change implementation (several hunks
 * appeared twice); this is the reconstructed post-change method.
 */
private async watchdogTick(): Promise<void> {
  // A restart is already in flight — just check again later.
  if (this.watchdogState.restarting) {
    this.scheduleWatchdogTick()
    return
  }

  // Fatal cloudflared errors are unrecoverable; keep ticking but do nothing.
  if (this.watchdogState.fatalError) {
    this.scheduleWatchdogTick()
    return
  }

  // Honour an active cooldown; once it expires, reset failure tracking.
  if (this.watchdogState.cooldownUntil) {
    const remaining = this.watchdogState.cooldownUntil - Date.now()
    if (remaining > 0) {
      this.scheduleWatchdogTick()
      return
    }
    logger.info('Watchdog cooldown expired, resuming monitoring')
    this.watchdogState.cooldownUntil = null
    this.watchdogState.halted = false
    this.watchdogState.restartTimestamps = []
    this.watchdogState.consecutiveFailures = 0
    this.error = null
  }

  if (!this.process || this.process.killed) {
    // Dead process: skip the grace period and go straight to restart.
    logger.warn('Tunnel process is dead, attempting restart')
    this.watchdogState.consecutiveFailures = WATCHDOG_FAIL_THRESHOLD
  } else {
    const connected = await this.checkConnected()
    if (connected) {
      if (this.watchdogState.consecutiveFailures > 0) {
        logger.info(`Tunnel recovered after ${this.watchdogState.consecutiveFailures} failed check(s)`)
      }
      this.watchdogState.consecutiveFailures = 0
      if (this.watchdogState.cooldownCount > 0) {
        this.watchdogState.cooldownCount = 0
      }
      this.scheduleWatchdogTick()
      return
    }
    this.watchdogState.consecutiveFailures++
    logger.warn(`Tunnel disconnected (${this.watchdogState.consecutiveFailures}/${WATCHDOG_FAIL_THRESHOLD})`)
  }

  // Not enough consecutive failures yet to justify a restart.
  if (this.watchdogState.consecutiveFailures < WATCHDOG_FAIL_THRESHOLD) {
    this.scheduleWatchdogTick()
    return
  }

  // Circuit breaker: too many restarts inside the window → exponential cooldown.
  if (this.isCircuitBroken()) {
    this.watchdogState.cooldownCount++
    const cooldownMs = Math.min(
      COOLDOWN_BASE_MS * Math.pow(2, this.watchdogState.cooldownCount - 1),
      COOLDOWN_MAX_MS
    )
    this.watchdogState.cooldownUntil = Date.now() + cooldownMs
    this.watchdogState.halted = true
    this.error = `Watchdog cooling down: ${MAX_RESTARTS} restarts in ${MAX_RESTART_WINDOW_MS / 60000} min. Resuming in ${Math.round(cooldownMs / 60000)} min`
    logger.warn(this.error)
    this.scheduleWatchdogTick()
    return
  }

  this.watchdogState.restarting = true

  // Re-check right before the disruptive restart: the tunnel may have
  // recovered between the failed check above and now.
  const stillDisconnected =
    !this.process || this.process.killed || !(await this.checkConnected())
  if (!stillDisconnected) {
    logger.info('Tunnel reconnected before restart — skipping')
    this.watchdogState.consecutiveFailures = 0
    // Must clear the flag, otherwise every future tick short-circuits.
    this.watchdogState.restarting = false
    this.scheduleWatchdogTick()
    return
  }

  this.watchdogState.consecutiveFailures = 0
  this.watchdogState.restartTimestamps.push(Date.now())
  logger.info('Watchdog restarting tunnel...')

  try {
    await this.doRestart()
    if (this.url) {
      logger.info(`Watchdog restored tunnel: ${this.url}`)
    } else {
      logger.warn('Watchdog restarted tunnel but no URL obtained')
    }
  } catch (err) {
    logger.error('Watchdog failed to restart tunnel:', err instanceof Error ? err.message : err)
    this.error = `Watchdog restart failed: ${err instanceof Error ? err.message : 'unknown'}`
  } finally {
    this.watchdogState.restarting = false
  }

  this.scheduleWatchdogTick()
}

private stopWatchdog(): void {
if (this.watchdogTimer) {
clearInterval(this.watchdogTimer)
clearTimeout(this.watchdogTimer)
this.watchdogTimer = null
logger.info('Tunnel watchdog stopped')
}
Expand Down Expand Up @@ -467,6 +527,34 @@ class TunnelService {
return false
}

/**
 * Background retry loop that writes endpoints.json once the tunnel URL
 * becomes reachable. Used when the URL was captured but the initial
 * reachability probe failed. Gives up after maxRetries and writes the
 * endpoint anyway.
 *
 * Fix: the async retry callback was passed straight to setTimeout, so a
 * rejection from verifyReachable() became an unhandled promise rejection
 * and silently terminated the retry chain. Failures are now caught and
 * logged via a sync wrapper.
 */
private deferEndpointUpdate(): void {
  const maxRetries = 5
  const retryDelayMs = 10_000
  let attempt = 0

  const retry = async (): Promise<void> => {
    attempt++
    // Tunnel stopped (or URL cleared) while waiting — drop the update.
    if (!this.url || !this.isRunning()) return

    const reachable = await this.verifyReachable()
    if (reachable) {
      logger.info('Deferred reachability check passed — updating endpoints.json')
      this.updateEndpoints()
      return
    }

    if (attempt < maxRetries) {
      logger.warn(`Deferred reachability check failed (${attempt}/${maxRetries}), retrying in ${retryDelayMs / 1000}s`)
      setTimeout(schedule, retryDelayMs)
    } else {
      logger.warn('Deferred reachability checks exhausted — writing endpoint anyway to satisfy requirements')
      this.updateEndpoints()
    }
  }

  // setTimeout never awaits its callback: route rejections into the logger
  // instead of letting them surface as unhandled promise rejections.
  const schedule = (): void => {
    void retry().catch((err: unknown) => {
      logger.warn(`Deferred endpoint update check threw: ${err instanceof Error ? err.message : String(err)}`)
    })
  }

  setTimeout(schedule, retryDelayMs)
}

private getAuth(): AuthConfig | null {
const username = process.env.AUTH_USERNAME
const password = process.env.AUTH_PASSWORD
Expand Down
5 changes: 4 additions & 1 deletion bin/cli.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1086,6 +1086,7 @@ interface TunnelStatusResponse {
halted: boolean;
cooldownUntil: number | null;
cooldownCount: number;
fatalError: string | null;
};
}

Expand Down Expand Up @@ -1304,7 +1305,9 @@ async function commandHealth(args: string[]): Promise<void> {
}
if (results.tunnel.data?.watchdog) {
const wd = results.tunnel.data.watchdog;
if (wd.halted && wd.cooldownUntil) {
if (wd.fatalError) {
console.log(` watchdog: FATAL — ${wd.fatalError}`);
} else if (wd.halted && wd.cooldownUntil) {
const remainMs = wd.cooldownUntil - Date.now();
const remainMin = Math.max(0, Math.ceil(remainMs / 60000));
console.log(
Expand Down