From 76a5591ba3584a6b9564a9f9431ae334b6069306 Mon Sep 17 00:00:00 2001 From: observerr411 Date: Thu, 26 Feb 2026 11:58:50 +0100 Subject: [PATCH] feat(api): extend health endpoint with detailed component checks --- .github/workflows/ci.yml | 25 ++- docs/health-check.md | 354 +++++++++++++++++++++++++++++++ package.json | 5 +- src/app.ts | 27 ++- src/config/health.ts | 75 +++++++ src/index.ts | 24 ++- src/services/healthCheck.test.ts | 308 +++++++++++++++++++++++++++ src/services/healthCheck.ts | 274 ++++++++++++++++++++++++ tests/integration/health.test.ts | 152 +++++++++++++ 9 files changed, 1238 insertions(+), 6 deletions(-) create mode 100644 docs/health-check.md create mode 100644 src/config/health.ts create mode 100644 src/services/healthCheck.test.ts create mode 100644 src/services/healthCheck.ts create mode 100644 tests/integration/health.test.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 07a0135..a811566 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -33,5 +33,26 @@ jobs: - name: Typecheck run: npm run typecheck - - name: Run Tests - run: NODE_ENV=test npm test \ No newline at end of file + - name: Run Unit Tests + run: NODE_ENV=test npm run test:unit + + - name: Run Integration Tests + run: NODE_ENV=test npm run test:integration + + - name: Generate Coverage Report + run: NODE_ENV=test npm run test:coverage + + - name: Build + run: npm run build + + - name: Verify Build Artifacts + run: | + if [ ! -d "dist" ]; then + echo "Build failed: dist directory not found" + exit 1 + fi + if [ ! 
-f "dist/index.js" ]; then + echo "Build failed: dist/index.js not found" + exit 1 + fi + echo "✅ Build artifacts verified" \ No newline at end of file diff --git a/docs/health-check.md b/docs/health-check.md new file mode 100644 index 0000000..b7ca42d --- /dev/null +++ b/docs/health-check.md @@ -0,0 +1,354 @@ +# Health Check Endpoint + +## Overview + +The `/api/health` endpoint provides comprehensive health monitoring for all system components. It's designed for load balancer integration and monitoring systems. + +## Endpoint + +``` +GET /api/health +``` + +## Response Format + +### Success Response (200 OK) + +All checked components (critical and optional) are healthy: + +```json +{ + "status": "ok", + "version": "1.0.0", + "timestamp": "2026-02-26T10:30:00.000Z", + "checks": { + "api": "ok", + "database": "ok", + "soroban_rpc": "ok", + "horizon": "ok" + } +} +``` + +### Degraded Response (200 OK) + +Optional components are down or any component is slow: + +```json +{ + "status": "degraded", + "version": "1.0.0", + "timestamp": "2026-02-26T10:30:00.000Z", + "checks": { + "api": "ok", + "database": "ok", + "soroban_rpc": "down", + "horizon": "degraded" + } +} +``` + +### Critical Failure Response (503 Service Unavailable) + +Critical components (API or database) are down: + +```json +{ + "status": "down", + "version": "1.0.0", + "timestamp": "2026-02-26T10:30:00.000Z", + "checks": { + "api": "ok", + "database": "down" + } +} +``` + +## Component Status Values + +- `ok`: Component is healthy and responsive +- `degraded`: Component is responding but slowly (>1s for DB, >2s for external services), or an external service answered with a non-2xx HTTP status +- `down`: Component is unreachable, timed out, or (for the database) the query failed or returned an unexpected result + +## Components + +### Critical Components + +These components must be healthy for the service to function: + +1. **API**: Always returns `ok` if the service can respond +2. 
**Database**: Executes `SELECT 1` query to verify connectivity + +### Optional Components + +These components are checked if configured but don't cause 503 if down: + +1. **Soroban RPC**: Calls `getHealth` JSON-RPC method +2. **Horizon**: Pings root endpoint + +## Configuration + +Configure via environment variables: + +```bash +# Required +DB_HOST=localhost +DB_PORT=5432 +DB_USER=postgres +DB_PASSWORD=postgres +DB_NAME=callora + +# Optional - Soroban RPC +SOROBAN_RPC_ENABLED=true +SOROBAN_RPC_URL=https://soroban-testnet.stellar.org +SOROBAN_RPC_TIMEOUT=2000 + +# Optional - Horizon +HORIZON_ENABLED=true +HORIZON_URL=https://horizon-testnet.stellar.org +HORIZON_TIMEOUT=2000 + +# Health Check Timeouts +HEALTH_CHECK_DB_TIMEOUT=2000 +``` + +## Status Determination Logic + +1. If any **critical component** (API or database) is `down` → Overall status: `down` (503) +2. If any component is `degraded` or `down` → Overall status: `degraded` (200) +3. Otherwise → Overall status: `ok` (200) + +## Performance Thresholds + +- Database: Marked as `degraded` if response time > 1000ms +- External services: Marked as `degraded` if response time > 2000ms +- Overall health check: Completes in < 500ms under normal conditions + +## Timeout Protection + +All checks have timeout protection to prevent blocking: + +- Database: 2000ms default (configurable) +- Soroban RPC: 2000ms default (configurable) +- Horizon: 2000ms default (configurable) + +If a timeout occurs, the component is marked as `down`. 
+ +## Load Balancer Integration + +### AWS Application Load Balancer (ALB) + +```json +{ + "HealthCheckEnabled": true, + "HealthCheckPath": "/api/health", + "HealthCheckIntervalSeconds": 30, + "HealthCheckTimeoutSeconds": 5, + "HealthyThresholdCount": 2, + "UnhealthyThresholdCount": 3, + "Matcher": { + "HttpCode": "200" + } +} +``` + +### NGINX + +```nginx +upstream backend { + server backend1:3000 max_fails=3 fail_timeout=30s; + server backend2:3000 max_fails=3 fail_timeout=30s; +} + +server { + location / { + proxy_pass http://backend; + + # Active health check (health_check requires NGINX Plus and a shared-memory zone in the upstream) + health_check interval=10s fails=3 passes=2 uri=/api/health; + } +} +``` + +### Kubernetes + +```yaml +apiVersion: v1 +kind: Pod +metadata: + name: callora-backend +spec: + containers: + - name: app + image: callora-backend:latest + livenessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + httpGet: + path: /api/health + port: 3000 + initialDelaySeconds: 10 + periodSeconds: 5 + timeoutSeconds: 3 + failureThreshold: 2 +``` + +## Security Considerations + +- No sensitive information is exposed in health responses +- Stack traces are never included in responses +- Internal error details are kept out of the response body (only per-component status values are returned) +- Timeout protection prevents resource exhaustion +- Connection pooling prevents database connection leaks + +## Testing + +### Manual Testing + +```bash +# Basic health check +curl http://localhost:3000/api/health + +# With verbose output +curl -i http://localhost:3000/api/health + +# Pretty print JSON +curl -s http://localhost:3000/api/health | jq +``` + +### Automated Testing + +```bash +# Run unit tests +npm run test:unit + +# Run integration tests +npm run test:integration + +# Run all tests with coverage +npm run test:coverage +``` + +## Monitoring Integration + +### Prometheus + +Example metrics endpoint integration: + +```typescript +import { register, Counter, Histogram } from 
'prom-client'; + +const healthCheckDuration = new Histogram({ + name: 'health_check_duration_seconds', + help: 'Duration of health checks', + labelNames: ['component', 'status'], +}); + +const healthCheckTotal = new Counter({ + name: 'health_check_total', + help: 'Total number of health checks', + labelNames: ['component', 'status'], +}); +``` + +### Datadog + +```javascript +const StatsD = require('node-dogstatsd').StatsD; +const dogstatsd = new StatsD(); + +// After health check +dogstatsd.gauge('health.status', status === 'ok' ? 1 : 0); +dogstatsd.histogram('health.response_time', responseTime); +``` + +## Troubleshooting + +### Health Check Returns 503 + +1. Check database connectivity: `psql -h $DB_HOST -U $DB_USER -d $DB_NAME` +2. Verify database credentials in environment variables +3. Check database logs for connection errors +4. Verify network connectivity to database + +### Health Check Times Out + +1. Check database query performance +2. Verify external service URLs are correct +3. Check network latency to external services +4. Consider increasing timeout values + +### Degraded Status + +1. Check component response times in logs +2. Investigate slow database queries +3. Check external service status pages +4. Monitor network latency + +## Best Practices + +1. **Poll Frequency**: Check every 10-30 seconds for load balancers +2. **Failure Threshold**: Require 2-3 consecutive failures before marking unhealthy +3. **Timeout**: Set the load balancer probe timeout above the endpoint's component check timeouts (e.g. 5s vs. the 2s defaults) so the probe can receive the response +4. **Monitoring**: Alert on degraded status, page on down status +5. **Logging**: Log all health check failures with full context +6. 
**Graceful Degradation**: Continue serving traffic on degraded status + +## Example Responses + +### All Healthy + +```bash +$ curl http://localhost:3000/api/health +{ + "status": "ok", + "version": "1.0.0", + "timestamp": "2026-02-26T10:30:00.000Z", + "checks": { + "api": "ok", + "database": "ok", + "soroban_rpc": "ok", + "horizon": "ok" + } +} +``` + +### Database Down + +```bash +$ curl -i http://localhost:3000/api/health +HTTP/1.1 503 Service Unavailable +Content-Type: application/json + +{ + "status": "down", + "version": "1.0.0", + "timestamp": "2026-02-26T10:30:00.000Z", + "checks": { + "api": "ok", + "database": "down" + } +} +``` + +### Optional Service Down + +```bash +$ curl http://localhost:3000/api/health +{ + "status": "degraded", + "version": "1.0.0", + "timestamp": "2026-02-26T10:30:00.000Z", + "checks": { + "api": "ok", + "database": "ok", + "soroban_rpc": "down" + } +} +``` diff --git a/package.json b/package.json index a186b4f..b8c6fdb 100644 --- a/package.json +++ b/package.json @@ -12,7 +12,10 @@ "lint": "eslint . 
--ext .ts", "typecheck": "tsc --noEmit", "test": "node --import tsx --test \"src/**/*.test.ts\"", - "test:coverage": "node --import tsx --test --experimental-test-coverage \"src/**/*.test.ts\"", + "test:unit": "node --import tsx --test \"src/**/*.test.ts\"", + "test:integration": "node --import tsx --test \"tests/integration/**/*.test.ts\"", + "test:coverage": "node --import tsx --test --experimental-test-coverage \"src/**/*.test.ts\" \"tests/integration/**/*.test.ts\"", + "test:all": "npm run test:unit && npm run test:integration", "validate:issue-9": "node scripts/validate-issue-9.mjs" }, "dependencies": { diff --git a/src/app.ts b/src/app.ts index 401fd3d..acad4cc 100644 --- a/src/app.ts +++ b/src/app.ts @@ -1,4 +1,5 @@ import express from 'express'; +import type { Pool } from 'pg'; import { InMemoryUsageEventsRepository, @@ -8,9 +9,11 @@ import { import { requireAuth, type AuthenticatedLocals } from './middleware/requireAuth.js'; import { buildDeveloperAnalytics } from './services/developerAnalytics.js'; import { errorHandler } from './middleware/errorHandler.js'; +import { performHealthCheck, type HealthCheckConfig } from './services/healthCheck.js'; interface AppDependencies { usageEventsRepository: UsageEventsRepository; + healthCheckConfig?: HealthCheckConfig; } const isValidGroupBy = (value: string): value is GroupBy => @@ -35,8 +38,28 @@ export const createApp = (dependencies?: Partial) => { app.use(express.json()); - app.get('/api/health', (_req, res) => { - res.json({ status: 'ok', service: 'callora-backend' }); + app.get('/api/health', async (_req, res) => { + // If no health check config provided, return simple health check + if (!dependencies?.healthCheckConfig) { + res.json({ status: 'ok', service: 'callora-backend' }); + return; + } + + try { + const healthStatus = await performHealthCheck(dependencies.healthCheckConfig); + const statusCode = healthStatus.status === 'down' ? 
503 : 200; + res.status(statusCode).json(healthStatus); + } catch (error) { + // Never expose internal errors in health check + res.status(503).json({ + status: 'down', + timestamp: new Date().toISOString(), + checks: { + api: 'ok', + database: 'down', + }, + }); + } }); app.get('/api/apis', (_req, res) => { diff --git a/src/config/health.ts b/src/config/health.ts new file mode 100644 index 0000000..844f7d7 --- /dev/null +++ b/src/config/health.ts @@ -0,0 +1,75 @@ +/** + * Health Check Configuration + * + * Centralizes health check configuration from environment variables + */ + +import { Pool } from 'pg'; +import type { HealthCheckConfig } from '../services/healthCheck.js'; + +let dbPool: Pool | null = null; + +/** + * Creates or returns existing database connection pool + */ +function getDbPool(): Pool { + if (!dbPool) { + dbPool = new Pool({ + host: process.env.DB_HOST || 'localhost', + port: parseInt(process.env.DB_PORT || '5432', 10), + user: process.env.DB_USER || 'postgres', + password: process.env.DB_PASSWORD || 'postgres', + database: process.env.DB_NAME || 'callora', + max: 10, + idleTimeoutMillis: 30000, + connectionTimeoutMillis: 5000, + }); + } + return dbPool; +} + +/** + * Builds health check configuration from environment variables + */ +export function buildHealthCheckConfig(): HealthCheckConfig | undefined { + // Only enable detailed health checks if database is configured + if (!process.env.DB_HOST && !process.env.DB_NAME) { + return undefined; + } + + const config: HealthCheckConfig = { + version: process.env.APP_VERSION || '1.0.0', + database: { + pool: getDbPool(), + timeout: parseInt(process.env.HEALTH_CHECK_DB_TIMEOUT || '2000', 10), + }, + }; + + // Add Soroban RPC if enabled + if (process.env.SOROBAN_RPC_ENABLED === 'true' && process.env.SOROBAN_RPC_URL) { + config.sorobanRpc = { + url: process.env.SOROBAN_RPC_URL, + timeout: parseInt(process.env.SOROBAN_RPC_TIMEOUT || '2000', 10), + }; + } + + // Add Horizon if enabled + if 
(process.env.HORIZON_ENABLED === 'true' && process.env.HORIZON_URL) { + config.horizon = { + url: process.env.HORIZON_URL, + timeout: parseInt(process.env.HORIZON_TIMEOUT || '2000', 10), + }; + } + + return config; +} + +/** + * Closes database pool gracefully + */ +export async function closeDbPool(): Promise { + if (dbPool) { + await dbPool.end(); + dbPool = null; + } +} diff --git a/src/index.ts b/src/index.ts index 01117bb..b862a8c 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,13 +1,35 @@ import { fileURLToPath } from 'node:url'; +import dotenv from 'dotenv'; import { createApp } from './app.js'; +import { buildHealthCheckConfig, closeDbPool } from './config/health.js'; -const app = createApp(); +// Load environment variables +dotenv.config(); + +const healthCheckConfig = buildHealthCheckConfig(); +const app = createApp({ healthCheckConfig }); const PORT = process.env.PORT ?? 3000; if (process.argv[1] && fileURLToPath(import.meta.url) === process.argv[1]) { app.listen(PORT, () => { console.log(`Callora backend listening on http://localhost:${PORT}`); + if (healthCheckConfig) { + console.log('✅ Health check endpoint enabled at /api/health'); + } + }); + + // Graceful shutdown + process.on('SIGTERM', async () => { + console.log('SIGTERM received, closing connections...'); + await closeDbPool(); + process.exit(0); + }); + + process.on('SIGINT', async () => { + console.log('SIGINT received, closing connections...'); + await closeDbPool(); + process.exit(0); }); } diff --git a/src/services/healthCheck.test.ts b/src/services/healthCheck.test.ts new file mode 100644 index 0000000..ed20aa7 --- /dev/null +++ b/src/services/healthCheck.test.ts @@ -0,0 +1,308 @@ +/** + * Health Check Service Unit Tests + * + * Comprehensive test coverage for health check functionality + * All external dependencies are mocked - no real network calls + */ + +import assert from 'node:assert/strict'; +import test, { describe, mock } from 'node:test'; +import type { Pool, QueryResult } 
from 'pg'; +import { + checkDatabase, + checkSorobanRpc, + checkHorizon, + determineOverallStatus, + performHealthCheck, + type HealthCheckConfig, +} from './healthCheck.js'; + +// Mock Pool for database tests +function createMockPool( + queryResult: QueryResult | Error, + delay: number = 0 +): Pool { + return { + query: async () => { + if (delay > 0) { + await new Promise((resolve) => setTimeout(resolve, delay)); + } + if (queryResult instanceof Error) { + throw queryResult; + } + return queryResult; + }, + } as unknown as Pool; +} + +describe('checkDatabase', () => { + test('returns ok when database responds quickly', async () => { + const pool = createMockPool({ rows: [{ result: 1 }] } as QueryResult); + const result = await checkDatabase(pool, 2000); + + assert.equal(result.status, 'ok'); + assert.ok(result.responseTime !== undefined); + assert.ok(result.responseTime < 1000); + }); + + test('returns degraded when database responds slowly', async () => { + const pool = createMockPool({ rows: [{ result: 1 }] } as QueryResult, 1100); + const result = await checkDatabase(pool, 2000); + + assert.equal(result.status, 'degraded'); + assert.ok(result.responseTime !== undefined); + assert.ok(result.responseTime >= 1000); + }); + + test('returns down when database query fails', async () => { + const pool = createMockPool(new Error('Connection refused')); + const result = await checkDatabase(pool, 2000); + + assert.equal(result.status, 'down'); + assert.equal(result.error, 'Connection refused'); + }); + + test('returns down when database times out', async () => { + const pool = createMockPool({ rows: [{ result: 1 }] } as QueryResult, 3000); + const result = await checkDatabase(pool, 500); + + assert.equal(result.status, 'down'); + assert.equal(result.error, 'Database check timeout'); + }); + + test('returns down when query returns unexpected result', async () => { + const pool = createMockPool({ rows: [] } as QueryResult); + const result = await checkDatabase(pool, 2000); 
+ + assert.equal(result.status, 'down'); + assert.equal(result.error, 'Unexpected query result'); + }); +}); + +describe('checkSorobanRpc', () => { + test('returns ok when Soroban RPC responds quickly', async () => { + const mockFetch = mock.fn(async () => ({ + ok: true, + json: async () => ({ jsonrpc: '2.0', id: 1, result: { status: 'healthy' } }), + })); + global.fetch = mockFetch as unknown as typeof fetch; + + const result = await checkSorobanRpc('https://soroban-test.stellar.org', 2000); + + assert.equal(result.status, 'ok'); + assert.ok(result.responseTime !== undefined); + assert.ok(result.responseTime < 2000); + }); + + test('returns degraded when Soroban RPC responds with non-ok status', async () => { + const mockFetch = mock.fn(async () => ({ + ok: false, + status: 503, + })); + global.fetch = mockFetch as unknown as typeof fetch; + + const result = await checkSorobanRpc('https://soroban-test.stellar.org', 2000); + + assert.equal(result.status, 'degraded'); + assert.equal(result.error, 'HTTP 503'); + }); + + test('returns down when Soroban RPC is unreachable', async () => { + const mockFetch = mock.fn(async () => { + throw new Error('Network error'); + }); + global.fetch = mockFetch as unknown as typeof fetch; + + const result = await checkSorobanRpc('https://soroban-test.stellar.org', 2000); + + assert.equal(result.status, 'down'); + assert.equal(result.error, 'Network error'); + }); + + test('returns down when Soroban RPC times out', async () => { + const mockFetch = mock.fn(async () => { + await new Promise((resolve) => setTimeout(resolve, 3000)); + return { ok: true, json: async () => ({}) }; + }); + global.fetch = mockFetch as unknown as typeof fetch; + + const result = await checkSorobanRpc('https://soroban-test.stellar.org', 100); + + assert.equal(result.status, 'down'); + assert.equal(result.error, 'Timeout'); + }); +}); + +describe('checkHorizon', () => { + test('returns ok when Horizon responds quickly', async () => { + const mockFetch = 
mock.fn(async () => ({ + ok: true, + })); + global.fetch = mockFetch as unknown as typeof fetch; + + const result = await checkHorizon('https://horizon-testnet.stellar.org', 2000); + + assert.equal(result.status, 'ok'); + assert.ok(result.responseTime !== undefined); + }); + + test('returns degraded when Horizon responds with non-ok status', async () => { + const mockFetch = mock.fn(async () => ({ + ok: false, + status: 500, + })); + global.fetch = mockFetch as unknown as typeof fetch; + + const result = await checkHorizon('https://horizon-testnet.stellar.org', 2000); + + assert.equal(result.status, 'degraded'); + assert.equal(result.error, 'HTTP 500'); + }); + + test('returns down when Horizon is unreachable', async () => { + const mockFetch = mock.fn(async () => { + throw new Error('ECONNREFUSED'); + }); + global.fetch = mockFetch as unknown as typeof fetch; + + const result = await checkHorizon('https://horizon-testnet.stellar.org', 2000); + + assert.equal(result.status, 'down'); + assert.equal(result.error, 'ECONNREFUSED'); + }); + + test('returns down when Horizon times out', async () => { + const mockFetch = mock.fn(async () => { + await new Promise((resolve) => setTimeout(resolve, 3000)); + return { ok: true }; + }); + global.fetch = mockFetch as unknown as typeof fetch; + + const result = await checkHorizon('https://horizon-testnet.stellar.org', 100); + + assert.equal(result.status, 'down'); + assert.equal(result.error, 'Timeout'); + }); +}); + +describe('determineOverallStatus', () => { + test('returns down when api is down', () => { + const status = determineOverallStatus({ + api: 'down', + database: 'ok', + }); + assert.equal(status, 'down'); + }); + + test('returns down when database is down', () => { + const status = determineOverallStatus({ + api: 'ok', + database: 'down', + }); + assert.equal(status, 'down'); + }); + + test('returns degraded when optional component is down', () => { + const status = determineOverallStatus({ + api: 'ok', + database: 
'ok', + soroban_rpc: 'down', + }); + assert.equal(status, 'degraded'); + }); + + test('returns degraded when any component is degraded', () => { + const status = determineOverallStatus({ + api: 'ok', + database: 'degraded', + }); + assert.equal(status, 'degraded'); + }); + + test('returns ok when all components are ok', () => { + const status = determineOverallStatus({ + api: 'ok', + database: 'ok', + soroban_rpc: 'ok', + horizon: 'ok', + }); + assert.equal(status, 'ok'); + }); +}); + +describe('performHealthCheck', () => { + test('returns healthy status when all components are ok', async () => { + const pool = createMockPool({ rows: [{ result: 1 }] } as QueryResult); + const mockFetch = mock.fn(async () => ({ + ok: true, + json: async () => ({}), + })); + global.fetch = mockFetch as unknown as typeof fetch; + + const config: HealthCheckConfig = { + version: '1.0.0', + database: { pool }, + sorobanRpc: { url: 'https://soroban-test.stellar.org' }, + horizon: { url: 'https://horizon-testnet.stellar.org' }, + }; + + const result = await performHealthCheck(config); + + assert.equal(result.status, 'ok'); + assert.equal(result.version, '1.0.0'); + assert.equal(result.checks.api, 'ok'); + assert.equal(result.checks.database, 'ok'); + assert.equal(result.checks.soroban_rpc, 'ok'); + assert.equal(result.checks.horizon, 'ok'); + assert.ok(result.timestamp); + }); + + test('returns down status when database fails', async () => { + const pool = createMockPool(new Error('Connection refused')); + + const config: HealthCheckConfig = { + version: '1.0.0', + database: { pool }, + }; + + const result = await performHealthCheck(config); + + assert.equal(result.status, 'down'); + assert.equal(result.checks.database, 'down'); + }); + + test('returns degraded status when optional component fails', async () => { + const pool = createMockPool({ rows: [{ result: 1 }] } as QueryResult); + const mockFetch = mock.fn(async () => { + throw new Error('Network error'); + }); + global.fetch = 
mockFetch as unknown as typeof fetch; + + const config: HealthCheckConfig = { + version: '1.0.0', + database: { pool }, + sorobanRpc: { url: 'https://soroban-test.stellar.org' }, + }; + + const result = await performHealthCheck(config); + + assert.equal(result.status, 'degraded'); + assert.equal(result.checks.api, 'ok'); + assert.equal(result.checks.database, 'ok'); + assert.equal(result.checks.soroban_rpc, 'down'); + }); + + test('skips optional components when not configured', async () => { + const pool = createMockPool({ rows: [{ result: 1 }] } as QueryResult); + + const config: HealthCheckConfig = { + database: { pool }, + }; + + const result = await performHealthCheck(config); + + assert.equal(result.status, 'ok'); + assert.equal(result.checks.soroban_rpc, undefined); + assert.equal(result.checks.horizon, undefined); + }); +}); diff --git a/src/services/healthCheck.ts b/src/services/healthCheck.ts new file mode 100644 index 0000000..800f6df --- /dev/null +++ b/src/services/healthCheck.ts @@ -0,0 +1,274 @@ +/** + * Health Check Service + * + * Provides comprehensive health monitoring for all system components. + * Designed for load balancer integration and monitoring systems. 
+ */ + +import type { Pool } from 'pg'; + +export type ComponentStatus = 'ok' | 'degraded' | 'down'; + +export interface HealthCheckResult { + status: ComponentStatus; + version?: string; + timestamp: string; + checks: { + api: ComponentStatus; + database: ComponentStatus; + soroban_rpc?: ComponentStatus; + horizon?: ComponentStatus; + }; +} + +export interface ComponentCheck { + status: ComponentStatus; + responseTime?: number; + error?: string; +} + +export interface HealthCheckConfig { + version?: string; + database: { + pool: Pool; + timeout?: number; + }; + sorobanRpc?: { + url: string; + timeout?: number; + }; + horizon?: { + url: string; + timeout?: number; + }; +} + +const DEFAULT_DB_TIMEOUT = 2000; +const DEFAULT_EXTERNAL_TIMEOUT = 2000; +const DEGRADED_THRESHOLD_DB = 1000; +const DEGRADED_THRESHOLD_EXTERNAL = 2000; + +/** + * Checks database health by executing SELECT 1 + * Uses connection pool for efficiency + */ +export async function checkDatabase( + pool: Pool, + timeoutMs: number = DEFAULT_DB_TIMEOUT +): Promise { + const startTime = Date.now(); + + try { + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error('Database check timeout')), timeoutMs); + }); + + const queryPromise = pool.query('SELECT 1 as result'); + const result = await Promise.race([queryPromise, timeoutPromise]); + + const responseTime = Date.now() - startTime; + + if (result.rows[0]?.result === 1) { + return { + status: responseTime > DEGRADED_THRESHOLD_DB ? 'degraded' : 'ok', + responseTime, + }; + } + + return { + status: 'down', + responseTime, + error: 'Unexpected query result', + }; + } catch (error) { + const responseTime = Date.now() - startTime; + return { + status: 'down', + responseTime, + error: error instanceof Error ? 
error.message : 'Unknown error', + }; + } +} + +/** + * Checks Soroban RPC health via getHealth JSON-RPC method + * Safe to call even if service is unreachable + */ +export async function checkSorobanRpc( + url: string, + timeoutMs: number = DEFAULT_EXTERNAL_TIMEOUT +): Promise { + const startTime = Date.now(); + + try { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + + const response = await fetch(url, { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + jsonrpc: '2.0', + id: 1, + method: 'getHealth', + params: [], + }), + signal: controller.signal, + }); + + clearTimeout(timeoutId); + const responseTime = Date.now() - startTime; + + if (response.ok) { + await response.json(); // Validate JSON response + return { + status: responseTime > DEGRADED_THRESHOLD_EXTERNAL ? 'degraded' : 'ok', + responseTime, + }; + } + + return { + status: 'degraded', + responseTime, + error: `HTTP ${response.status}`, + }; + } catch (error) { + const responseTime = Date.now() - startTime; + const errorMessage = + error instanceof Error + ? error.name === 'AbortError' + ? 'Timeout' + : error.message + : 'Unknown error'; + + return { + status: 'down', + responseTime, + error: errorMessage, + }; + } +} + +/** + * Checks Horizon API health via root endpoint ping + * Safe to call even if service is unreachable + */ +export async function checkHorizon( + url: string, + timeoutMs: number = DEFAULT_EXTERNAL_TIMEOUT +): Promise { + const startTime = Date.now(); + + try { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), timeoutMs); + + const response = await fetch(url, { + method: 'GET', + signal: controller.signal, + }); + + clearTimeout(timeoutId); + const responseTime = Date.now() - startTime; + + if (response.ok) { + return { + status: responseTime > DEGRADED_THRESHOLD_EXTERNAL ? 
'degraded' : 'ok', + responseTime, + }; + } + + return { + status: 'degraded', + responseTime, + error: `HTTP ${response.status}`, + }; + } catch (error) { + const responseTime = Date.now() - startTime; + const errorMessage = + error instanceof Error + ? error.name === 'AbortError' + ? 'Timeout' + : error.message + : 'Unknown error'; + + return { + status: 'down', + responseTime, + error: errorMessage, + }; + } +} + +/** + * Determines overall system status based on component checks + * Critical components (api, database) down → 'down' + * Any component degraded/down → 'degraded' + * All healthy → 'ok' + */ +export function determineOverallStatus(checks: { + api: ComponentStatus; + database: ComponentStatus; + soroban_rpc?: ComponentStatus; + horizon?: ComponentStatus; +}): ComponentStatus { + // Critical components must be 'ok' + if (checks.api === 'down' || checks.database === 'down') { + return 'down'; + } + + // Check for any degraded or down components + const allStatuses = Object.values(checks); + if (allStatuses.includes('degraded') || allStatuses.includes('down')) { + return 'degraded'; + } + + return 'ok'; +} + +/** + * Performs comprehensive health check of all configured components + * Returns detailed status suitable for load balancers and monitoring + */ +export async function performHealthCheck( + config: HealthCheckConfig +): Promise { + const checks: HealthCheckResult['checks'] = { + api: 'ok', // API is healthy if we can respond + database: 'down', // Default to down until checked + }; + + // Check database (critical component) + const dbCheck = await checkDatabase( + config.database.pool, + config.database.timeout + ); + checks.database = dbCheck.status; + + // Check Soroban RPC (optional component) + if (config.sorobanRpc) { + const sorobanCheck = await checkSorobanRpc( + config.sorobanRpc.url, + config.sorobanRpc.timeout + ); + checks.soroban_rpc = sorobanCheck.status; + } + + // Check Horizon (optional component) + if (config.horizon) { + const 
horizonCheck = await checkHorizon( + config.horizon.url, + config.horizon.timeout + ); + checks.horizon = horizonCheck.status; + } + + const overallStatus = determineOverallStatus(checks); + + return { + status: overallStatus, + version: config.version, + timestamp: new Date().toISOString(), + checks, + }; +} diff --git a/tests/integration/health.test.ts b/tests/integration/health.test.ts new file mode 100644 index 0000000..9630b26 --- /dev/null +++ b/tests/integration/health.test.ts @@ -0,0 +1,152 @@ +/** + * Health Check Integration Tests + * + * Tests the health endpoint with real database integration + * Uses pg-mem for in-memory PostgreSQL testing + */ + +import assert from 'node:assert/strict'; +import test, { describe } from 'node:test'; +import request from 'supertest'; +import { createTestDb } from '../helpers/db.js'; +import { createApp } from '../../src/app.js'; +import type { HealthCheckConfig } from '../../src/services/healthCheck.js'; + +describe('GET /api/health - Integration Tests', () => { + test('returns 200 with ok status when database is healthy', async () => { + const testDb = createTestDb(); + + try { + const config: HealthCheckConfig = { + version: '1.0.0', + database: { pool: testDb.pool }, + }; + + const app = createApp({ healthCheckConfig: config }); + const response = await request(app).get('/api/health'); + + assert.equal(response.status, 200); + assert.equal(response.body.status, 'ok'); + assert.equal(response.body.version, '1.0.0'); + assert.equal(response.body.checks.api, 'ok'); + assert.equal(response.body.checks.database, 'ok'); + assert.ok(response.body.timestamp); + } finally { + await testDb.end(); + } + }); + + test('returns 503 when database is down', async () => { + const testDb = createTestDb(); + await testDb.end(); // Close pool to simulate database down + + const config: HealthCheckConfig = { + version: '1.0.0', + database: { pool: testDb.pool }, + }; + + const app = createApp({ healthCheckConfig: config }); + const 
response = await request(app).get('/api/health'); + + assert.equal(response.status, 503); + assert.equal(response.body.status, 'down'); + assert.equal(response.body.checks.database, 'down'); + }); + + test('executes SELECT 1 query successfully', async () => { + const testDb = createTestDb(); + + try { + // Verify SELECT 1 works directly + const result = await testDb.pool.query('SELECT 1 as result'); + assert.equal(result.rows[0].result, 1); + + // Verify health check uses it correctly + const config: HealthCheckConfig = { + database: { pool: testDb.pool }, + }; + + const app = createApp({ healthCheckConfig: config }); + const response = await request(app).get('/api/health'); + + assert.equal(response.status, 200); + assert.equal(response.body.checks.database, 'ok'); + } finally { + await testDb.end(); + } + }); + + test('aggregates status correctly with multiple components', async () => { + const testDb = createTestDb(); + + try { + const config: HealthCheckConfig = { + version: '1.0.0', + database: { pool: testDb.pool }, + // Soroban and Horizon not configured - should be omitted + }; + + const app = createApp({ healthCheckConfig: config }); + const response = await request(app).get('/api/health'); + + assert.equal(response.status, 200); + assert.equal(response.body.status, 'ok'); + assert.equal(response.body.checks.api, 'ok'); + assert.equal(response.body.checks.database, 'ok'); + assert.equal(response.body.checks.soroban_rpc, undefined); + assert.equal(response.body.checks.horizon, undefined); + } finally { + await testDb.end(); + } + }); + + test('returns simple health check when no config provided', async () => { + const app = createApp(); // No health check config + const response = await request(app).get('/api/health'); + + assert.equal(response.status, 200); + assert.equal(response.body.status, 'ok'); + assert.equal(response.body.service, 'callora-backend'); + }); + + test('handles health check errors gracefully without exposing internals', async () => { + 
// Create a pool that will throw an error + const badPool = { + query: async () => { + throw new Error('Internal database error with sensitive info'); + }, + }; + + const config: HealthCheckConfig = { + database: { pool: badPool as any }, + }; + + const app = createApp({ healthCheckConfig: config }); + const response = await request(app).get('/api/health'); + + assert.equal(response.status, 503); + assert.equal(response.body.status, 'down'); + // Should not expose internal error message + assert.ok(!JSON.stringify(response.body).includes('sensitive info')); + }); + + test('completes health check within performance threshold', async () => { + const testDb = createTestDb(); + + try { + const config: HealthCheckConfig = { + database: { pool: testDb.pool, timeout: 500 }, + }; + + const app = createApp({ healthCheckConfig: config }); + const startTime = Date.now(); + const response = await request(app).get('/api/health'); + const duration = Date.now() - startTime; + + assert.equal(response.status, 200); + assert.ok(duration < 500, `Health check took ${duration}ms, expected < 500ms`); + } finally { + await testDb.end(); + } + }); +});