diff --git a/artemis-monitor/Dockerfile b/artemis-monitor/Dockerfile new file mode 100644 index 00000000..10e9f25c --- /dev/null +++ b/artemis-monitor/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.9-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +EXPOSE 3000 + +CMD ["python", "app.py"] diff --git a/artemis-monitor/README.md b/artemis-monitor/README.md new file mode 100644 index 00000000..6d442f75 --- /dev/null +++ b/artemis-monitor/README.md @@ -0,0 +1,307 @@ +# ARTEMIS Monitor + +A comprehensive monitoring and analytics service that provides real-time health status, infrastructure monitoring, and BGP security analytics for ARTEMIS deployments. The service acts as a centralized monitoring system that connects to your ARTEMIS deployment and exposes monitoring data through REST APIs. + +## How It Works + +The service connects to your ARTEMIS deployment and monitors three key areas: + +1. **Service Health**: HTTP health checks on all ARTEMIS services +2. **Container Status**: Docker API integration for uptime and container state monitoring +3. **BGP Security**: GraphQL API queries for real-time BGP updates and hijack detection + +## Docker Setup + +**Prerequisites:** ARTEMIS must be running before starting the monitor. + +```bash +# Start the ARTEMIS monitor (attaches to ARTEMIS network) +docker-compose -f docker-compose.artemis-monitor.yaml up -d + +# View logs +docker-compose -f docker-compose.artemis-monitor.yaml logs -f artemis-monitor + +# Stop the service +docker-compose -f docker-compose.artemis-monitor.yaml down +``` + +The ARTEMIS monitor will be available at `http://artemis-monitor:3001` and automatically connects to the ARTEMIS network for monitoring. + +## Configuration + +### Service Configuration + +Services are configured in `config.yaml`, with environment variable support: +```yaml +services: + configuration: + host: configuration + port: 3000 + endpoint: /health + # ... 
more services +``` + +## API Endpoints & JSON Responses + +### 1. Services Health Status +``` +GET /health/all +``` + +**Response:** +```json +{ + "services": [ + { + "service": "configuration", + "service_status": "running", + "response_time_ms": 123.45, + "status_code": 200, + "url": "http://configuration:3000/health" + }, + { + "service": "detection", + "service_status": "stopped", + "url": "http://detection:3000/health" + } + ], + "summary": { + "status_counts": {"running": 1, "stopped": 1}, + "total_services": 2, + "running_services": 1, + "average_response_time_ms": 106.39 + }, + "success": true, + "overall_status": "partially_running" +} +``` + +### 2. Container Uptime Information +``` +GET /uptime +``` + +**Response:** +```json +{ + "uptimes": { + "configuration": "2 hours", + "detection": "Not running" + }, + "containers": [ + { + "service": "configuration", + "uptime": "2 hours", + "status": "Up 2 hours", + "state": "running", + "image": "artemis_configuration:latest" + } + ], + "summary": { + "total_containers": 3, + "running_containers": 2, + "monitoring_services": 4, + "service_coverage": 75.0 + }, + "success": true, + "overall_status": "partially_running" +} +``` + +### 3. BGP Data Summary +``` +GET /bgp/summary?limit=5 +``` + +**Response:** +```json +{ + "success": true, + "timestamp": 1679404800.123, + "bgp_updates": [ + { + "prefix": "192.168.1.0/24", + "origin_as": 65001, + "peer_asn": 65000, + "type": "A", + "timestamp": "2024-03-21T10:00:00Z" + } + ], + "hijacks": [ + { + "prefix": "10.0.0.0/16", + "hijack_as": 65999, + "active": true, + "time_detected": "2024-03-21T09:45:00Z" + } + ], + "analytics": { + "bgp_updates": { + "total_count": 1, + "announcement_count": 1 + }, + "hijacks": { + "total_count": 1, + "active_count": 1 + }, + "summary": { + "security_status": "warning", + "active_threats": 1 + } + } +} +``` + +### 4. 
BGP Updates Only +``` +GET /bgp/updates?limit=10 +``` + +**Response:** +```json +{ + "success": true, + "bgp_updates": [ + { + "prefix": "192.168.1.0/24", + "origin_as": 65001, + "type": "A", + "timestamp": "2024-03-21T10:00:00Z" + } + ], + "analytics": { + "total_count": 1, + "announcement_count": 1, + "unique_prefixes": 1 + } +} +``` + +### 5. Hijacks Only +``` +GET /bgp/hijacks?limit=10 +``` + +**Response:** +```json +{ + "success": true, + "hijacks": [ + { + "prefix": "10.0.0.0/16", + "hijack_as": 65999, + "active": true, + "time_detected": "2024-03-21T09:45:00Z" + } + ], + "analytics": { + "total_count": 1, + "active_count": 1, + "security_status": "warning" + } +} +``` + +## CLI Usage + +The CLI script requires a `--url` parameter to specify the monitor service URL: + +```bash +# Basic health check (local) +python artemis_monitor.py --url http://localhost:3001 + +# Basic health check (ngrok) +python artemis_monitor.py --url https://your-monitor.ngrok.io + +# JSON output for scripting +python artemis_monitor.py --url http://localhost:3001 --json + +# Periodic monitoring +python artemis_monitor.py --url http://localhost:3001 --periodic 60 + +# Limited runs +python artemis_monitor.py --url http://localhost:3001 --periodic 60 --max-runs 10 + +# Periodic with JSON output +python artemis_monitor.py --url http://localhost:3001 --periodic 30 --json + +# Save results to file +python artemis_monitor.py --url http://localhost:3001 --json > status.json +``` + +**Key Features:** +- **Visual Reports**: Rich console output with colored tables and status indicators +- **Periodic Monitoring**: Configurable intervals (minimum 10 seconds) +- **JSON Export**: Machine-readable output for automation and logging +- **Flexible Control**: Limit monitoring runs or run continuously + +### Output Example +``` +┌─ ARTEMIS Status Report - 2024-03-21 14:30:15 ─┐ +└─────────────────────────────────────────────────┘ + +┌─ SERVICES SUMMARY ─┐ +│ Status │ Count │ Percent │ +│ running │ 12 │ 
85.7% │ +│ stopped │ 2 │ 14.3% │ +│ Total Services │ 14 │ 100.0% │ +└─────────────────────┘ + +┌─ Running Services ─────────────────────────────────────────────┐ +│ Service │ Status │ Code │ Uptime │ Response │ Error │ +│ configuration │ running │ 200 │ 3 days │ 45.2ms │ N/A │ +│ detection │ running │ 200 │ 3 days │ 67.1ms │ N/A │ +│ database │ running │ 200 │ 3 days │ 23.8ms │ N/A │ +│ bgpstreamlive │ running │ 200 │ 3 days │ 89.3ms │ N/A │ +└─────────────────────────────────────────────────────────────────┘ + +┌─ Stopped Services ─────────────────────────────────────────────┐ +│ Service │ Status │ Code │ Uptime │ Response │ Error │ +│ mitigation │ stopped │ N/A │ N/A │ N/A │ Connection...│ +│ notifier │ stopped │ N/A │ N/A │ N/A │ Service un...│ +└─────────────────────────────────────────────────────────────────┘ + +┌─ CONTAINER SUMMARY ─┐ +│ Metric │ Value │ +│ Total Containers │ 15 │ +│ UP (Running) │ 15 │ +│ DOWN (Not Found) │ 0 │ +│ Service Coverage │ 107.1% │ +└─────────────────────┘ + +┌─ Recent BGP Updates ─┐ +│ Timestamp │ Prefix │ Origin ASN │ Type │ Peer ASN │ +│ 2024-03-21 14:29 │ 8.8.8.0/24 │ 15169 │ announce │ 174 │ +│ 2024-03-21 14:28 │ 1.1.1.0/24 │ 13335 │ announce │ 6939 │ +└─────────────────────────────────────────────────────────────────────┘ + +┌─ Recent Hijacks ─┐ +│ Status │ +│ ✓ No hijacks detected - Network appears secure │ +└──────────────────────────────────────────────────┘ + +┌─ BGP NETWORK SUMMARY ─┐ +│ Metric │ Value │ Status │ Details │ +│ Total BGP Updates│ 25 │ Active │ Last 10 records│ +│ Route Announcements│ 20 │ Normal │ 80.0% of updates│ +│ Route Withdrawals│ 5 │ Normal │ 20.0% of updates│ +│ Total Hijacks │ 0 │ SECURE │ 0 active, 0 resolved│ +│ Security Status │ SECURE│ SECURE │ Overall network security│ +└─────────────────────────────────────────────────────────────────┘ + +⚠️ Warning: 2 services are not running +``` + +## Monitoring Integration + +The REST APIs are designed for integration with monitoring systems like 
"""
ARTEMIS Health Monitor Service

A comprehensive monitoring service for ARTEMIS BGP hijack detection system.
Provides health checks, uptime monitoring, and BGP data analytics.
"""
import logging
import os

# jsonify is used by the /health route below; import it once at module
# level instead of re-importing inside the request handler on every call.
from flask import Flask, jsonify

from routes.health import health_bp
from routes.uptime import uptime_bp
from routes.bgp import bgp_bp

# Log to both stdout (picked up by `docker logs`) and a local file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(),
        logging.FileHandler('health-monitor.log')
    ]
)

logger = logging.getLogger(__name__)


def create_app():
    """
    Create and configure the Flask application.

    Registers the health, uptime and BGP blueprints and configures JSON
    output (insertion order preserved, pretty-printed).

    Returns:
        Flask: Configured Flask application instance
    """
    app = Flask(__name__)

    # Keep JSON keys in insertion order and pretty-print responses.
    app.config['JSON_SORT_KEYS'] = False
    app.config['JSONIFY_PRETTYPRINT_REGULAR'] = True

    # Register blueprints
    app.register_blueprint(health_bp)
    app.register_blueprint(uptime_bp)
    app.register_blueprint(bgp_bp)

    logger.info("Flask application created and configured")
    logger.info("Registered blueprints: health, uptime, bgp")

    return app


app = create_app()


@app.route('/health')
def basic_health():
    """Basic liveness check for the monitor service itself."""
    return jsonify({'status': 'running', 'service': 'health-monitor'})


@app.errorhandler(404)
def not_found(error):
    """Handle 404 errors with a JSON body listing the known endpoints."""
    return {
        'error': 'Endpoint not found',
        'message': 'The requested endpoint does not exist',
        'available_endpoints': [
            '/health',
            '/health/all',
            '/uptime',
            '/bgp/summary',
            '/bgp/updates',
            '/bgp/hijacks'
        ]
    }, 404


@app.errorhandler(500)
def internal_error(error):
    """Handle 500 errors: log the exception and return a generic JSON body."""
    logger.error(f"Internal server error: {str(error)}")
    return {
        'error': 'Internal server error',
        'message': 'An unexpected error occurred'
    }, 500


if __name__ == '__main__':
    logger.info("Starting ARTEMIS Health Monitor Service")
    logger.info("Available endpoints:")
    logger.info("  GET /health - Basic health check")
    logger.info("  GET /health/all - Services health summary")
    logger.info("  GET /uptime - Containers uptime summary")
    logger.info("  GET /bgp/summary - BGP data summary")
    logger.info("  GET /bgp/updates - BGP updates only")
    logger.info("  GET /bgp/hijacks - Hijacks updates only")

    # Host/port/debug are overridable via environment for container use.
    host = os.getenv('FLASK_HOST', '0.0.0.0')
    port = int(os.getenv('FLASK_PORT', 3000))
    debug = os.getenv('FLASK_DEBUG', 'False').lower() == 'true'

    logger.info(f"Server starting on {host}:{port} (debug={debug})")
    app.run(host=host, port=port, debug=debug)
b/artemis-monitor/artemis_monitor.py new file mode 100755 index 00000000..0a820d40 --- /dev/null +++ b/artemis-monitor/artemis_monitor.py @@ -0,0 +1,704 @@ +#!/usr/bin/env python +import aiohttp +import asyncio +from rich.console import Console +from rich.table import Table +from rich.panel import Panel +import sys +from rich import box +import time +import argparse +import json + +console = Console() + +MONITOR_BASE_URL = None + +def get_health_api(): + return f"{MONITOR_BASE_URL}/health/all" + +def get_uptime_api(): + return f"{MONITOR_BASE_URL}/uptime" + +def get_bgp_summary_api(): + return f"{MONITOR_BASE_URL}/bgp/summary" + +async def get_health_status(session): + try: + async with session.get(get_health_api(), timeout=10) as response: + if response.status == 200: + return await response.json() + else: + console.print(f"[red]Health API returned status {response.status}[/red]") + return None + except Exception as e: + console.print(f"[red]Error getting health status: {str(e)}[/red]") + return None + +async def get_uptime_status(session): + try: + async with session.get(get_uptime_api(), timeout=10) as response: + if response.status == 200: + return await response.json() + else: + console.print(f"[yellow]Uptime API returned status {response.status}[/yellow]") + return None + except Exception as e: + console.print(f"[yellow]Error getting uptime status: {str(e)}[/yellow]") + return None + +async def get_bgp_data(session): + try: + async with session.get(f"{get_bgp_summary_api()}?limit=10", timeout=15) as response: + if response.status == 200: + return await response.json() + else: + console.print(f"[yellow]BGP API returned status {response.status}[/yellow]") + return None + except Exception as e: + console.print(f"[yellow]Error getting BGP data: {str(e)}[/yellow]") + return None + +def combine_health_and_uptime(health_data, uptime_data): + """Combine health and uptime data for each service""" + if not health_data or 'services' not in health_data: + return 
health_data + + if not uptime_data or 'uptimes' not in uptime_data: + return health_data + + # Add uptime information to health data + for service in health_data['services']: + service_name = service['service'] + if service_name in uptime_data['uptimes']: + uptime_value = uptime_data['uptimes'][service_name] + service['uptime'] = uptime_value if uptime_value != 'Not running' else 'N/A' + else: + service['uptime'] = 'N/A' + + return health_data + +def print_service_table(services, status_filter, title): + """Print a table of services filtered by status""" + filtered_services = [s for s in services if s['service_status'] == status_filter] + if not filtered_services: + return False + + table = Table(show_header=True, header_style="bold", box=box.SIMPLE) + table.add_column("Service", no_wrap=True, width=20) + table.add_column("Status", no_wrap=True, width=12) + table.add_column("Code", no_wrap=True, width=8) + table.add_column("Uptime", no_wrap=True, width=15) + table.add_column("Response (ms)", no_wrap=True, width=15) + table.add_column("Error", width=30) + + for service in filtered_services: + status_color = { + 'running': 'green', + 'unconfigured': 'yellow', + 'unreachable': 'red', + 'error': 'red', + 'stopped': 'red', + 'timeout': 'red' + }.get(service['service_status'], 'white') + + error_text = service.get('error', 'N/A') + if error_text != 'N/A' and len(error_text) > 30: + error_text = error_text[:27] + '...' 
+ + table.add_row( + service['service'], + f"[{status_color}]{service['service_status']}[/{status_color}]", + str(service.get('status_code', 'N/A')), + service.get('uptime', 'N/A'), + str(round(service.get('response_time_ms', 0), 1)), + error_text + ) + + # Determine panel color based on status + panel_color = { + 'running': 'bold green', + 'unconfigured': 'bold yellow', + 'stopped': 'bold red', + 'unreachable': 'bold red', + 'error': 'bold red', + 'timeout': 'bold red' + }.get(status_filter, 'bold white') + + console.print(Panel(title, style=panel_color, box=box.SIMPLE)) + console.print(table) + console.print() + return True + +def print_bgp_updates_table(bgp_updates): + """Print detailed BGP updates table using correct field names""" + if not bgp_updates: + console.print(Panel("[yellow]No Recent BGP Updates Available[/yellow]", style="bold yellow", box=box.SIMPLE)) + console.print() + return + + console.print(Panel("Recent BGP Updates", style="bold blue", box=box.SIMPLE)) + updates_table = Table(show_header=True, header_style="bold", box=box.SIMPLE) + updates_table.add_column("Timestamp", width=20) + updates_table.add_column("Prefix", width=18) + updates_table.add_column("Origin ASN", width=10) + updates_table.add_column("Type", width=8) + updates_table.add_column("Peer ASN", width=10) + updates_table.add_column("Path Length", width=10) + updates_table.add_column("Service", width=20) + + for update in bgp_updates[:10]: # Show first 10 + # Map type codes to readable names + update_type = update.get('type', 'unknown') + if update_type == 'A': + type_display = 'announce' + type_color = 'green' + elif update_type == 'W': + type_display = 'withdraw' + type_color = 'yellow' + else: + type_display = update_type + type_color = 'white' + + # Extract timestamp and format it + timestamp = update.get('timestamp', 'N/A') + if timestamp != 'N/A' and len(timestamp) > 19: + timestamp = timestamp[:19].replace('T', ' ') + + # Calculate AS path length + as_path = 
update.get('as_path', []) + path_length = len(as_path) if as_path else 0 + + # Extract service name (remove the prefix) + service = update.get('service', 'N/A') + if '|' in service: + service = service.split('|')[1] + + updates_table.add_row( + timestamp, + update.get('prefix', 'N/A'), + str(update.get('origin_as', 'N/A')), + f"[{type_color}]{type_display}[/{type_color}]", + str(update.get('peer_asn', 'N/A')), + str(path_length), + service + ) + + console.print(updates_table) + console.print() + +def print_hijacks_table(hijacks): + """Print hijacks table or empty state""" + console.print(Panel("Recent Hijacks", style="bold red", box=box.SIMPLE)) + + if not hijacks: + # Show empty state with informative message + empty_table = Table(show_header=True, header_style="bold", box=box.SIMPLE) + empty_table.add_column("Status", width=60, justify="center") + empty_table.add_row("[green]✓ No hijacks detected - Network appears secure[/green]") + console.print(empty_table) + console.print() + return + + hijacks_table = Table(show_header=True, header_style="bold", box=box.SIMPLE) + hijacks_table.add_column("Timestamp", width=20) + hijacks_table.add_column("Prefix", width=18) + hijacks_table.add_column("Hijacker ASN", width=12) + hijacks_table.add_column("Victim ASN", width=10) + hijacks_table.add_column("Type", width=12) + hijacks_table.add_column("Confidence", width=10) + hijacks_table.add_column("Status", width=10) + + for hijack in hijacks[:10]: # Show first 10 + confidence = hijack.get('confidence', 0) + confidence_color = 'red' if confidence > 0.8 else 'yellow' if confidence > 0.5 else 'white' + + timestamp = hijack.get('timestamp', 'N/A') + if timestamp != 'N/A' and len(timestamp) > 19: + timestamp = timestamp[:19].replace('T', ' ') + + status = hijack.get('status', 'unknown') + status_color = 'red' if status == 'ongoing' else 'green' if status == 'resolved' else 'yellow' + + hijacks_table.add_row( + timestamp, + hijack.get('prefix', 'N/A'), + 
str(hijack.get('hijacker_asn', 'N/A')), + str(hijack.get('victim_asn', 'N/A')), + hijack.get('type', 'N/A'), + f"[{confidence_color}]{confidence:.2f}[/{confidence_color}]", + f"[{status_color}]{status}[/{status_color}]" + ) + + console.print(hijacks_table) + console.print() + +def print_bgp_summary_table(bgp_data): + """Print BGP summary as a separate table at the end using correct field names""" + if not bgp_data or not bgp_data.get('success'): + console.print(Panel("[red]BGP Summary Unavailable[/red]", style="bold red", box=box.SIMPLE)) + console.print() + return + + analytics = bgp_data.get('analytics', {}) + bgp_analytics = analytics.get('bgp_updates', {}) + hijack_analytics = analytics.get('hijacks', {}) + summary_analytics = analytics.get('summary', {}) + + console.print(Panel("[blue bold]BGP NETWORK SUMMARY[/]", style="bold", box=box.SIMPLE, expand=False)) + + # Create comprehensive summary table + summary_table = Table(show_header=True, header_style="bold", box=box.SIMPLE) + summary_table.add_column("Metric", width=25) + summary_table.add_column("Value", width=15) + summary_table.add_column("Status", width=20) + summary_table.add_column("Details", width=25) + + # BGP Updates metrics (using correct field names) + total_updates = bgp_analytics.get('total_count', 0) + announcements = bgp_analytics.get('announcement_count', 0) + withdrawals = bgp_analytics.get('withdrawal_count', 0) + unique_prefixes = bgp_analytics.get('unique_prefixes', 0) + unique_origin_asns = bgp_analytics.get('unique_origin_asns', 0) + unique_peer_asns = bgp_analytics.get('unique_peer_asns', 0) + + # Hijacks metrics + total_hijacks = hijack_analytics.get('total_count', 0) + active_hijacks = hijack_analytics.get('active_count', 0) + resolved_hijacks = hijack_analytics.get('resolved_count', 0) + + # Summary metrics + security_status = summary_analytics.get('security_status', 'unknown') + processing_status = summary_analytics.get('processing_status', 'unknown') + data_freshness = 
summary_analytics.get('data_freshness', 'unknown') + + security_color = { + 'secure': 'green', + 'warning': 'yellow', + 'critical': 'red' + }.get(security_status, 'white') + + # Add rows to summary table + summary_table.add_row( + "Total BGP Updates", + str(total_updates), + "[green]Active[/green]" if total_updates > 0 else "[yellow]Inactive[/yellow]", + f"Last {bgp_data.get('query_limit', 10)} records" + ) + summary_table.add_row( + "Route Announcements", + str(announcements), + "[green]Normal[/green]" if announcements > 0 else "[yellow]None[/yellow]", + f"{(announcements/total_updates*100):.1f}% of updates" if total_updates > 0 else "N/A" + ) + summary_table.add_row( + "Route Withdrawals", + str(withdrawals), + "[yellow]Caution[/yellow]" if withdrawals > announcements else "[green]Normal[/green]", + f"{(withdrawals/total_updates*100):.1f}% of updates" if total_updates > 0 else "N/A" + ) + summary_table.add_row( + "Unique Prefixes", + str(unique_prefixes), + "[green]Diverse[/green]" if unique_prefixes > 1 else "[yellow]Limited[/yellow]", + "Network coverage" + ) + summary_table.add_row( + "Unique Origin ASNs", + str(unique_origin_asns), + "[green]Multi-AS[/green]" if unique_origin_asns > 1 else "[yellow]Single-AS[/yellow]", + "Origin autonomous systems" + ) + summary_table.add_row( + "Unique Peer ASNs", + str(unique_peer_asns), + "[green]Multi-Peer[/green]" if unique_peer_asns > 1 else "[yellow]Single-Peer[/yellow]", + "Peer autonomous systems" + ) + summary_table.add_row( + "Total Hijacks", + str(total_hijacks), + "[red]ALERT[/red]" if total_hijacks > 0 else "[green]SECURE[/green]", + f"{active_hijacks} active, {resolved_hijacks} resolved" + ) + summary_table.add_row( + "Security Status", + security_status.upper(), + f"[{security_color}]{security_status.upper()}[/{security_color}]", + "Overall network security" + ) + summary_table.add_row( + "Processing Status", + processing_status.upper(), + "[green]OK[/green]" if processing_status == 'current' else 
"[yellow]BACKLOG[/yellow]", + "Data processing state" + ) + summary_table.add_row( + "Data Freshness", + data_freshness.upper(), + "[green]FRESH[/green]" if data_freshness == 'recent' else "[yellow]STALE[/yellow]", + "Data recency status" + ) + + console.print(summary_table) + console.print() + +async def run_single_check(json_output=False): + """Run a single health check""" + try: + # Get all data concurrently + async with aiohttp.ClientSession() as session: + health_task = get_health_status(session) + uptime_task = get_uptime_status(session) + bgp_task = get_bgp_data(session) + + health_data, uptime_data, bgp_data = await asyncio.gather( + health_task, uptime_task, bgp_task, return_exceptions=True + ) + + if not health_data or isinstance(health_data, Exception): + if json_output: + error_result = { + "success": False, + "error": "Failed to get health data", + "timestamp": time.time(), + "datetime": time.strftime("%Y-%m-%d %H:%M:%S") + } + print(json.dumps(error_result, indent=2)) + return False + else: + console.print("[red]Failed to get health data - cannot continue[/red]") + return False + + # Combine health and uptime data + if uptime_data and not isinstance(uptime_data, Exception): + health_data = combine_health_and_uptime(health_data, uptime_data) + + # Prepare structured data for JSON output + if json_output: + result = { + "success": True, + "timestamp": time.time(), + "datetime": time.strftime("%Y-%m-%d %H:%M:%S"), + "health": health_data if health_data else {}, + "uptime": uptime_data if uptime_data and not isinstance(uptime_data, Exception) else {}, + "bgp": bgp_data if bgp_data and not isinstance(bgp_data, Exception) else {}, + "summary": { + "total_services": health_data.get('summary', {}).get('total_services', 0), + "running_services": health_data.get('summary', {}).get('running_services', 0), + "service_status_counts": health_data.get('summary', {}).get('status_counts', {}), + "overall_status": health_data.get('overall_status', 'unknown'), + 
"all_services_running": False + } + } + + # Add container summary if available + if uptime_data and not isinstance(uptime_data, Exception): + uptime_summary = uptime_data.get('summary', {}) + result["summary"]["containers"] = { + "total_containers": uptime_summary.get('total_containers', 0), + "running_containers": uptime_summary.get('running_containers', 0), + "monitoring_services": uptime_summary.get('monitoring_services', 0), + "service_coverage": uptime_summary.get('service_coverage', 0), + "missing_services": uptime_summary.get('missing_services', []) + } + + # Add BGP summary if available + if bgp_data and not isinstance(bgp_data, Exception): + analytics = bgp_data.get('analytics', {}) + result["summary"]["bgp"] = { + "total_updates": analytics.get('bgp_updates', {}).get('total_count', 0), + "total_hijacks": analytics.get('hijacks', {}).get('total_count', 0), + "active_hijacks": analytics.get('hijacks', {}).get('active_count', 0), + "security_status": analytics.get('summary', {}).get('security_status', 'unknown'), + "processing_status": analytics.get('summary', {}).get('processing_status', 'unknown') + } + + # Determine if all services are running + summary = health_data.get('summary', {}) + total_services = summary.get('total_services', 0) + running_services = summary.get('running_services', 0) + result["summary"]["all_services_running"] = running_services == total_services and total_services > 0 + + print(json.dumps(result, indent=2)) + return result["summary"]["all_services_running"] + + # Regular console output (existing code) + # Create header + console.print(Panel("ARTEMIS Status Report - " + time.strftime("%Y-%m-%d %H:%M:%S"), + style="bold", + box=box.SIMPLE)) + console.print() + + # Print service status summary + summary = health_data.get('summary', {}) + console.print(Panel("[blue bold]SERVICES SUMMARY[/]", style="bold", box=box.SIMPLE, expand=False)) + + summary_table = Table(show_header=True, header_style="bold", box=box.SIMPLE) + 
summary_table.add_column("Status", width=15) + summary_table.add_column("Count", width=8) + summary_table.add_column("Percent", width=10) + + total_services = summary.get('total_services', 0) + status_counts = summary.get('status_counts', {}) + + for status, count in status_counts.items(): + status_color = { + 'running': 'green', + 'unconfigured': 'yellow', + 'unreachable': 'red', + 'error': 'red', + 'stopped': 'red', + 'timeout': 'red' + }.get(status, 'white') + + percentage = (count / total_services) * 100 if total_services > 0 else 0 + + summary_table.add_row( + f"[{status_color}]{status}[/{status_color}]", + str(count), + f"[{status_color}]{percentage:.1f}%[/{status_color}]" + ) + + # Add total services row + summary_table.add_row( + "[bold]Total Services[/bold]", + f"[bold]{total_services}[/bold]", + "[bold]100.0%[/bold]" + ) + console.print(summary_table) + console.print() + + # Print service details by status category + services = health_data.get('services', []) + + # Show all service categories + print_service_table(services, 'running', "Running Services") + print_service_table(services, 'stopped', "Stopped Services") + print_service_table(services, 'unconfigured', "Unconfigured Services") + print_service_table(services, 'unreachable', "Unreachable Services") + print_service_table(services, 'error', "Error Services") + print_service_table(services, 'timeout', "Timeout Services") + + # Container uptime summary + if uptime_data and not isinstance(uptime_data, Exception): + uptime_summary = uptime_data.get('summary', {}) + if uptime_summary: + console.print(Panel("[blue bold]CONTAINER SUMMARY[/]", style="bold", box=box.SIMPLE, expand=False)) + + container_table = Table(show_header=True, header_style="bold", box=box.SIMPLE) + container_table.add_column("Metric", width=25) + container_table.add_column("Value", width=15) + + total_containers = uptime_summary.get('total_containers', 0) + running_containers = uptime_summary.get('running_containers', 0) + 
monitoring_services = uptime_summary.get('monitoring_services', 0) + + # Calculate down containers + down_containers = monitoring_services - running_containers if monitoring_services > running_containers else 0 + + # Use the service coverage directly from the API response + api_coverage = uptime_summary.get('service_coverage', 0) + actual_coverage = min(api_coverage, 100.0) + + container_table.add_row("Total Containers", str(total_containers)) + container_table.add_row("UP (Running)", f"[green]{running_containers}[/green]") + container_table.add_row("DOWN (Not Found)", f"[red]{down_containers}[/red]" if down_containers > 0 else "[green]0[/green]") + container_table.add_row("Service Coverage", f"[green]{actual_coverage:.1f}%[/green]" if actual_coverage >= 90 else f"[yellow]{actual_coverage:.1f}%[/yellow]") + + console.print(container_table) + console.print() + + # BGP Data Section + if bgp_data and not isinstance(bgp_data, Exception): + # Show BGP updates + bgp_updates = bgp_data.get('bgp_updates', []) + print_bgp_updates_table(bgp_updates) + + # Show hijacks + hijacks = bgp_data.get('hijacks', []) + print_hijacks_table(hijacks) + + # Show BGP summary at the end + print_bgp_summary_table(bgp_data) + + # Final status check + running_services = status_counts.get('running', 0) + if running_services != total_services: + not_running = total_services - running_services + console.print(f"[red]⚠️ Warning: {not_running} services are not running[/red]") + return False + else: + console.print("[green]✅ All services are running[/green]") + return True + + except Exception as e: + if json_output: + error_result = { + "success": False, + "error": str(e), + "timestamp": time.time(), + "datetime": time.strftime("%Y-%m-%d %H:%M:%S") + } + print(json.dumps(error_result, indent=2)) + else: + console.print(f"[red]Fatal Error: {str(e)}[/red]", style="bold red") + return False + + +async def run_periodic_monitoring(interval_seconds, max_runs=None, json_output=False): + """Run periodic 
monitoring with specified interval""" + run_count = 0 + + try: + if not json_output: + console.print(Panel(f"[blue bold]ARTEMIS Periodic Monitoring Started[/blue bold]\n" + f"Interval: {interval_seconds} seconds\n" + f"Max runs: {'Unlimited' if max_runs is None else max_runs}\n" + f"Press Ctrl+C to stop", + style="bold blue", box=box.SIMPLE)) + console.print() + + results = [] + + while True: + run_count += 1 + + if not json_output: + console.print(f"[cyan]--- Check #{run_count} ---[/cyan]") + + success = await run_single_check(json_output) + + # For JSON output, collect results + if json_output: + # The result is already printed by run_single_check, just track success + pass + + # Exit if max runs reached + if max_runs and run_count >= max_runs: + if not json_output: + console.print(f"\n[yellow]Completed {max_runs} monitoring runs[/yellow]") + break + + if not json_output: + # Show next check info + next_check = time.strftime("%H:%M:%S", time.localtime(time.time() + interval_seconds)) + console.print(f"\n[dim]Next check at {next_check} (in {interval_seconds}s). 
Press Ctrl+C to stop.[/dim]") + console.print("=" * 80) + console.print() + + # Wait for next interval + await asyncio.sleep(interval_seconds) + + except KeyboardInterrupt: + if not json_output: + console.print(f"\n[yellow]Monitoring stopped by user after {run_count} checks[/yellow]") + except Exception as e: + if json_output: + error_result = { + "success": False, + "error": f"Periodic monitoring error: {str(e)}", + "timestamp": time.time(), + "datetime": time.strftime("%Y-%m-%d %H:%M:%S"), + "run_count": run_count + } + print(json.dumps(error_result, indent=2)) + else: + console.print(f"\n[red]Periodic monitoring error: {str(e)}[/red]") + + +def parse_arguments(): + """Parse command line arguments""" + parser = argparse.ArgumentParser( + description="ARTEMIS Health Monitor - Monitor ARTEMIS BGP security services", + formatter_class=argparse.RawDescriptionHelpFormatter, + epilog=""" +Examples: + %(prog)s # Use default http://localhost:3001 + %(prog)s --url http://localhost:3001 # Explicit local URL + %(prog)s --url localhost:3001 --periodic 30 # Periodic monitoring + %(prog)s --url localhost:3001 --periodic 60 --max-runs 10 # Limited runs + %(prog)s --json # JSON output + """ + ) + + parser.add_argument( + '--url', + type=str, + help='Monitor service URL' + ) + + parser.add_argument( + '--periodic', '-p', + type=int, + metavar='SECONDS', + help='Run periodic monitoring with specified interval in seconds (e.g., 30, 60, 300)' + ) + + parser.add_argument( + '--max-runs', '-m', + type=int, + metavar='COUNT', + help='Maximum number of monitoring runs (only with --periodic). 
async def main():
    """CLI entry point: parse and validate arguments, then dispatch to a
    single check or to periodic monitoring.

    Exits with code 1 on argument-validation failure or a failed single
    check, 0 on a successful single check.
    """
    args = parse_arguments()

    # NOTE(review): parse_arguments' epilog claims a default of
    # http://localhost:3001, but --url is mandatory here — confirm which
    # behavior is intended.
    if not args.url:
        console.print("[red]Error: --url parameter is required[/red]")
        console.print("[yellow]Example: python3 artemis_monitor.py --url http://localhost:3001[/yellow]")
        sys.exit(1)

    # Set API endpoints from command line argument
    global MONITOR_BASE_URL
    MONITOR_BASE_URL = args.url

    # Validate arguments.
    # NOTE(review): the truthiness test means `--max-runs 0` slips past this
    # validation — confirm 0 should be rejected or treated as "unset".
    if args.max_runs and not args.periodic:
        if args.json:
            error_result = {
                "success": False,
                "error": "--max-runs can only be used with --periodic",
                "timestamp": time.time(),
                "datetime": time.strftime("%Y-%m-%d %H:%M:%S")
            }
            print(json.dumps(error_result, indent=2))
        else:
            console.print("[red]Error: --max-runs can only be used with --periodic[/red]")
        sys.exit(1)

    if args.periodic:
        # Enforce a minimum polling interval so the monitor is not hammered.
        if args.periodic < 10:
            if args.json:
                error_result = {
                    "success": False,
                    "error": "Periodic interval must be at least 10 seconds",
                    "timestamp": time.time(),
                    "datetime": time.strftime("%Y-%m-%d %H:%M:%S")
                }
                print(json.dumps(error_result, indent=2))
            else:
                console.print("[red]Error: Periodic interval must be at least 10 seconds[/red]")
            sys.exit(1)

        # Run periodic monitoring (returns when max runs reached or Ctrl+C)
        await run_periodic_monitoring(args.periodic, args.max_runs, args.json)
    else:
        # Run single check
        success = await run_single_check(args.json)
        sys.exit(0 if success else 1)


if __name__ == '__main__':
    asyncio.run(main())
async def authenticate(session):
    """
    Authenticate with ARTEMIS and return a JWT token.

    Performs a two-step handshake: (1) credential login to obtain a session
    id, (2) JWT retrieval using that session id as a cookie.

    Args:
        session: aiohttp.ClientSession used for both requests.

    Returns:
        JWT access-token string, or None on any failure.
    """
    try:
        # Internal Docker traffic terminates at nginx with a self-signed
        # certificate, so certificate verification is disabled on purpose.
        tls = ssl.create_default_context()
        tls.check_hostname = False
        tls.verify_mode = ssl.CERT_NONE

        base_headers = {
            'x-artemis-api-key': API_KEY,
            'Content-Type': 'application/json'
        }
        credentials = {
            'email': DEFAULT_EMAIL,
            'password': DEFAULT_PASS
        }

        # Step 1: credential login -> session id
        async with session.post(LOGIN_URL, json=credentials, headers=base_headers, ssl=tls) as resp:
            if resp.status != 200:
                logger.error(f"Step 1 - Login failed with status: {resp.status}")
                logger.error(f"Response: {await resp.text()}")
                return None
            session_id = (await resp.json()).get('user', {}).get('sessionId')

        if not session_id:
            logger.error("No session ID in login response")
            return None
        logger.info("Login successful")

        # Step 2: exchange session cookie for a JWT
        cookie_headers = dict(base_headers, Cookie=f'sid={session_id}')
        async with session.get(JWT_URL, headers=cookie_headers, ssl=tls) as resp:
            if resp.status != 200:
                logger.error(f"JWT request failed with status: {resp.status}")
                logger.error(f"Response: {await resp.text()}")
                return None
            token_payload = await resp.json()

        if 'accessToken' in token_payload:
            logger.info("JWT token obtained successfully")
            return token_payload['accessToken']
        logger.error("Step 2 - No accessToken in response")
        return None

    except Exception as e:
        logger.error(f"Authentication error: {str(e)}")
        return None
logger.error(f"Response: {response_text}") + return None + + except Exception as e: + logger.error(f"Authentication error: {str(e)}") + return None diff --git a/artemis-monitor/config.yaml b/artemis-monitor/config.yaml new file mode 100644 index 00000000..eba574c1 --- /dev/null +++ b/artemis-monitor/config.yaml @@ -0,0 +1,57 @@ +services: + configuration: + host: configuration + port: 3000 + endpoint: /health + database: + host: database + port: 3000 + endpoint: /health + detection: + host: detection + port: 3000 + endpoint: /health + prefixtree: + host: prefixtree + port: 3000 + endpoint: /health + mitigation: + host: mitigation + port: 3000 + endpoint: /health + notifier: + host: notifier + port: 3000 + endpoint: /health + autoignore: + host: autoignore + port: 3000 + endpoint: /health + autostarter: + host: autostarter + port: 3000 + endpoint: /health + fileobserver: + host: fileobserver + port: 3000 + endpoint: /health + riperistap: + host: riperistap + port: 3000 + endpoint: /health + bgpstreamlivetap: + host: bgpstreamlivetap + port: 3000 + endpoint: /health + bgpstreamkafkatap: + host: bgpstreamkafkatap + port: 3000 + endpoint: /health + bgpstreamhisttap: + host: bgpstreamhisttap + port: 3000 + endpoint: /health + exabgptap: + host: exabgptap + port: 3000 + endpoint: /health diff --git a/artemis-monitor/requirements.txt b/artemis-monitor/requirements.txt new file mode 100644 index 00000000..a06a15ba --- /dev/null +++ b/artemis-monitor/requirements.txt @@ -0,0 +1,7 @@ +flask[async]==2.3.3 +aiohttp==3.8.5 +python-json-logger==2.0.7 +rich==13.7.0 +asyncio==3.4.3 +docker==7.1.0 +PyYAML==6.0.1 diff --git a/artemis-monitor/routes/__init__.py b/artemis-monitor/routes/__init__.py new file mode 100644 index 00000000..dea50929 --- /dev/null +++ b/artemis-monitor/routes/__init__.py @@ -0,0 +1,3 @@ +""" +Routes package for health monitor API endpoints. 
bgp_bp = Blueprint('bgp', __name__)


def _run_bgp_route(fetcher, request_label, endpoint_label, error_prefix):
    """
    Shared implementation for the three BGP routes (summary/updates/hijacks).

    The three original handlers were copy-paste identical apart from the
    fetcher coroutine and log/error wording, so the boilerplate lives here.

    Args:
        fetcher: async callable taking a record limit, returning a JSON-able dict
        request_label: label used in the "requested with limit" log line
        endpoint_label: label used in the error log line
        error_prefix: prefix of the client-facing error message

    Returns:
        Flask JSON response; (body, 500) tuple on failure.
    """
    try:
        # Limit clamping/validation happens in the service layer.
        limit = request.args.get('limit', 10, type=int)
        logger.info(f"{request_label} requested with limit: {limit}")
        return jsonify(asyncio.run(fetcher(limit)))
    except Exception as e:
        logger.error(f"Error in {endpoint_label}: {str(e)}")
        return jsonify({
            'error': f'{error_prefix}: {str(e)}',
            'success': False
        }), 500


@bgp_bp.route('/bgp/summary')
def get_bgp_summary():
    """Get aggregated BGP data (updates and hijacks) with enhanced analytics."""
    return _run_bgp_route(get_bgp_summary_data, 'BGP summary', 'BGP summary endpoint', 'BGP summary failed')


@bgp_bp.route('/bgp/updates')
def get_bgp_updates():
    """Get BGP updates data with analytics."""
    return _run_bgp_route(get_bgp_updates_only, 'BGP updates', 'BGP updates endpoint', 'BGP updates fetch failed')


@bgp_bp.route('/bgp/hijacks')
def get_hijacks():
    """Get hijacks data with analytics."""
    return _run_bgp_route(get_hijacks_only, 'Hijacks', 'hijacks endpoint', 'Hijacks fetch failed')
health_bp = Blueprint('health', __name__)


@health_bp.route('/health/all')
def get_all_services_health():
    """Get health status for all registered ARTEMIS services."""
    try:
        # Configuration is loaded per request so config.yaml edits apply live.
        config = load_config()
        if not config:
            logger.error("Failed to load configuration")
            return jsonify({'error': 'Configuration not available', 'success': False}), 500

        services = get_services_map(config)
        if not services:
            logger.warning("No services configured")
            return jsonify({'error': 'No services configured', 'success': False}), 500

        logger.info(f"Checking health of {len(services)} services")
        payload = asyncio.run(check_all_services_health(services))
        payload['success'] = True

        stats = payload.get('summary', {})
        up = stats.get('running_services', 0)
        total = stats.get('total_services', 0)

        # Collapse per-service results into a single overall verdict.
        if total > 0 and up == total:
            status = 'all_running'
        elif up > 0:
            status = 'partially_running'
        else:
            status = 'none_running'
        payload['overall_status'] = status

        logger.info(f"Health check completed: {status}")
        return jsonify(payload)

    except Exception as e:
        logger.error(f"Error in health check: {str(e)}")
        return jsonify({'error': f'Health check failed: {str(e)}', 'success': False}), 500
uptime_bp = Blueprint('uptime', __name__)


@uptime_bp.route('/uptime')
def get_uptime():
    """Get uptime information for ARTEMIS containers."""
    try:
        config = load_config()
        # Uptime can still be reported without a config; services only filter it.
        services = get_services_map(config) if config else None

        logger.info("Fetching ARTEMIS container uptime data")
        payload = get_artemis_uptime(services)
        if payload is None:
            logger.error("Failed to get uptime data")
            return jsonify({'error': 'Failed to retrieve uptime data', 'success': False}), 500

        payload['success'] = True

        stats = payload.get('summary', {})
        up = stats.get('running_containers', 0)
        total = stats.get('total_containers', 0)

        # Derive a single overall state from the container counts.
        if total == 0:
            status = 'no_containers'
        elif up == total:
            status = 'all_running'
        elif up > 0:
            status = 'partially_running'
        else:
            status = 'all_stopped'
        payload['overall_status'] = status

        logger.info(f"Uptime check completed: {status} ({up}/{total} running)")
        return jsonify(payload)

    except Exception as e:
        logger.error(f"Error getting uptime: {str(e)}")
        return jsonify({'error': f'Uptime check failed: {str(e)}', 'success': False}), 500
def analyze_bgp_updates(bgp_updates: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Analyze BGP updates and provide comprehensive summary statistics.

    Args:
        bgp_updates: List of BGP update records

    Returns:
        Dictionary containing detailed analytics
    """
    if not bgp_updates:
        return _get_empty_bgp_analytics()

    seen_prefixes: set = set()
    seen_origins: set = set()
    seen_peers: set = set()
    seen_services: set = set()
    seen_times: list = []
    type_counts = {'A': 0, 'W': 0}  # announcements / withdrawals
    handled_total = 0

    for record in bgp_updates:
        try:
            kind = record.get('type')
            if kind in type_counts:
                type_counts[kind] += 1

            if record.get('handled'):
                handled_total += 1

            # Collect distinct values for the "unique_*" counters.
            for key, bucket in (
                ('prefix', seen_prefixes),
                ('origin_as', seen_origins),
                ('peer_asn', seen_peers),
                ('service', seen_services),
            ):
                value = record.get(key)
                if value:
                    bucket.add(value)

            ts = record.get('timestamp')
            if ts:
                seen_times.append(ts)

        except Exception as e:
            # A malformed record should not abort the whole analysis.
            logger.warning(f"Error processing BGP update: {str(e)}")
            continue

    analytics = {
        'total_count': len(bgp_updates),
        'announcement_count': type_counts['A'],
        'withdrawal_count': type_counts['W'],
        'unique_prefixes': len(seen_prefixes),
        'unique_origin_asns': len(seen_origins),
        'unique_peer_asns': len(seen_peers),
        'handled_count': handled_total,
        'unhandled_count': len(bgp_updates) - handled_total,
        'services': sorted(seen_services),
        'latest_timestamp': max(seen_times) if seen_times else None,
        'oldest_timestamp': min(seen_times) if seen_times else None
    }

    logger.debug(f"BGP analytics: {analytics['total_count']} updates analyzed")
    return analytics
def analyze_hijacks(hijacks: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Analyze hijacks and provide comprehensive summary statistics.

    Args:
        hijacks: List of hijack records

    Returns:
        Dictionary containing detailed analytics
    """
    if not hijacks:
        return _get_empty_hijack_analytics()

    # One counter per boolean flag on a hijack record.
    flag_names = ('active', 'resolved', 'ignored', 'withdrawn',
                  'under_mitigation', 'dormant', 'seen')
    flag_counts = {name: 0 for name in flag_names}

    seen_prefixes: set = set()
    seen_hijackers: set = set()
    detection_times: list = []

    for hijack in hijacks:
        try:
            for flag in flag_names:
                if hijack.get(flag):
                    flag_counts[flag] += 1

            if hijack.get('prefix'):
                seen_prefixes.add(hijack['prefix'])
            if hijack.get('hijack_as'):
                seen_hijackers.add(hijack['hijack_as'])
            if hijack.get('time_detected'):
                detection_times.append(hijack['time_detected'])

        except Exception as e:
            # Skip malformed records instead of failing the whole analysis.
            logger.warning(f"Error processing hijack: {str(e)}")
            continue

    analytics = {
        'total_count': len(hijacks),
        'active_count': flag_counts['active'],
        'resolved_count': flag_counts['resolved'],
        'ignored_count': flag_counts['ignored'],
        'withdrawn_count': flag_counts['withdrawn'],
        'under_mitigation_count': flag_counts['under_mitigation'],
        'dormant_count': flag_counts['dormant'],
        'seen_count': flag_counts['seen'],
        'unique_prefixes': len(seen_prefixes),
        'unique_hijacker_asns': len(seen_hijackers),
        'latest_detection': max(detection_times) if detection_times else None,
        'oldest_detection': min(detection_times) if detection_times else None
    }

    logger.debug(f"Hijack analytics: {analytics['total_count']} hijacks analyzed")
    return analytics
def generate_summary_analytics(
    bgp_analytics: Dict[str, Any],
    hijack_analytics: Dict[str, Any]
) -> Dict[str, Any]:
    """
    Generate overall summary analytics across both BGP updates and hijacks.

    Args:
        bgp_analytics: BGP updates analytics
        hijack_analytics: Hijacks analytics

    Returns:
        Dictionary containing overall summary
    """
    updates_total = bgp_analytics.get('total_count', 0)
    hijacks_total = hijack_analytics.get('total_count', 0)
    has_data = updates_total > 0 or hijacks_total > 0

    return {
        'total_bgp_updates': updates_total,
        'total_hijacks': hijacks_total,
        'active_threats': hijack_analytics.get('active_count', 0),
        'unhandled_updates': bgp_analytics.get('unhandled_count', 0),
        'data_freshness': 'recent' if has_data else 'no_data',
        'security_status': _determine_security_status(hijack_analytics),
        'processing_status': _determine_processing_status(bgp_analytics)
    }


def _get_empty_bgp_analytics() -> Dict[str, Any]:
    """Return the zeroed BGP-updates analytics structure."""
    return {
        'total_count': 0,
        'announcement_count': 0,
        'withdrawal_count': 0,
        'unique_prefixes': 0,
        'unique_origin_asns': 0,
        'unique_peer_asns': 0,
        'handled_count': 0,
        'unhandled_count': 0,
        'services': [],
        'latest_timestamp': None,
        'oldest_timestamp': None
    }


def _get_empty_hijack_analytics() -> Dict[str, Any]:
    """Return the zeroed hijack analytics structure."""
    return {
        'total_count': 0,
        'active_count': 0,
        'resolved_count': 0,
        'ignored_count': 0,
        'withdrawn_count': 0,
        'under_mitigation_count': 0,
        'dormant_count': 0,
        'seen_count': 0,
        'unique_prefixes': 0,
        'unique_hijacker_asns': 0,
        'latest_detection': None,
        'oldest_detection': None
    }


def _determine_security_status(hijack_analytics: Dict[str, Any]) -> str:
    """Determine overall security status based on hijack data."""
    active = hijack_analytics.get('active_count', 0)
    # Thresholds: >5 active hijacks escalates warning -> critical.
    if active > 5:
        return 'critical'
    if active > 0:
        return 'warning'
    if hijack_analytics.get('under_mitigation_count', 0) > 0:
        return 'monitoring'
    return 'secure'


def _determine_processing_status(bgp_analytics: Dict[str, Any]) -> str:
    """Determine processing status based on BGP update handling."""
    total = bgp_analytics.get('total_count', 0)
    if total == 0:
        return 'no_data'

    unhandled_ratio = bgp_analytics.get('unhandled_count', 0) / total
    if unhandled_ratio > 0.8:
        return 'backlog'
    if unhandled_ratio > 0.5:
        return 'delayed'
    return 'current'
async def get_bgp_summary_data(limit: int = 10) -> Dict[str, Any]:
    """
    Get comprehensive BGP summary including updates, hijacks, and analytics.

    Args:
        limit: Maximum number of records to fetch for each data type
            (clamped below to the range 1..100).

    Returns:
        Dictionary with 'success': True, the raw 'bgp_updates'/'hijacks'
        lists and an 'analytics' section; on failure, a dict with
        'success': False, an 'error' message and timing metadata.
    """
    start_time = time.time()

    try:
        # Validate limit parameter (clamp to 1..100)
        limit = max(1, min(limit, 100))

        async with aiohttp.ClientSession() as session:
            # Authenticate and get JWT token
            jwt_token = await authenticate(session)
            if not jwt_token:
                logger.error("Authentication failed for BGP data fetch")
                return {
                    'error': 'Authentication failed',
                    'success': False,
                    'timestamp': time.time()
                }

            logger.info(f"Fetching BGP data with limit: {limit}")
            updates_task = fetch_bgp_updates(session, jwt_token, limit)
            hijacks_task = fetch_hijacks(session, jwt_token, limit)

            # Wait for both requests to complete.
            # NOTE: return_exceptions=True means one failed leg degrades to
            # an empty list below instead of failing the whole summary.
            updates_result, hijacks_result = await asyncio.gather(
                updates_task, hijacks_task, return_exceptions=True
            )

            bgp_updates = []
            hijacks = []

            if isinstance(updates_result, Exception):
                logger.error(f"Failed to fetch BGP updates: {str(updates_result)}")
            elif updates_result and 'data' in updates_result:
                bgp_updates = updates_result['data'].get('view_bgpupdates', [])
                logger.info(f"Fetched {len(bgp_updates)} BGP updates")
            else:
                logger.warning("No BGP updates data received")

            if isinstance(hijacks_result, Exception):
                logger.error(f"Failed to fetch hijacks: {str(hijacks_result)}")
            elif hijacks_result and 'data' in hijacks_result:
                hijacks = hijacks_result['data'].get('view_hijacks', [])
                logger.info(f"Fetched {len(hijacks)} hijacks")
            else:
                logger.warning("No hijacks data received")

            # Generate analytics (empty inputs yield zeroed analytics)
            bgp_analytics = analyze_bgp_updates(bgp_updates)
            hijack_analytics = analyze_hijacks(hijacks)
            summary_analytics = generate_summary_analytics(bgp_analytics, hijack_analytics)

            execution_time = (time.time() - start_time) * 1000  # Convert to milliseconds

            result = {
                'success': True,
                'timestamp': time.time(),
                'query_limit': limit,
                'execution_time_ms': round(execution_time, 2),
                'bgp_updates': bgp_updates,
                'hijacks': hijacks,
                'analytics': {
                    'bgp_updates': bgp_analytics,
                    'hijacks': hijack_analytics,
                    'summary': summary_analytics
                }
            }

            logger.info(f"BGP summary completed in {execution_time:.2f}ms")
            return result

    except Exception as e:
        execution_time = (time.time() - start_time) * 1000
        logger.error(f"Error getting BGP summary: {str(e)}")
        return {
            'error': f'Failed to get BGP summary: {str(e)}',
            'success': False,
            'timestamp': time.time(),
            'execution_time_ms': round(execution_time, 2)
        }
async def get_bgp_updates_only(limit: int = 10) -> Dict[str, Any]:
    """
    Get only BGP updates data with analytics.

    Args:
        limit: Maximum number of updates to fetch (clamped to 1..100)

    Returns:
        Dictionary containing BGP updates and analytics; on failure a dict
        with 'success': False, an 'error' message and a 'timestamp'.
    """
    try:
        # Clamp the limit to the same window as get_bgp_summary_data.
        limit = max(1, min(limit, 100))

        async with aiohttp.ClientSession() as session:
            jwt_token = await authenticate(session)
            if not jwt_token:
                # Include a timestamp so error payloads match the shape
                # returned by get_bgp_summary_data (consistency fix).
                return {
                    'error': 'Authentication failed',
                    'success': False,
                    'timestamp': time.time()
                }

            updates_result = await fetch_bgp_updates(session, jwt_token, limit)

            if not updates_result or 'data' not in updates_result:
                return {
                    'error': 'No BGP updates data received',
                    'success': False,
                    'timestamp': time.time()
                }

            bgp_updates = updates_result['data'].get('view_bgpupdates', [])
            analytics = analyze_bgp_updates(bgp_updates)

            return {
                'success': True,
                'timestamp': time.time(),
                'query_limit': limit,
                'bgp_updates': bgp_updates,
                'analytics': analytics
            }

    except Exception as e:
        logger.error(f"Error getting BGP updates: {str(e)}")
        return {
            'error': f'Failed to get BGP updates: {str(e)}',
            'success': False,
            'timestamp': time.time()
        }
async def get_hijacks_only(limit: int = 10) -> Dict[str, Any]:
    """
    Get only hijacks data with analytics.

    Args:
        limit: Maximum number of hijacks to fetch (clamped to 1..100)

    Returns:
        Dictionary containing hijacks and analytics; on failure a dict with
        'success': False, an 'error' message and a 'timestamp'.
    """
    try:
        # Clamp the limit to the same window as get_bgp_summary_data.
        limit = max(1, min(limit, 100))

        async with aiohttp.ClientSession() as session:
            jwt_token = await authenticate(session)
            if not jwt_token:
                # Include a timestamp so error payloads match the shape
                # returned by get_bgp_summary_data (consistency fix).
                return {
                    'error': 'Authentication failed',
                    'success': False,
                    'timestamp': time.time()
                }

            hijacks_result = await fetch_hijacks(session, jwt_token, limit)

            if not hijacks_result or 'data' not in hijacks_result:
                return {
                    'error': 'No hijacks data received',
                    'success': False,
                    'timestamp': time.time()
                }

            hijacks = hijacks_result['data'].get('view_hijacks', [])
            analytics = analyze_hijacks(hijacks)

            return {
                'success': True,
                'timestamp': time.time(),
                'query_limit': limit,
                'hijacks': hijacks,
                'analytics': analytics
            }

    except Exception as e:
        logger.error(f"Error getting hijacks: {str(e)}")
        return {
            'error': f'Failed to get hijacks: {str(e)}',
            'success': False,
            'timestamp': time.time()
        }
async def check_service_health(
    session: aiohttp.ClientSession,
    service_name: str,
    url: str,
    timeout_seconds: int = 5
) -> Dict[str, Any]:
    """
    Check health status of a single service.

    Args:
        session: Async HTTP client session
        service_name: Name of the service being checked
        url: Health check URL for the service
        timeout_seconds: Total request timeout in seconds (default 5,
            preserving the previous hard-coded value)

    Returns:
        Dictionary containing service health information
    """
    try:
        start_time = time.time()
        # Use an explicit ClientTimeout: passing a bare number as timeout=
        # is deprecated in aiohttp.
        timeout = aiohttp.ClientTimeout(total=timeout_seconds)
        async with session.get(url, timeout=timeout) as response:
            response_time = (time.time() - start_time) * 1000  # milliseconds

            try:
                response_json = await response.json()
                service_status = response_json.get('status', 'unknown')
            except Exception:
                # If JSON parsing fails, determine status from HTTP code
                service_status = 'running' if response.status == 200 else 'error'

            result = {
                'service': service_name,
                'service_status': service_status,
                'response_time_ms': round(response_time, 2),
                'status_code': response.status,
                'url': url
            }

            logger.debug(f"Health check for {service_name}: {service_status} ({response_time:.2f}ms)")
            return result

    except asyncio.TimeoutError:
        logger.warning(f"Health check timeout for {service_name}")
        return {
            'service': service_name,
            'service_status': 'timeout',
            'error': f'Request timed out after {timeout_seconds} seconds',
            'url': url
        }
    except Exception as e:
        logger.error(f"Error checking {service_name}: {str(e)}")
        return {
            'service': service_name,
            'service_status': 'unreachable',
            'error': str(e),
            'url': url
        }
async def check_all_services_health(services: Dict[str, str]) -> Dict[str, Any]:
    """
    Check health status for all services concurrently.

    Args:
        services: Dictionary mapping service names to health check URLs

    Returns:
        Dictionary containing all service health results and summary
    """
    if not services:
        logger.warning("No services registered for health checking")
        return {
            'services': [],
            'summary': {
                'status_counts': {},
                'total_services': 0,
                'running_services': 0,
                'average_response_time_ms': 0
            }
        }

    async with aiohttp.ClientSession() as session:
        tasks = [
            check_service_health(session, service_name, url)
            for service_name, url in services.items()
        ]

        # Execute all health checks concurrently; exceptions are returned
        # rather than raised so one failure cannot abort the batch.
        raw_results = await asyncio.gather(*tasks, return_exceptions=True)

    # Pair each outcome with its service. Task order matches dict order, so
    # zip replaces the previous O(n) list(services.keys())[i] lookup per
    # failed entry.
    results = []
    for (service_name, url), outcome in zip(services.items(), raw_results):
        if isinstance(outcome, Exception):
            logger.error(f"Health check failed for {service_name}: {str(outcome)}")
            results.append({
                'service': service_name,
                'service_status': 'error',
                'error': str(outcome),
                'url': url
            })
        else:
            results.append(outcome)

    # Generate summary statistics
    summary = _generate_health_summary(results)

    logger.info(f"Health check completed: {summary['running_services']}/{summary['total_services']} services healthy")

    return {
        'services': results,
        'summary': summary
    }
def _generate_health_summary(results: List[Dict[str, Any]]) -> Dict[str, Any]:
    """
    Generate summary statistics from health check results.

    Args:
        results: List of health check results

    Returns:
        Dictionary containing summary statistics
    """
    if not results:
        return {
            'status_counts': {},
            'total_services': 0,
            'running_services': 0,
            'average_response_time_ms': 0
        }

    status_counts: Dict[str, int] = {}
    latencies = []
    running_total = 0

    for entry in results:
        status = entry.get('service_status', 'unknown')
        # Normalize compound statuses like "running, degraded" to the
        # first token before counting.
        if ',' in status:
            status = status.split(',')[0].strip()

        status_counts[status] = status_counts.get(status, 0) + 1
        if status.lower() == 'running':
            running_total += 1

        latency = entry.get('response_time_ms')
        if latency is not None:
            latencies.append(latency)

    mean_latency = sum(latencies) / len(latencies) if latencies else 0

    return {
        'status_counts': status_counts,
        'total_services': len(results),
        'running_services': running_total,
        'average_response_time_ms': round(mean_latency, 2)
    }
def get_container_uptime(container: Dict[str, Any]) -> Tuple[Optional[str], Optional[str]]:
    """
    Extract uptime from a container if it's running.

    Args:
        container: Docker container information dictionary

    Returns:
        Tuple of (service_name, uptime) or (None, None) if extraction fails
    """
    try:
        names = container.get('Names', [])
        if not names:
            logger.warning("Container has no names")
            return None, None

        # Docker prefixes names with '/'; strip it, then reduce e.g.
        # "artemis_detection_1" to the bare service name "detection".
        raw_name = names[0].lstrip('/')
        service_name = raw_name.replace('artemis_', '').split('_')[0]

        state = container.get('State', '')
        status = container.get('Status', '')

        if state != 'running' or not status.startswith('Up '):
            logger.debug(f"Container {raw_name} not running: state={state}, status={status}")
            return service_name, None

        # Status looks like "Up 2 hours (healthy)"; keep only the duration.
        uptime = status.replace('Up ', '').split(' (')[0]
        logger.debug(f"Container {raw_name} uptime: {uptime}")
        return service_name, uptime

    except Exception as e:
        fallback = container.get('Names', ['unknown'])[0] if container.get('Names') else 'unknown'
        logger.error(f"Failed to get uptime for container {fallback}: {str(e)}")
        return None, None
def get_artemis_uptime(services: Optional[Dict[str, str]] = None) -> Optional[Dict[str, Any]]:
    """
    Get uptime of ARTEMIS containers using Docker Python API.

    Args:
        services: Optional dictionary of configured services; when given,
            the returned 'uptimes' map is restricted to these service names.

    Returns:
        Dictionary with 'uptimes' (service -> uptime string), 'containers'
        (per-container detail dicts) and 'summary', or None if the Docker
        daemon cannot be reached or containers cannot be listed.
    """
    try:
        # Connect to Docker daemon via the mounted local socket
        try:
            client = docker.APIClient(base_url='unix://var/run/docker.sock')
            logger.debug("Connected to Docker daemon")
        except Exception as e:
            logger.error(f"Failed to connect to Docker daemon: {str(e)}")
            return None

        # List only containers whose name contains the "artemis_" prefix
        try:
            containers = client.containers(filters={'name': 'artemis_'})
            logger.info(f"Found {len(containers)} ARTEMIS containers")
        except Exception as e:
            logger.error(f"Failed to list containers: {str(e)}")
            return None

        # Process containers and collect uptimes
        uptimes = {}
        container_details = []

        for container in containers:
            # uptime is None for non-running containers (see get_container_uptime)
            service_name, uptime = get_container_uptime(container)

            if service_name:
                uptimes[service_name] = uptime if uptime else 'Not running'

                # Collect additional container details
                container_info = {
                    'service': service_name,
                    'uptime': uptime,
                    'status': container.get('Status', 'unknown'),
                    'state': container.get('State', 'unknown'),
                    'image': container.get('Image', 'unknown'),
                    'created': container.get('Created', 0)
                }
                container_details.append(container_info)

                if uptime:
                    logger.info(f"Service {service_name} uptime: {uptime}")
                else:
                    logger.warning(f"Service {service_name} is not running")

        # Filter by configured services if provided; services missing a
        # container are reported as 'Not found'
        if services:
            filtered_uptimes = {}
            for service_name in services.keys():
                filtered_uptimes[service_name] = uptimes.get(service_name, 'Not found')
            uptimes = filtered_uptimes

        summary = _generate_uptime_summary(container_details, services)

        return {
            'uptimes': uptimes,
            'containers': container_details,
            'summary': summary
        }

    except Exception as e:
        logger.error(f"Unexpected error getting ARTEMIS uptime: {str(e)}")
        return None
+) -> Dict[str, Any]: + """ + Generate summary statistics for container uptimes. + + Args: + container_details: List of container information dictionaries + services: Optional configured services for comparison + + Returns: + Dictionary containing uptime summary statistics + """ + if not container_details: + return { + 'total_containers': 0, + 'running_containers': 0, + 'stopped_containers': 0, + 'monitoring_services': len(services) if services else 0, + 'missing_services': [] + } + + running_count = 0 + stopped_count = 0 + found_services = set() + + for container in container_details: + if container.get('uptime'): + running_count += 1 + else: + stopped_count += 1 + + service_name = container.get('service') + if service_name: + found_services.add(service_name) + + # Identify missing services + missing_services = [] + if services: + configured_services = set(services.keys()) + missing_services = list(configured_services - found_services) + + return { + 'total_containers': len(container_details), + 'running_containers': running_count, + 'stopped_containers': stopped_count, + 'monitoring_services': len(services) if services else 0, + 'missing_services': missing_services, + 'service_coverage': ( + len(found_services) / len(services) * 100 + if services else 100 + ) + } diff --git a/artemis-monitor/utils/__init__.py b/artemis-monitor/utils/__init__.py new file mode 100644 index 00000000..b1db4ca6 --- /dev/null +++ b/artemis-monitor/utils/__init__.py @@ -0,0 +1,3 @@ +""" +Utilities package for health monitor shared functionality. +""" diff --git a/artemis-monitor/utils/config.py b/artemis-monitor/utils/config.py new file mode 100644 index 00000000..4c9b8542 --- /dev/null +++ b/artemis-monitor/utils/config.py @@ -0,0 +1,82 @@ +""" +Configuration management utilities for health monitor. 
+""" +import logging +import yaml +from typing import Dict, Optional, Any + +logger = logging.getLogger(__name__) + + +def load_config(config_path: str = 'config.yaml') -> Optional[Dict[str, Any]]: + """ + Load service configuration from YAML file. + + Args: + config_path: Path to the YAML configuration file + + Returns: + Dictionary containing configuration or None if loading fails + """ + try: + with open(config_path, 'r') as f: + config = yaml.safe_load(f) + logger.info(f"Successfully loaded configuration from {config_path}") + return config + except FileNotFoundError: + logger.error(f"Configuration file not found: {config_path}") + return None + except yaml.YAMLError as e: + logger.error(f"Invalid YAML in {config_path}: {str(e)}") + return None + except Exception as e: + logger.error(f"Failed to load {config_path}: {str(e)}") + return None + + +def build_service_url(host: str, port: int, endpoint: str) -> str: + """ + Build service URL from components. + + Args: + host: Service hostname + port: Service port number + endpoint: API endpoint path + + Returns: + Complete service URL + """ + return f"http://{host}:{port}{endpoint}" + + +def get_services_map(config: Optional[Dict[str, Any]]) -> Dict[str, str]: + """ + Create mapping of service IDs to their health check URLs. 
+ + Args: + config: Configuration dictionary + + Returns: + Dictionary mapping service IDs to URLs + """ + if not config or 'services' not in config: + logger.warning("No services configuration found") + return {} + + services = {} + for service_id, info in config['services'].items(): + try: + url = build_service_url( + host=info['host'], + port=info['port'], + endpoint=info['endpoint'] + ) + services[service_id] = url + logger.debug(f"Configured service {service_id}: {url}") + except KeyError as e: + logger.error(f"Missing required field {e} for service {service_id}") + except Exception as e: + logger.error(f"Error processing service {service_id}: {str(e)}") + + logger.info(f"Configured {len(services)} services") + return services diff --git a/artemis-monitor/utils/graphql_client.py b/artemis-monitor/utils/graphql_client.py new file mode 100644 index 00000000..a12e06da --- /dev/null +++ b/artemis-monitor/utils/graphql_client.py @@ -0,0 +1,167 @@ +""" +GraphQL client utilities for ARTEMIS API communication. +""" +import asyncio +import json +import logging +import ssl +import os +from typing import Optional, Dict, Any +import aiohttp + +logger = logging.getLogger(__name__) + +NGINX_HOST = os.getenv('NGINX_HOST', 'nginx') +GRAPHQL_BASE_URL = f"https://{NGINX_HOST}:443" +GRAPHQL_URL = f"{GRAPHQL_BASE_URL}/api/graphql" + + +async def fetch_graphql_data( + session: aiohttp.ClientSession, + jwt_token: str, + query: str +) -> Optional[Dict[str, Any]]: + """ + Fetch data from GraphQL endpoint with JWT authentication and improved error handling. 
+
+    Args:
+        session: Async HTTP client session
+        jwt_token: JWT authentication token
+        query: GraphQL query string
+
+    Returns:
+        GraphQL response data or None if request fails
+    """
+    try:
+        # Create SSL context for self-signed certificates if using HTTPS
+        # NOTE(review): certificate verification is fully disabled (CERT_NONE,
+        # no hostname check). Acceptable only for the in-cluster self-signed
+        # nginx certificate — do not reuse this client against external hosts.
+        ssl_context = None
+        if GRAPHQL_URL.startswith('https://'):
+            ssl_context = ssl.create_default_context()
+            ssl_context.check_hostname = False
+            ssl_context.verify_mode = ssl.CERT_NONE
+
+        headers = {
+            'Content-Type': 'application/json',
+            'Authorization': f'Bearer {jwt_token}'
+        }
+
+        payload = {'query': query}
+
+        # NOTE(review): passing a bare int as timeout is deprecated in newer
+        # aiohttp releases in favor of aiohttp.ClientTimeout(total=30) —
+        # confirm the pinned aiohttp version still accepts an int here.
+        async with session.post(
+            GRAPHQL_URL,
+            json=payload,
+            headers=headers,
+            ssl=ssl_context,
+            timeout=30
+        ) as response:
+            response_text = await response.text()
+
+            if response.status == 200:
+                try:
+                    data = json.loads(response_text)
+                    # GraphQL transports errors in a 200 body; treat any
+                    # 'errors' entry as a failed request.
+                    if 'errors' in data:
+                        logger.error(f"GraphQL query errors: {data['errors']}")
+                        return None
+                    return data
+                except json.JSONDecodeError as e:
+                    logger.error(f"Failed to parse GraphQL response: {str(e)}")
+                    return None
+            else:
+                logger.error(f"GraphQL request failed with status: {response.status}")
+                logger.error(f"Response: {response_text}")
+                return None
+
+    except asyncio.TimeoutError:
+        logger.error("GraphQL request timed out")
+        return None
+    except Exception as e:
+        logger.error(f"GraphQL request error: {str(e)}")
+        return None
+
+
+async def fetch_bgp_updates(
+    session: aiohttp.ClientSession,
+    jwt_token: str,
+    limit: int = 10
+) -> Optional[Dict[str, Any]]:
+    """
+    Fetch recent BGP updates with improved query structure.
+
+    Args:
+        session: Async HTTP client session
+        jwt_token: JWT authentication token
+        limit: Maximum number of updates to fetch
+
+    Returns:
+        BGP updates data or None if request fails
+    """
+    # NOTE(review): 'limit' is interpolated straight into the query text; this
+    # is only safe while callers pass an int. GraphQL variables would make the
+    # query robust against non-int input.
+    query = f"""
+    query GetBGPUpdates {{
+        view_bgpupdates(limit: {limit}, order_by: {{timestamp: desc}}) {{
+            as_path
+            communities
+            handled
+            hijack_key
+            matched_prefix
+            orig_path
+            origin_as
+            peer_asn
+            prefix
+            service
+            timestamp
+            type
+        }}
+    }}
+    """
+    return await fetch_graphql_data(session, jwt_token, query)
+
+
+async def fetch_hijacks(
+    session: aiohttp.ClientSession,
+    jwt_token: str,
+    limit: int = 10
+) -> Optional[Dict[str, Any]]:
+    """
+    Fetch recent hijacks with improved query structure.
+
+    Args:
+        session: Async HTTP client session
+        jwt_token: JWT authentication token
+        limit: Maximum number of hijacks to fetch
+
+    Returns:
+        Hijacks data or None if request fails
+    """
+    # Same interpolation caveat as fetch_bgp_updates: 'limit' must be an int.
+    query = f"""
+    query GetHijacks {{
+        view_hijacks(limit: {limit}, order_by: {{time_last: desc}}) {{
+            active
+            comment
+            configured_prefix
+            hijack_as
+            ignored
+            dormant
+            key
+            mitigation_started
+            num_asns_inf
+            num_peers_seen
+            outdated
+            peers_seen
+            peers_withdrawn
+            prefix
+            resolved
+            seen
+            time_detected
+            time_ended
+            time_last
+            time_started
+            timestamp_of_config
+            type
+            under_mitigation
+            withdrawn
+            community_annotation
+            rpki_status
+        }}
+    }}
+    """
+    return await fetch_graphql_data(session, jwt_token, query)
diff --git a/docker-compose.artemis-monitor.yaml b/docker-compose.artemis-monitor.yaml
new file mode 100644
index 00000000..d7cf8198
--- /dev/null
+++ b/docker-compose.artemis-monitor.yaml
+version: '3.4'
+
+services:
+  artemis-monitor:
+    build: ./artemis-monitor
+    container_name: artemis-monitor
+    restart: always
+    ports:
+      - "3001:3000"
+    volumes:
+      # Host docker socket grants this container control of the Docker daemon
+      # (required by the uptime monitor, but effectively root on the host).
+      - /var/run/docker.sock:/var/run/docker.sock
+      - ./artemis-monitor/config.yaml:/app/config.yaml
+    networks:
+      - artemis
+    environment:
+      - ADMIN_EMAIL=${ADMIN_EMAIL}
+      - ADMIN_PASS=${ADMIN_PASS}
+      - API_KEY=${API_KEY}
+      - NGINX_HOST=${NGINX_HOST}
+
+# NOTE(review): without 'external: true' (and the real network name), compose
+# creates a NEW project-scoped 'artemis' network instead of joining the running
+# ARTEMIS deployment's network — confirm against the README claim that the
+# monitor "automatically connects to the ARTEMIS network".
+networks:
+  artemis: