diff --git a/.github/workflows/cleanup.yml b/.github/workflows/cleanup.yml
new file mode 100644
index 0000000..f9686a3
--- /dev/null
+++ b/.github/workflows/cleanup.yml
@@ -0,0 +1,48 @@
+name: Workflow Cleanup
+
+on:
+  workflow_dispatch:
+    inputs:
+      keep_runs:
+        description: 'Number of runs to keep per workflow'
+        required: false
+        default: '20'
+        type: string
+
+# Deleting workflow runs requires "actions: write" on the GITHUB_TOKEN;
+# the default grant is often read-only, so request the scope explicitly.
+permissions:
+  actions: write
+
+jobs:
+  cleanup:
+    name: Clean Up Old Workflow Runs
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Run cleanup script
+        env:
+          # gh reads GH_TOKEN directly; no "gh auth login" step is needed,
+          # and piping the secret through echo risks leaking it in logs.
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Pass the input via env instead of interpolating it into the
+          # shell script (avoids workflow template/command injection).
+          KEEP_RUNS_INPUT: ${{ github.event.inputs.keep_runs }}
+        run: |
+          if [ -n "$KEEP_RUNS_INPUT" ]; then
+            bash deployment/cleanup_workflows.sh --keep "$KEEP_RUNS_INPUT"
+          else
+            bash deployment/cleanup_workflows.sh
+          fi
+
+      - name: Summary
+        if: always()
+        run: |
+          echo "### Workflow Cleanup Complete :broom:" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Repository**: ${{ github.repository }}" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Retention**: Kept last ${{ github.event.inputs.keep_runs || '20' }} runs per workflow" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Time**: $(date -u)" >> "$GITHUB_STEP_SUMMARY"
diff --git a/deployment/cleanup_workflows.sh b/deployment/cleanup_workflows.sh
new file mode 100755
index 0000000..7ba35ae
--- /dev/null
+++ b/deployment/cleanup_workflows.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+# GitHub Actions Workflow Cleanup Script
+# Removes old workflow runs, keeping only the last N (default 20) per workflow
+# Usage: bash cleanup_workflows.sh [--keep N]
+#
+# Requirements:
+#   - GitHub CLI (gh) installed and authenticated
+#   - Run from repository root or specify repo with GH_REPO env var
+
+# Fail fast: -e exits on error, -u errors on unset vars, pipefail makes a
+# pipeline fail when any stage fails.
+set -euo pipefail
+
+# Configuration
+KEEP_RUNS=20  # Default: keep last 20 runs per workflow
+REPO_DIR="$(git rev-parse --show-toplevel 2>/dev/null || pwd)"
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --keep)
+      KEEP_RUNS="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1"
+      echo "Usage: bash cleanup_workflows.sh [--keep N]"
+      exit 1
+      ;;
+  esac
+done
+
+# Color codes
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+echo -e "${BLUE}========================================="
+echo "GitHub Actions Workflow Cleanup"
+echo "=========================================${NC}"
+echo "Repository: $(basename "$REPO_DIR")"
+echo "Keep last: $KEEP_RUNS runs per workflow"
+echo "Time: $(date)"
+echo ""
+
+# Check if gh is installed
+if ! command -v gh &> /dev/null; then
+  echo -e "${RED}✗ GitHub CLI (gh) is not installed${NC}"
+  echo "  Install: https://cli.github.com/"
+  exit 1
+fi
+
+# Check if authenticated
+if ! gh auth status &> /dev/null; then
+  echo -e "${RED}✗ Not authenticated with GitHub CLI${NC}"
+  echo "  Run: gh auth login"
+  exit 1
+fi
+
+echo -e "${GREEN}✓ GitHub CLI ready${NC}"
+echo ""
+
+# Get list of workflow files (cover both .yml and .yaml extensions)
+cd "$REPO_DIR"
+WORKFLOW_FILES=$(ls .github/workflows/*.yml .github/workflows/*.yaml 2>/dev/null || true)
+
+if [ -z "$WORKFLOW_FILES" ]; then
+  echo -e "${YELLOW}No workflow files found in .github/workflows/${NC}"
+  exit 0
+fi
+
+# Track statistics
+TOTAL_DELETED=0
+TOTAL_KEPT=0
+
+# Process each workflow
+for workflow_file in $WORKFLOW_FILES; do
+  workflow_name=$(basename "$workflow_file")
+  echo -e "${YELLOW}Processing: $workflow_name${NC}"
+
+  # Get all run IDs for this workflow (sorted newest first)
+  run_ids=$(gh run list --workflow="$workflow_name" --limit 1000 --json databaseId --jq '.[].databaseId' 2>/dev/null || echo "")
+
+  if [ -z "$run_ids" ]; then
+    echo "  No runs found"
+    echo ""
+    continue
+  fi
+
+  # Count total runs
+  total=$(echo "$run_ids" | wc -l)
+  echo "  Total runs: $total"
+
+  if [ "$total" -gt "$KEEP_RUNS" ]; then
+    # Calculate how many to delete
+    to_delete_count=$((total - KEEP_RUNS))
+    echo "  Keeping: $KEEP_RUNS (newest)"
+    echo "  Deleting: $to_delete_count (oldest)"
+
+    # Skip first N (newest), delete the rest (oldest)
+    to_delete=$(echo "$run_ids" | tail -n +$((KEEP_RUNS + 1)))
+
+    # Feed the loop via a here-string, NOT a pipe: a piped "while" runs in
+    # a subshell, so counters incremented inside it would be lost.
+    deleted=0
+    failed=0
+    while read -r run_id; do
+      [ -n "$run_id" ] || continue
+      # NOTE(review): "gh run delete" takes no --yes flag and does not
+      # prompt when run non-interactively — confirm against installed gh.
+      if gh run delete "$run_id" 2>/dev/null; then
+        deleted=$((deleted + 1))
+      else
+        failed=$((failed + 1))
+      fi
+    done <<< "$to_delete"
+
+    # Report actual results instead of assuming every delete succeeded.
+    if [ "$failed" -gt 0 ]; then
+      echo -e "  ${RED}✗ Failed to delete $failed run(s)${NC}"
+    fi
+    TOTAL_DELETED=$((TOTAL_DELETED + deleted))
+    TOTAL_KEPT=$((TOTAL_KEPT + KEEP_RUNS))
+
+    echo -e "  ${GREEN}✓ Cleanup complete${NC}"
+  else
+    echo "  No cleanup needed (under retention limit)"
+    TOTAL_KEPT=$((TOTAL_KEPT + total))
+  fi
+
+  echo ""
+done
+
+# Summary
+echo -e "${BLUE}========================================="
+echo "Cleanup Summary"
+echo "=========================================${NC}"
+echo "Runs deleted: $TOTAL_DELETED"
+echo "Runs kept: $TOTAL_KEPT"
+echo ""
+
+if [ "$TOTAL_DELETED" -gt 0 ]; then
+  echo -e "${GREEN}✓ Workflow cleanup completed successfully${NC}"
+else
+  echo -e "${BLUE}ℹ No old workflow runs to clean up${NC}"
+fi
+
+exit 0
diff --git a/docs/ci-cd/GITHUB_ACTIONS.md b/docs/ci-cd/GITHUB_ACTIONS.md
index e69de29..8e7356c 100644
--- a/docs/ci-cd/GITHUB_ACTIONS.md
+++ b/docs/ci-cd/GITHUB_ACTIONS.md
@@ -0,0 +1,366 @@
+# GitHub Actions Workflows
+
+Complete documentation of all CI/CD workflows, when they run, and what they do.
+ +--- + +## 📋 Workflow Overview + +| Workflow | Triggers | Purpose | Duration | +|----------|----------|---------|----------| +| **CI - Test and Lint** | PRs to main, Manual | Fast feedback: tests + linting | ~1-2 min | +| **Security Scanning** | PRs to main, Weekly (Mon 9AM UTC), Manual | Vulnerability scanning | ~5-10 min | +| **Deploy to Production** | Manual only | SSH deploy to VPS | ~2-5 min | +| **Production Backup** | Daily at 2 AM UTC, Manual | Backup Redis, configs, logs | ~1-3 min | +| **Health Monitoring** | Daily at 10 AM UTC, Manual | Check all services are healthy | ~30 sec | + +--- + +## 1️⃣ CI - Test and Lint (`.github/workflows/ci.yml`) + +### When It Runs: +- ✅ When you open a PR to `main` +- ✅ When you push new commits to an open PR +- ✅ Manual trigger (useful for testing on any branch) +- ❌ Does NOT run on push to main (already validated by PR) + +### What It Does: + +**Backend:** +- Installs Python 3.12 dependencies +- Runs **Ruff** linting (`ruff check .`) +- Runs **pytest** with full test suite +- Uses in-memory SQLite + Redis for testing + +**Frontend:** +- Installs Node.js 20 dependencies +- Runs **ESLint** (`npm run lint`) +- Builds frontend (`npm run build`) +- Verifies no TypeScript/build errors + +### How to Use: + +**Automatic:** +Just create a PR! The workflow runs automatically. 
+ +```bash +git checkout -b feature/my-feature +git commit -m "Add feature" +git push origin feature/my-feature +# Create PR on GitHub → CI runs automatically +``` + +**Manual:** +``` +GitHub → Actions → CI - Test and Lint → Run workflow +Select branch → Run workflow +``` + +### Status: +- ✅ Green checkmark = All tests passed, ready to merge +- ❌ Red X = Tests failed, needs fixing before merge + +--- + +## 2️⃣ Security Scanning (`.github/workflows/security.yml`) + +### When It Runs: +- ✅ When you open a PR to `main` (runs alongside CI) +- ✅ Every Monday at 9:00 AM UTC (dependency scan only) +- ✅ Manual trigger (useful for testing on any branch) +- ❌ Does NOT run on merge to main (already validated by PR) + +### What It Does: + +**Dependency Scan:** +- Python: **Safety** checks `requirements.txt` for known CVEs +- Node.js: **npm audit** checks `package.json` for vulnerabilities + +**Secret Scan (PR and Manual only):** +- **TruffleHog** scans for accidentally committed secrets + - PRs: Scans only the diff (changes in the PR) + - Manual: Scans entire repository filesystem +- Only fails on **verified** secrets (real working credentials) +- Unverified secrets (examples, docs, expired) show warnings but don't block + +**Docker Image Scan (PR and Manual only):** +- **Trivy** scans built Docker images for vulnerabilities +- Checks backend and frontend images +- **Application libraries:** BLOCKS on CRITICAL/HIGH vulnerabilities (must fix) +- **OS packages:** WARNS on vulnerabilities (doesn't block pipeline) +- Results available as downloadable artifacts (30 days retention) + +### Issue Creation: +- 🚨 **Automatically creates a GitHub Issue** if vulnerabilities found +- Issue includes: + - Which scans failed + - Link to workflow logs + - Quick fix commands + - Labeled `security`, `automated` + +### How to Use: + +**Automatic:** +- Runs on every PR (full scan: dependencies + secrets + docker) +- Runs weekly on Monday 9AM UTC (dependency scan only) + +**Manual:** +``` +GitHub → 
Actions → Security Scanning → Run workflow +Select branch → Run workflow +``` +- Full scan: dependencies + secrets + docker (same as PRs) + +**View Results:** +- Check Actions tab for scan logs +- Check Issues tab for automated security alerts +- Download artifacts for detailed SARIF reports + +--- + +## 3️⃣ Deploy to Production (`.github/workflows/deploy.yml`) + +### When It Runs: +- ✅ Manual trigger only (for safety) +- ❌ Never runs automatically + +### What It Does: +1. **Validates** deployment confirmation (must type "deploy") +2. **SSH** to VPS using `DEPLOY_SSH_KEY` secret +3. **Pulls** latest code from selected branch +4. **Runs** `deployment/deploy.sh` script: + - Creates timestamped backup + - Builds Docker images + - Runs database migrations + - Restarts services with `docker-compose.prod.yml` +5. **Verifies** services are healthy with `deployment/monitoring.sh` +6. **Reports** deployment summary + +### How to Use: + +**Go to:** GitHub → Actions → Deploy to Production → Run workflow + +**Required Inputs:** +- **Branch:** `main` (default) +- **Confirmation:** Type `deploy` exactly + +**Secrets Required:** +- `DEPLOY_SSH_KEY` - SSH private key for VPS access +- `SERVER_IP` - VPS IP address + +### Safety Features: +- Manual trigger only (no auto-deploy) +- Requires typing "deploy" to confirm +- Health checks after deployment +- Rollback instructions on failure + +--- + +## 4️⃣ Automated Production Backup (`.github/workflows/backup.yml`) + +### When It Runs: +- ✅ Daily at 2:00 AM UTC (automated) +- ✅ Manual trigger anytime + +### What It Does: +1. **SSH** to VPS and runs `deployment/backup.sh --upload` +2. **Backs up** (see complete list in [Backup Documentation](../operations/BACKUPS.md)): + - ⏭️ PostgreSQL database - **SKIPPED by default** (Neon provides 7-day auto-recovery) + - ✅ Redis data (rate limiting, cache) + - ✅ Configuration files (.env, docker-compose, Cloudflare Tunnel) + - ✅ Application logs (last 24 hours) + - ✅ Git commit reference +3. 
**Uploads** entire backup to Google Drive +4. **Cleans up** backups older than 30 days (VPS and Google Drive) +5. **Sends Discord notification** with backup status + +**Note:** Database backups are disabled by default to save Neon compute hours. Use `--include-database` flag for manual database backups when needed (e.g., before major migrations). + +### How to Use: + +**Manual Backup:** +``` +GitHub → Actions → Automated Production Backup → Run workflow +``` + +**Options:** +- ☐ **Include database backup** - For monthly archives or before migrations +- ☐ **Keep uncompressed files** - For debugging (uses extra disk space) + +**Discord Notifications:** +- ✅ Success: Shows backup details and Google Drive upload confirmation +- ❌ Failure: Alert with troubleshooting steps + +**For complete setup guide:** See [Backup Documentation](../operations/BACKUPS.md) + +--- + +## 5️⃣ Health Check Monitoring (`.github/workflows/monitoring.yml`) + +### When It Runs: +- ✅ Daily at 10:00 AM UTC (automated) +- ✅ Manual trigger anytime + +### What It Does: +1. **SSH** to VPS and runs `deployment/monitoring.sh` +2. **Checks:** + - Backend API responding at `/api/health` + - Frontend accessible + - Cloudflare Tunnel working + - Redis accepting connections + - All Docker containers running +3. **Sends Discord notification ONLY if services are down** + +### How to Use: + +**Manual Health Check:** +``` +GitHub → Actions → Health Check Monitoring → Run workflow +``` + +**Notification Behavior:** +- 🔕 Healthy (scheduled): No notification (prevents spam) +- ✅ Healthy (manual): Success notification +- 🚨 Unhealthy: Immediate Discord alert with `@everyone` ping + +**Discord alerts include:** +- Which service(s) are down +- Timestamp +- Immediate troubleshooting steps + +--- + +## 🎯 Typical Development Workflow + +### Feature Development: +``` +1. Create feature branch + git checkout -b feature/new-feature + +2. Make changes, commit + git commit -m "Add feature" + +3. 
Push branch + git push origin feature/new-feature + +4. Create PR on GitHub + → CI runs automatically (tests + lint) + → Security scans run automatically (dependencies + secrets + docker) + +5. Review CI and security results + ✅ Green = good to merge + ❌ Red = fix issues, push again + +6. Merge PR + → No workflows run on merge (already validated by PR) +``` + +### Production Deployment: +``` +1. Verify PR merged to main +2. Verify all PR checks passed (CI + security) +3. Actions → Deploy to Production → Run workflow +4. Type "deploy" to confirm +5. Monitor deployment logs +6. Verify production is healthy +``` + +--- + +## 🔔 Notifications + +### Email Notifications: +- ✅ GitHub sends email when workflows **fail** +- ✅ Scheduled security scans email on failure +- ✅ Configure: GitHub Settings → Notifications → Actions + +### Discord Notifications: +- ✅ Backup status (daily at 2 AM UTC) +- ✅ Health check failures (immediate alerts) +- ✅ Setup: See [Backup](../operations/BACKUPS.md) and [Observability](../operations/OBSERVABILITY.md) docs + +### Issue Notifications: +- ✅ GitHub Issue created when security scans find vulnerabilities +- ✅ Issues labeled `security`, `automated` +- ✅ Watch repository to get notified + +--- + +## 🛠️ Maintenance Scripts + +All located in `deployment/` directory: + +- **`deploy.sh`** - Main deployment script (backup → build → migrate → restart) +- **`monitoring.sh`** - Health checks and service status +- **`backup.sh`** - Manual backup script + +--- + +## 📊 Monitoring & Logs + +### View Workflow Logs: +```bash +# On GitHub: Actions tab → Select workflow run → View logs + +# On VPS via SSH: +cd /opt/cpta_blog +docker compose -f docker-compose.prod.yml logs -f +``` + +### Check Service Status: +```bash +docker compose -f docker-compose.prod.yml ps +bash deployment/monitoring.sh +``` + +--- + +## 🚨 Troubleshooting + +### CI Failing: +- **Linting errors:** Run `ruff check . 
--fix` (backend) or `npm run lint:fix` (frontend)
+- **Test failures:** Run `pytest -v` locally to debug
+- **Build errors:** Run `npm run build` locally
+
+### Security Scan Failing:
+- **Check the GitHub Issue** created automatically
+- **Review workflow logs** for specific vulnerabilities
+- **Update dependencies:**
+  ```bash
+  cd backend && pip install --upgrade -r requirements.txt
+  cd frontend && npm update && npm audit fix
+  ```
+- **Re-run scan** to verify fixes
+
+### Deployment Failing:
+- **Check workflow logs** in Actions tab
+- **SSH to server** and check docker logs
+- **Review deployment script** for errors
+
+---
+
+## 📈 Best Practices
+
+✅ **Always create PRs** - Don't push directly to main
+✅ **Fix CI failures immediately** - Don't merge broken code
+✅ **Review security issues weekly** - Check Issues tab
+✅ **Test locally first** - Use `docker-compose` before deploying
+✅ **Monitor deployments** - Watch workflow logs during deploy
+✅ **Keep dependencies updated** - Run security scans regularly
+
+---
+
+## 🔗 Quick Links
+
+- **GitHub Actions Docs:** https://docs.github.com/actions
+- **Security Tab:** Security tab in your repository
+- **Issues:** Filter by `label:security` to see security alerts
+
+---
+
+**Questions?** See the main docs:
+- [README](../README.md) - Project overview
+- [DEVELOPMENT](../setup/DEVELOPMENT.md) - Local development setup
+- [PRODUCTION](../setup/PRODUCTION.md) - Production deployment guide
+- [BACKUPS](../operations/BACKUPS.md) - Backup and restore procedures
+- [OBSERVABILITY](../operations/OBSERVABILITY.md) - Monitoring and alerts
diff --git a/docs/operations/BACKUPS.md b/docs/operations/BACKUPS.md
index e69de29..e6a2e58 100644
--- a/docs/operations/BACKUPS.md
+++ b/docs/operations/BACKUPS.md
@@ -0,0 +1,297 @@
+# Backup & Restore Guide
+
+Complete backup strategy with automated Google Drive uploads, Discord notifications, and disaster recovery procedures.
+
+---
+
+## Overview
+
+### Three Layers of Protection
+
+1.
**Neon's Built-in Backups** (Free Tier) + - 7-day point-in-time recovery + - Automatic, managed by Neon + - Good for quick database-only restores + +2. **Local VPS Backups** + - Daily automated backups via GitHub Actions + - Stored at `/opt/backups/cpta_blog/` on VPS + - 30-day retention + +3. **Google Drive Cloud Backups** ✨ + - Automated uploads to Google Drive + - True disaster recovery (survives VPS failure) + - Stored in `blog_backups/` folder + - 30-day retention + +--- + +## What Gets Backed Up + +Every backup creates a `.tar.gz` archive containing: + +### 1. PostgreSQL Database ⏭️ **SKIPPED BY DEFAULT** +- **Why:** Neon provides automatic 7-day point-in-time recovery +- **When to include:** Monthly archives or before major migrations +- **Size:** ~1-5 MB compressed + +### 2. Redis Data +- Rate limiting counters +- Cache data +- Persistence files + +### 3. Configuration Files +- Environment variables +- Docker Compose configuration +- Cloudflare Tunnel config + +### 4. Application Logs +- Last 24 hours from all services + +### 5. Git Metadata +- Commit hash and info + +**Total backup size:** ~2-10 MB per backup + +--- + +## Automated Backups + +**Schedule:** Daily at 2:00 AM UTC via GitHub Actions + +**What happens:** +1. SSH to VPS +2. Run backup script with `--upload` flag +3. Upload to Google Drive (`blog_backups/` folder) +4. Clean up backups older than 30 days +5. Send Discord notification + +**View history:** +- GitHub Actions → "Automated Production Backup" +- Google Drive → `blog_backups/` folder +- Discord alerts channel + +--- + +## Manual Backups + +### Via GitHub Actions (Recommended) + +1. Go to **Actions** → "Automated Production Backup" +2. Click **"Run workflow"** +3. Optional settings: + - ☐ **Include database backup** (uses Neon compute) + - ☐ **Keep uncompressed files** (debugging only) +4. 
Monitor progress and check Discord + +### Via SSH + +```bash +# Standard backup (database skipped, upload to Google Drive) +bash deployment/backup.sh --upload + +# Include database (use sparingly) +bash deployment/backup.sh --include-database --upload + +# Local only (no cloud upload) +bash deployment/backup.sh +``` + +--- + +## Setup Guide + +### 1. Install rclone on VPS + +```bash +curl https://rclone.org/install.sh | sudo bash +``` + +### 2. Configure Google Drive + +```bash +rclone config +# Follow prompts to set up Google Drive as "gdrive" +``` + +### 3. Test Connection + +```bash +# List files +rclone lsf gdrive: + +# Test upload +echo "test" > /tmp/test.txt +rclone copy /tmp/test.txt gdrive:blog_backups/ +rclone lsf gdrive:blog_backups/ + +# Cleanup +rclone delete gdrive:blog_backups/test.txt +``` + +### 4. Setup Discord Webhook + +1. Discord server → Channel settings → Integrations → Webhooks +2. Create webhook, copy URL +3. GitHub repo → Settings → Secrets → Actions +4. Add secret: `DISCORD_WEBHOOK_URL` + +--- + +## Restoring from Backup + +### 1. Download Backup + +```bash +# From Google Drive +rclone lsf gdrive:blog_backups/ +rclone copy gdrive:blog_backups/YYYYMMDD_HHMMSS.tar.gz /tmp/ + +# Or use local backup +ls /opt/backups/cpta_blog/*.tar.gz +``` + +### 2. Extract + +```bash +mkdir -p /tmp/restore +tar -xzf /tmp/YYYYMMDD_HHMMSS.tar.gz -C /tmp/restore/ +cd /tmp/restore/YYYYMMDD_HHMMSS/ +``` + +### 3. Restore Database (if included) + +```bash +gunzip database/database.sql.gz +cat database/database.sql | docker compose -f /opt/cpta_blog/docker-compose.prod.yml exec -T blog-backend bash -c "psql \$DATABASE_URL" +``` + +### 4. Restore Redis (optional) + +```bash +REDIS_CONTAINER=$(docker compose -f /opt/cpta_blog/docker-compose.prod.yml ps -q redis) +docker compose -f /opt/cpta_blog/docker-compose.prod.yml stop redis +docker cp redis/. $REDIS_CONTAINER:/data/ +docker compose -f /opt/cpta_blog/docker-compose.prod.yml start redis +``` + +### 5. 
Restart Services + +```bash +cd /opt/cpta_blog +docker compose -f docker-compose.prod.yml restart +bash deployment/monitoring.sh # Verify health +``` + +--- + +## Discord Notifications + +### Success Message + +``` +✅ Backup Completed Successfully + +Time: YYYY-MM-DD HH:MM:SS UTC +Retention: 30 days + +📦 Database: ⏭️ Skipped (Neon auto-backup) +💾 Redis: ✅ Backed up +⚙️ Config: ✅ Backed up +📋 Logs: ✅ Last 24h backed up +☁️ Google Drive: ✅ Uploaded & Verified +``` + +### Failure Alert + +``` +❌ Backup Failed + +⚠️ Action Required: +1. SSH to server and check logs +2. Verify containers running +3. Check disk space +4. Run manual backup +``` + +--- + +## Troubleshooting + +### Backup Fails + +**Container not running:** +```bash +docker compose -f /opt/cpta_blog/docker-compose.prod.yml ps +docker compose -f /opt/cpta_blog/docker-compose.prod.yml up -d +``` + +**Google Drive upload fails:** +```bash +# Test connection +rclone lsf gdrive: + +# Reconfigure if needed +rclone config +``` + +### Out of Disk Space + +```bash +# Check usage +df -h + +# Remove old backups +find /opt/backups/cpta_blog -name "*.tar.gz" -mtime +30 -delete +``` + +--- + +## Security Notes + +**Backups contain sensitive data:** +- Database credentials +- API keys +- JWT secrets + +**Protections:** +- Google Drive: Encrypted at rest, private to your account +- VPS: File permissions restricted +- rclone: HTTPS-only transfers + +**Best practices:** +- Never commit backups to Git +- Keep webhook URLs secret +- Monitor Drive access regularly + +--- + +## Quick Reference + +```bash +# Manual backup +bash deployment/backup.sh --upload + +# List backups +rclone lsf gdrive:blog_backups/ + +# Download backup +rclone copy gdrive:blog_backups/FILENAME.tar.gz /tmp/ + +# Extract +tar -xzf /tmp/FILENAME.tar.gz -C /tmp/restore/ + +# Test rclone +rclone lsf gdrive: +``` + +--- + +**See also:** +- [Observability](./OBSERVABILITY.md) - Monitoring and alerts +- [GitHub Actions](../ci-cd/GITHUB_ACTIONS.md) - CI/CD 
workflows +- [Production Guide](../setup/PRODUCTION.md) - Deployment + +--- + +**Last Updated:** January 2026 diff --git a/docs/operations/OBSERVABILITY.md b/docs/operations/OBSERVABILITY.md index e69de29..77f9e75 100644 --- a/docs/operations/OBSERVABILITY.md +++ b/docs/operations/OBSERVABILITY.md @@ -0,0 +1,355 @@ +# Observability & Monitoring + +Complete monitoring guide: health checks, logs, and alerts to keep your production blog running smoothly. + +--- + +## Overview + +### Monitoring Strategy + +Your production blog has **automated health monitoring**: + +1. **Automated Health Checks** (GitHub Actions) + - Runs daily at 10:00 AM UTC + - Checks all services via SSH + - Alerts via Discord if anything is down + +2. **VPS Monitoring** (Your hosting provider) + - CPU, memory, disk usage + - Bandwidth and network stats + - Real-time server metrics + +3. **Application Logs** (Docker) + - Backend, frontend, Redis logs + - Access via SSH + +--- + +## Health Check Monitoring + +### What Gets Checked + +The monitoring script (`deployment/monitoring.sh`) verifies: + +1. **Docker Containers** + - All containers running (`blog-backend`, `blog-frontend`, `cloudflared`, `redis`) + - No containers in "Exited" or "Restarting" state + +2. **Backend API** + - Health endpoint responding + - Returns `{"status": "healthy"}` + +3. **Frontend** + - Accessible and serving content + - No error pages + +4. **Cloudflare Tunnel** + - Tunnel container running + - HTTPS access working + +5. **Redis Cache** + - Accepting connections + - Rate limiting operational + +### Schedule + +- **Automated:** Daily at 10:00 AM UTC via GitHub Actions +- **Manual:** Run anytime from GitHub Actions tab + +### How It Works + +``` +Daily at 10:00 AM UTC: +1. GitHub Actions triggers workflow +2. SSH connection to VPS +3. Runs deployment/monitoring.sh +4. Checks all services +5. If ALL healthy → No notification (silent) +6. 
If ANY down → Discord alert with @everyone ping +``` + +--- + +## Discord Notifications + +### Setup Discord Webhook + +1. In Discord, go to your server +2. Create or select alert channel +3. Click gear icon → "Integrations" → "Create Webhook" +4. Name: `Blog Production Monitor` +5. Copy the webhook URL +6. In GitHub: Settings → Secrets → Actions +7. New secret: `DISCORD_WEBHOOK_URL` +8. Paste webhook URL → "Add secret" + +### Notification Examples + +#### All Services Healthy (Manual Run Only) + +``` +✅ All Services Healthy + +Production health check passed + +Time: YYYY-MM-DD HH:MM:SS UTC +Backend: ✅ Responding +Frontend: ✅ Accessible +Cloudflared: ✅ Running +Redis: ✅ Connected +Containers: ✅ All running + +Blog - Health Monitor +``` + +**Note:** Daily automated runs do NOT send this to prevent spam. + +#### Services Down (Immediate Alert) + +``` +🚨 Production Health Check Failed +@everyone + +One or more services are not responding + +Time: YYYY-MM-DD HH:MM:SS UTC + +⚠️ Immediate Action Required: +1. Check Docker containers +2. Check logs +3. Verify services are running +4. Check server resources + +Blog - Health Monitor +``` + +--- + +## Viewing Logs + +### Via SSH + +```bash +# View all container logs +cd /opt/cpta_blog +docker compose -f docker-compose.prod.yml logs -f + +# View specific container +docker logs blog_backend_prod -f --tail 100 +docker logs blog_frontend_prod -f --tail 100 +docker logs blog_cloudflared_prod -f --tail 100 +docker logs blog_redis_prod -f --tail 100 + +# Search logs for errors +docker logs blog_backend_prod --since 1h | grep -i error +docker logs blog_backend_prod --since 1h | grep -i exception +``` + +### Save Logs to File + +```bash +docker logs blog_backend_prod --since 24h > /tmp/backend-logs.txt +``` + +--- + +## Manual Health Checks + +### Option 1: Via GitHub Actions + +1. GitHub → Actions → "Health Check Monitoring" +2. Click "Run workflow" +3. Select branch: `main` +4. 
View results in Discord and GitHub logs + +### Option 2: SSH to VPS + +```bash +# Run monitoring script +cd /opt/cpta_blog +bash deployment/monitoring.sh + +# Check exit code +echo $? # 0 = healthy, 1 = unhealthy +``` + +### Option 3: Quick Manual Checks + +```bash +# Check container status +docker compose -f docker-compose.prod.yml ps + +# Check backend health +curl http://localhost:8000/api/health + +# Check Redis +docker exec blog_redis_prod redis-cli ping +``` + +--- + +## Troubleshooting + +### Health Check Failing + +**Backend not responding:** +```bash +# Check backend logs +docker logs blog_backend_prod --tail 50 + +# Check if container is running +docker compose -f docker-compose.prod.yml ps blog-backend + +# Restart backend +docker compose -f docker-compose.prod.yml restart blog-backend +``` + +**Frontend not accessible:** +```bash +# Check frontend logs +docker logs blog_frontend_prod --tail 50 + +# Restart frontend +docker compose -f docker-compose.prod.yml restart blog-frontend +``` + +**Redis not responding:** +```bash +# Check Redis logs +docker logs blog_redis_prod --tail 50 + +# Test Redis connection +docker exec blog_redis_prod redis-cli ping + +# Restart Redis +docker compose -f docker-compose.prod.yml restart redis +``` + +**All containers stopped:** +```bash +# Check Docker is running +sudo systemctl status docker + +# Start Docker if needed +sudo systemctl start docker + +# Restart all containers +cd /opt/cpta_blog +docker compose -f docker-compose.prod.yml up -d +``` + +### Discord Notifications Not Sending + +**Verify webhook works:** +```bash +curl -H "Content-Type: application/json" \ + -d '{"content": "Test notification"}' \ + YOUR_WEBHOOK_URL +``` + +**Check GitHub secret:** +- GitHub → Settings → Secrets → Actions +- Verify `DISCORD_WEBHOOK_URL` exists + +--- + +## Monitoring Best Practices + +### Regular Checks + +- ✅ Review GitHub Actions history weekly +- ✅ Check VPS metrics for trends +- ✅ Review Docker logs for errors +- ✅ Monitor 
disk space usage + +### Set Thresholds + +- ✅ CPU > 80% sustained = investigate +- ✅ Memory > 90% = potential issue +- ✅ Disk > 85% = cleanup needed +- ✅ Health check failures > 2 consecutive = urgent + +### Response Plan + +**If you get a Discord alert:** + +1. **Acknowledge:** Respond in Discord +2. **SSH to VPS:** Check what's wrong +3. **Review logs:** `docker logs blog_backend_prod --tail 100` +4. **Check resources:** `htop`, `df -h` +5. **Restart if needed:** `docker compose restart` +6. **Document:** Note what happened + +### Prevent Issues + +- Keep disk space below 80% +- Monitor for memory leaks +- Review error logs weekly +- Update dependencies monthly +- Run health checks before deployments + +--- + +## Metrics to Watch + +### Critical (Act Immediately) + +- 🚨 Health check failures +- 🚨 Disk space > 90% +- 🚨 All containers stopped +- 🚨 Database connection errors + +### Warning (Investigate Soon) + +- ⚠️ CPU > 80% sustained +- ⚠️ Memory > 85% +- ⚠️ Increased error rates in logs +- ⚠️ Slow response times + +### Informational (Monitor) + +- 📊 Normal traffic patterns +- 📊 Average response times +- 📊 Bandwidth usage +- 📊 Cloudflare Tunnel status + +--- + +## Quick Reference + +```bash +# Manual health check +cd /opt/cpta_blog +bash deployment/monitoring.sh + +# Check container status +docker compose -f docker-compose.prod.yml ps + +# View logs +docker logs blog_backend_prod -f --tail 100 + +# Check backend health +curl http://localhost:8000/api/health + +# Check system resources +htop +df -h + +# Restart all services +docker compose -f docker-compose.prod.yml restart + +# Restart specific service +docker compose -f docker-compose.prod.yml restart blog-backend +``` + +--- + +**See also:** +- [Backups](./BACKUPS.md) - Backup strategy with Google Drive +- [GitHub Actions](../ci-cd/GITHUB_ACTIONS.md) - All CI/CD workflows +- [Production Guide](../setup/PRODUCTION.md) - Deployment guide + +--- + +**Last Updated:** January 2026