diff --git a/.github/workflows/cleanup.yml b/.github/workflows/cleanup.yml
new file mode 100644
index 0000000..f9686a3
--- /dev/null
+++ b/.github/workflows/cleanup.yml
@@ -0,0 +1,48 @@
+name: Workflow Cleanup
+
+on:
+  workflow_dispatch:
+    inputs:
+      keep_runs:
+        description: 'Number of runs to keep per workflow'
+        required: false
+        default: '20'
+        type: string
+
+# Deleting workflow runs requires "actions: write" on the GITHUB_TOKEN;
+# the default grant is often read-only, so request the scope explicitly.
+permissions:
+  actions: write
+
+jobs:
+  cleanup:
+    name: Clean Up Old Workflow Runs
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Run cleanup script
+        env:
+          # gh reads GH_TOKEN directly; no "gh auth login" step is needed,
+          # and piping the secret through echo risks leaking it in logs.
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          # Pass the input via env instead of interpolating it into the
+          # shell script (avoids workflow template/command injection).
+          KEEP_RUNS_INPUT: ${{ github.event.inputs.keep_runs }}
+        run: |
+          if [ -n "$KEEP_RUNS_INPUT" ]; then
+            bash deployment/cleanup_workflows.sh --keep "$KEEP_RUNS_INPUT"
+          else
+            bash deployment/cleanup_workflows.sh
+          fi
+
+      - name: Summary
+        if: always()
+        run: |
+          echo "### Workflow Cleanup Complete :broom:" >> "$GITHUB_STEP_SUMMARY"
+          echo "" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Repository**: ${{ github.repository }}" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Retention**: Kept last ${{ github.event.inputs.keep_runs || '20' }} runs per workflow" >> "$GITHUB_STEP_SUMMARY"
+          echo "- **Time**: $(date -u)" >> "$GITHUB_STEP_SUMMARY"
diff --git a/deployment/cleanup_workflows.sh b/deployment/cleanup_workflows.sh
new file mode 100755
index 0000000..7ba35ae
--- /dev/null
+++ b/deployment/cleanup_workflows.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+# GitHub Actions Workflow Cleanup Script
+# Removes old workflow runs, keeping only the last N (default 20) per workflow
+# Usage: bash cleanup_workflows.sh [--keep N]
+#
+# Requirements:
+#   - GitHub CLI (gh) installed and authenticated
+#   - Run from repository root or specify repo with GH_REPO env var
+
+# Fail fast: -e exits on error, -u errors on unset vars, pipefail makes a
+# pipeline fail when any stage fails.
+set -euo pipefail
+
+# Configuration
+KEEP_RUNS=20  # Default: keep last 20 runs per workflow
+REPO_DIR="$(git rev-parse --show-toplevel 2>/dev/null || pwd)"
+
+# Parse arguments
+while [[ $# -gt 0 ]]; do
+  case $1 in
+    --keep)
+      KEEP_RUNS="$2"
+      shift 2
+      ;;
+    *)
+      echo "Unknown option: $1"
+      echo "Usage: bash cleanup_workflows.sh [--keep N]"
+      exit 1
+      ;;
+  esac
+done
+
+# Color codes
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+BLUE='\033[0;34m'
+NC='\033[0m'
+
+echo -e "${BLUE}========================================="
+echo "GitHub Actions Workflow Cleanup"
+echo "=========================================${NC}"
+echo "Repository: $(basename "$REPO_DIR")"
+echo "Keep last: $KEEP_RUNS runs per workflow"
+echo "Time: $(date)"
+echo ""
+
+# Check if gh is installed
+if ! command -v gh &> /dev/null; then
+  echo -e "${RED}✗ GitHub CLI (gh) is not installed${NC}"
+  echo "  Install: https://cli.github.com/"
+  exit 1
+fi
+
+# Check if authenticated
+if ! gh auth status &> /dev/null; then
+  echo -e "${RED}✗ Not authenticated with GitHub CLI${NC}"
+  echo "  Run: gh auth login"
+  exit 1
+fi
+
+echo -e "${GREEN}✓ GitHub CLI ready${NC}"
+echo ""
+
+# Get list of workflow files (cover both .yml and .yaml extensions)
+cd "$REPO_DIR"
+WORKFLOW_FILES=$(ls .github/workflows/*.yml .github/workflows/*.yaml 2>/dev/null || true)
+
+if [ -z "$WORKFLOW_FILES" ]; then
+  echo -e "${YELLOW}No workflow files found in .github/workflows/${NC}"
+  exit 0
+fi
+
+# Track statistics
+TOTAL_DELETED=0
+TOTAL_KEPT=0
+
+# Process each workflow
+for workflow_file in $WORKFLOW_FILES; do
+  workflow_name=$(basename "$workflow_file")
+  echo -e "${YELLOW}Processing: $workflow_name${NC}"
+
+  # Get all run IDs for this workflow (sorted newest first)
+  run_ids=$(gh run list --workflow="$workflow_name" --limit 1000 --json databaseId --jq '.[].databaseId' 2>/dev/null || echo "")
+
+  if [ -z "$run_ids" ]; then
+    echo "  No runs found"
+    echo ""
+    continue
+  fi
+
+  # Count total runs
+  total=$(echo "$run_ids" | wc -l)
+  echo "  Total runs: $total"
+
+  if [ "$total" -gt "$KEEP_RUNS" ]; then
+    # Calculate how many to delete
+    to_delete_count=$((total - KEEP_RUNS))
+    echo "  Keeping: $KEEP_RUNS (newest)"
+    echo "  Deleting: $to_delete_count (oldest)"
+
+    # Skip first N (newest), delete the rest (oldest)
+    to_delete=$(echo "$run_ids" | tail -n +$((KEEP_RUNS + 1)))
+
+    # Feed the loop via a here-string, NOT a pipe: a piped "while" runs in
+    # a subshell, so counters incremented inside it would be lost.
+    deleted=0
+    failed=0
+    while read -r run_id; do
+      [ -n "$run_id" ] || continue
+      # NOTE(review): "gh run delete" takes no --yes flag and does not
+      # prompt when run non-interactively — confirm against installed gh.
+      if gh run delete "$run_id" 2>/dev/null; then
+        deleted=$((deleted + 1))
+      else
+        failed=$((failed + 1))
+      fi
+    done <<< "$to_delete"
+
+    # Report actual results instead of assuming every delete succeeded.
+    if [ "$failed" -gt 0 ]; then
+      echo -e "  ${RED}✗ Failed to delete $failed run(s)${NC}"
+    fi
+    TOTAL_DELETED=$((TOTAL_DELETED + deleted))
+    TOTAL_KEPT=$((TOTAL_KEPT + KEEP_RUNS))
+
+    echo -e "  ${GREEN}✓ Cleanup complete${NC}"
+  else
+    echo "  No cleanup needed (under retention limit)"
+    TOTAL_KEPT=$((TOTAL_KEPT + total))
+  fi
+
+  echo ""
+done
+
+# Summary
+echo -e "${BLUE}========================================="
+echo "Cleanup Summary"
+echo "=========================================${NC}"
+echo "Runs deleted: $TOTAL_DELETED"
+echo "Runs kept: $TOTAL_KEPT"
+echo ""
+
+if [ "$TOTAL_DELETED" -gt 0 ]; then
+  echo -e "${GREEN}✓ Workflow cleanup completed successfully${NC}"
+else
+  echo -e "${BLUE}ℹ No old workflow runs to clean up${NC}"
+fi
+
+exit 0
diff --git a/docs/ci-cd/GITHUB_ACTIONS.md b/docs/ci-cd/GITHUB_ACTIONS.md
index e69de29..8e7356c 100644
--- a/docs/ci-cd/GITHUB_ACTIONS.md
+++ b/docs/ci-cd/GITHUB_ACTIONS.md
@@ -0,0 +1,366 @@
+# GitHub Actions Workflows
+
+Complete documentation of all CI/CD workflows, when they run, and what they do.
+ +--- + +## 📋 Workflow Overview + +| Workflow | Triggers | Purpose | Duration | +|----------|----------|---------|----------| +| **CI - Test and Lint** | PRs to main, Manual | Fast feedback: tests + linting | ~1-2 min | +| **Security Scanning** | PRs to main, Weekly (Mon 9AM UTC), Manual | Vulnerability scanning | ~5-10 min | +| **Deploy to Production** | Manual only | SSH deploy to VPS | ~2-5 min | +| **Production Backup** | Daily at 2 AM UTC, Manual | Backup Redis, configs, logs | ~1-3 min | +| **Health Monitoring** | Daily at 10 AM UTC, Manual | Check all services are healthy | ~30 sec | + +--- + +## 1️⃣ CI - Test and Lint (`.github/workflows/ci.yml`) + +### When It Runs: +- ✅ When you open a PR to `main` +- ✅ When you push new commits to an open PR +- ✅ Manual trigger (useful for testing on any branch) +- ❌ Does NOT run on push to main (already validated by PR) + +### What It Does: + +**Backend:** +- Installs Python 3.12 dependencies +- Runs **Ruff** linting (`ruff check .`) +- Runs **pytest** with full test suite +- Uses in-memory SQLite + Redis for testing + +**Frontend:** +- Installs Node.js 20 dependencies +- Runs **ESLint** (`npm run lint`) +- Builds frontend (`npm run build`) +- Verifies no TypeScript/build errors + +### How to Use: + +**Automatic:** +Just create a PR! The workflow runs automatically. 
+ +```bash +git checkout -b feature/my-feature +git commit -m "Add feature" +git push origin feature/my-feature +# Create PR on GitHub → CI runs automatically +``` + +**Manual:** +``` +GitHub → Actions → CI - Test and Lint → Run workflow +Select branch → Run workflow +``` + +### Status: +- ✅ Green checkmark = All tests passed, ready to merge +- ❌ Red X = Tests failed, needs fixing before merge + +--- + +## 2️⃣ Security Scanning (`.github/workflows/security.yml`) + +### When It Runs: +- ✅ When you open a PR to `main` (runs alongside CI) +- ✅ Every Monday at 9:00 AM UTC (dependency scan only) +- ✅ Manual trigger (useful for testing on any branch) +- ❌ Does NOT run on merge to main (already validated by PR) + +### What It Does: + +**Dependency Scan:** +- Python: **Safety** checks `requirements.txt` for known CVEs +- Node.js: **npm audit** checks `package.json` for vulnerabilities + +**Secret Scan (PR and Manual only):** +- **TruffleHog** scans for accidentally committed secrets + - PRs: Scans only the diff (changes in the PR) + - Manual: Scans entire repository filesystem +- Only fails on **verified** secrets (real working credentials) +- Unverified secrets (examples, docs, expired) show warnings but don't block + +**Docker Image Scan (PR and Manual only):** +- **Trivy** scans built Docker images for vulnerabilities +- Checks backend and frontend images +- **Application libraries:** BLOCKS on CRITICAL/HIGH vulnerabilities (must fix) +- **OS packages:** WARNS on vulnerabilities (doesn't block pipeline) +- Results available as downloadable artifacts (30 days retention) + +### Issue Creation: +- 🚨 **Automatically creates a GitHub Issue** if vulnerabilities found +- Issue includes: + - Which scans failed + - Link to workflow logs + - Quick fix commands + - Labeled `security`, `automated` + +### How to Use: + +**Automatic:** +- Runs on every PR (full scan: dependencies + secrets + docker) +- Runs weekly on Monday 9AM UTC (dependency scan only) + +**Manual:** +``` +GitHub → 
Actions → Security Scanning → Run workflow +Select branch → Run workflow +``` +- Full scan: dependencies + secrets + docker (same as PRs) + +**View Results:** +- Check Actions tab for scan logs +- Check Issues tab for automated security alerts +- Download artifacts for detailed SARIF reports + +--- + +## 3️⃣ Deploy to Production (`.github/workflows/deploy.yml`) + +### When It Runs: +- ✅ Manual trigger only (for safety) +- ❌ Never runs automatically + +### What It Does: +1. **Validates** deployment confirmation (must type "deploy") +2. **SSH** to VPS using `DEPLOY_SSH_KEY` secret +3. **Pulls** latest code from selected branch +4. **Runs** `deployment/deploy.sh` script: + - Creates timestamped backup + - Builds Docker images + - Runs database migrations + - Restarts services with `docker-compose.prod.yml` +5. **Verifies** services are healthy with `deployment/monitoring.sh` +6. **Reports** deployment summary + +### How to Use: + +**Go to:** GitHub → Actions → Deploy to Production → Run workflow + +**Required Inputs:** +- **Branch:** `main` (default) +- **Confirmation:** Type `deploy` exactly + +**Secrets Required:** +- `DEPLOY_SSH_KEY` - SSH private key for VPS access +- `SERVER_IP` - VPS IP address + +### Safety Features: +- Manual trigger only (no auto-deploy) +- Requires typing "deploy" to confirm +- Health checks after deployment +- Rollback instructions on failure + +--- + +## 4️⃣ Automated Production Backup (`.github/workflows/backup.yml`) + +### When It Runs: +- ✅ Daily at 2:00 AM UTC (automated) +- ✅ Manual trigger anytime + +### What It Does: +1. **SSH** to VPS and runs `deployment/backup.sh --upload` +2. **Backs up** (see complete list in [Backup Documentation](../operations/BACKUPS.md)): + - ⏭️ PostgreSQL database - **SKIPPED by default** (Neon provides 7-day auto-recovery) + - ✅ Redis data (rate limiting, cache) + - ✅ Configuration files (.env, docker-compose, Cloudflare Tunnel) + - ✅ Application logs (last 24 hours) + - ✅ Git commit reference +3. 
**Uploads** entire backup to Google Drive +4. **Cleans up** backups older than 30 days (VPS and Google Drive) +5. **Sends Discord notification** with backup status + +**Note:** Database backups are disabled by default to save Neon compute hours. Use `--include-database` flag for manual database backups when needed (e.g., before major migrations). + +### How to Use: + +**Manual Backup:** +``` +GitHub → Actions → Automated Production Backup → Run workflow +``` + +**Options:** +- ☐ **Include database backup** - For monthly archives or before migrations +- ☐ **Keep uncompressed files** - For debugging (uses extra disk space) + +**Discord Notifications:** +- ✅ Success: Shows backup details and Google Drive upload confirmation +- ❌ Failure: Alert with troubleshooting steps + +**For complete setup guide:** See [Backup Documentation](../operations/BACKUPS.md) + +--- + +## 5️⃣ Health Check Monitoring (`.github/workflows/monitoring.yml`) + +### When It Runs: +- ✅ Daily at 10:00 AM UTC (automated) +- ✅ Manual trigger anytime + +### What It Does: +1. **SSH** to VPS and runs `deployment/monitoring.sh` +2. **Checks:** + - Backend API responding at `/api/health` + - Frontend accessible + - Cloudflare Tunnel working + - Redis accepting connections + - All Docker containers running +3. **Sends Discord notification ONLY if services are down** + +### How to Use: + +**Manual Health Check:** +``` +GitHub → Actions → Health Check Monitoring → Run workflow +``` + +**Notification Behavior:** +- 🔕 Healthy (scheduled): No notification (prevents spam) +- ✅ Healthy (manual): Success notification +- 🚨 Unhealthy: Immediate Discord alert with `@everyone` ping + +**Discord alerts include:** +- Which service(s) are down +- Timestamp +- Immediate troubleshooting steps + +--- + +## 🎯 Typical Development Workflow + +### Feature Development: +``` +1. Create feature branch + git checkout -b feature/new-feature + +2. Make changes, commit + git commit -m "Add feature" + +3. 
Push branch + git push origin feature/new-feature + +4. Create PR on GitHub + → CI runs automatically (tests + lint) + → Security scans run automatically (dependencies + secrets + docker) + +5. Review CI and security results + ✅ Green = good to merge + ❌ Red = fix issues, push again + +6. Merge PR + → No workflows run on merge (already validated by PR) +``` + +### Production Deployment: +``` +1. Verify PR merged to main +2. Verify all PR checks passed (CI + security) +3. Actions → Deploy to Production → Run workflow +4. Type "deploy" to confirm +5. Monitor deployment logs +6. Verify production is healthy +``` + +--- + +## 🔔 Notifications + +### Email Notifications: +- ✅ GitHub sends email when workflows **fail** +- ✅ Scheduled security scans email on failure +- ✅ Configure: GitHub Settings → Notifications → Actions + +### Discord Notifications: +- ✅ Backup status (daily at 2 AM UTC) +- ✅ Health check failures (immediate alerts) +- ✅ Setup: See [Backup](../operations/BACKUPS.md) and [Observability](../operations/OBSERVABILITY.md) docs + +### Issue Notifications: +- ✅ GitHub Issue created when security scans find vulnerabilities +- ✅ Issues labeled `security`, `automated` +- ✅ Watch repository to get notified + +--- + +## 🛠️ Maintenance Scripts + +All located in `deployment/` directory: + +- **`deploy.sh`** - Main deployment script (backup → build → migrate → restart) +- **`monitoring.sh`** - Health checks and service status +- **`backup.sh`** - Manual backup script + +--- + +## 📊 Monitoring & Logs + +### View Workflow Logs: +```bash +# On GitHub: Actions tab → Select workflow run → View logs + +# On VPS via SSH: +cd /opt/cpta_blog +docker compose -f docker-compose.prod.yml logs -f +``` + +### Check Service Status: +```bash +docker compose -f docker-compose.prod.yml ps +bash deployment/monitoring.sh +``` + +--- + +## 🚨 Troubleshooting + +### CI Failing: +- **Linting errors:** Run `ruff check . 
--fix` (backend) or `npm run lint:fix` (frontend)
+- **Test failures:** Run `pytest -v` locally to debug
+- **Build errors:** Run `npm run build` locally
+
+### Security Scan Failing:
+- **Check the GitHub Issue** created automatically
+- **Review workflow logs** for specific vulnerabilities
+- **Update dependencies:**
+  ```bash
+  cd backend && pip install --upgrade -r requirements.txt
+  cd frontend && npm update && npm audit fix
+  ```
+- **Re-run scan** to verify fixes
+
+### Deployment Failing:
+- **Check workflow logs** in Actions tab
+- **SSH to server** and check docker logs
+- **Review deployment script** for errors
+
+---
+
+## 📈 Best Practices
+
+✅ **Always create PRs** - Don't push directly to main
+✅ **Fix CI failures immediately** - Don't merge broken code
+✅ **Review security issues weekly** - Check Issues tab
+✅ **Test locally first** - Use `docker-compose` before deploying
+✅ **Monitor deployments** - Watch workflow logs during deploy
+✅ **Keep dependencies updated** - Run security scans regularly
+
+---
+
+## 🔗 Quick Links
+
+- **GitHub Actions Docs:** https://docs.github.com/actions
+- **Security Tab:** Security tab in your repository
+- **Issues:** Filter by `label:security` to see security alerts
+
+---
+
+**Questions?** See the main docs:
+- [README](../README.md) - Project overview
+- [DEVELOPMENT](../setup/DEVELOPMENT.md) - Local development setup
+- [PRODUCTION](../setup/PRODUCTION.md) - Production deployment guide
+- [BACKUPS](../operations/BACKUPS.md) - Backup and restore procedures
+- [OBSERVABILITY](../operations/OBSERVABILITY.md) - Monitoring and alerts
diff --git a/docs/operations/BACKUPS.md b/docs/operations/BACKUPS.md
index e69de29..e6a2e58 100644
--- a/docs/operations/BACKUPS.md
+++ b/docs/operations/BACKUPS.md
@@ -0,0 +1,297 @@
+# Backup & Restore Guide
+
+Complete backup strategy with automated Google Drive uploads, Discord notifications, and disaster recovery procedures.
+
+---
+
+## Overview
+
+### Three Layers of Protection
+
+1.
**Neon's Built-in Backups** (Free Tier) + - 7-day point-in-time recovery + - Automatic, managed by Neon + - Good for quick database-only restores + +2. **Local VPS Backups** + - Daily automated backups via GitHub Actions + - Stored at `/opt/backups/cpta_blog/` on VPS + - 30-day retention + +3. **Google Drive Cloud Backups** ✨ + - Automated uploads to Google Drive + - True disaster recovery (survives VPS failure) + - Stored in `blog_backups/` folder + - 30-day retention + +--- + +## What Gets Backed Up + +Every backup creates a `.tar.gz` archive containing: + +### 1. PostgreSQL Database ⏭️ **SKIPPED BY DEFAULT** +- **Why:** Neon provides automatic 7-day point-in-time recovery +- **When to include:** Monthly archives or before major migrations +- **Size:** ~1-5 MB compressed + +### 2. Redis Data +- Rate limiting counters +- Cache data +- Persistence files + +### 3. Configuration Files +- Environment variables +- Docker Compose configuration +- Cloudflare Tunnel config + +### 4. Application Logs +- Last 24 hours from all services + +### 5. Git Metadata +- Commit hash and info + +**Total backup size:** ~2-10 MB per backup + +--- + +## Automated Backups + +**Schedule:** Daily at 2:00 AM UTC via GitHub Actions + +**What happens:** +1. SSH to VPS +2. Run backup script with `--upload` flag +3. Upload to Google Drive (`blog_backups/` folder) +4. Clean up backups older than 30 days +5. Send Discord notification + +**View history:** +- GitHub Actions → "Automated Production Backup" +- Google Drive → `blog_backups/` folder +- Discord alerts channel + +--- + +## Manual Backups + +### Via GitHub Actions (Recommended) + +1. Go to **Actions** → "Automated Production Backup" +2. Click **"Run workflow"** +3. Optional settings: + - ☐ **Include database backup** (uses Neon compute) + - ☐ **Keep uncompressed files** (debugging only) +4. 
Monitor progress and check Discord + +### Via SSH + +```bash +# Standard backup (database skipped, upload to Google Drive) +bash deployment/backup.sh --upload + +# Include database (use sparingly) +bash deployment/backup.sh --include-database --upload + +# Local only (no cloud upload) +bash deployment/backup.sh +``` + +--- + +## Setup Guide + +### 1. Install rclone on VPS + +```bash +curl https://rclone.org/install.sh | sudo bash +``` + +### 2. Configure Google Drive + +```bash +rclone config +# Follow prompts to set up Google Drive as "gdrive" +``` + +### 3. Test Connection + +```bash +# List files +rclone lsf gdrive: + +# Test upload +echo "test" > /tmp/test.txt +rclone copy /tmp/test.txt gdrive:blog_backups/ +rclone lsf gdrive:blog_backups/ + +# Cleanup +rclone delete gdrive:blog_backups/test.txt +``` + +### 4. Setup Discord Webhook + +1. Discord server → Channel settings → Integrations → Webhooks +2. Create webhook, copy URL +3. GitHub repo → Settings → Secrets → Actions +4. Add secret: `DISCORD_WEBHOOK_URL` + +--- + +## Restoring from Backup + +### 1. Download Backup + +```bash +# From Google Drive +rclone lsf gdrive:blog_backups/ +rclone copy gdrive:blog_backups/YYYYMMDD_HHMMSS.tar.gz /tmp/ + +# Or use local backup +ls /opt/backups/cpta_blog/*.tar.gz +``` + +### 2. Extract + +```bash +mkdir -p /tmp/restore +tar -xzf /tmp/YYYYMMDD_HHMMSS.tar.gz -C /tmp/restore/ +cd /tmp/restore/YYYYMMDD_HHMMSS/ +``` + +### 3. Restore Database (if included) + +```bash +gunzip database/database.sql.gz +cat database/database.sql | docker compose -f /opt/cpta_blog/docker-compose.prod.yml exec -T blog-backend bash -c "psql \$DATABASE_URL" +``` + +### 4. Restore Redis (optional) + +```bash +REDIS_CONTAINER=$(docker compose -f /opt/cpta_blog/docker-compose.prod.yml ps -q redis) +docker compose -f /opt/cpta_blog/docker-compose.prod.yml stop redis +docker cp redis/. $REDIS_CONTAINER:/data/ +docker compose -f /opt/cpta_blog/docker-compose.prod.yml start redis +``` + +### 5. 
Restart Services + +```bash +cd /opt/cpta_blog +docker compose -f docker-compose.prod.yml restart +bash deployment/monitoring.sh # Verify health +``` + +--- + +## Discord Notifications + +### Success Message + +``` +✅ Backup Completed Successfully + +Time: YYYY-MM-DD HH:MM:SS UTC +Retention: 30 days + +📦 Database: ⏭️ Skipped (Neon auto-backup) +💾 Redis: ✅ Backed up +⚙️ Config: ✅ Backed up +📋 Logs: ✅ Last 24h backed up +☁️ Google Drive: ✅ Uploaded & Verified +``` + +### Failure Alert + +``` +❌ Backup Failed + +⚠️ Action Required: +1. SSH to server and check logs +2. Verify containers running +3. Check disk space +4. Run manual backup +``` + +--- + +## Troubleshooting + +### Backup Fails + +**Container not running:** +```bash +docker compose -f /opt/cpta_blog/docker-compose.prod.yml ps +docker compose -f /opt/cpta_blog/docker-compose.prod.yml up -d +``` + +**Google Drive upload fails:** +```bash +# Test connection +rclone lsf gdrive: + +# Reconfigure if needed +rclone config +``` + +### Out of Disk Space + +```bash +# Check usage +df -h + +# Remove old backups +find /opt/backups/cpta_blog -name "*.tar.gz" -mtime +30 -delete +``` + +--- + +## Security Notes + +**Backups contain sensitive data:** +- Database credentials +- API keys +- JWT secrets + +**Protections:** +- Google Drive: Encrypted at rest, private to your account +- VPS: File permissions restricted +- rclone: HTTPS-only transfers + +**Best practices:** +- Never commit backups to Git +- Keep webhook URLs secret +- Monitor Drive access regularly + +--- + +## Quick Reference + +```bash +# Manual backup +bash deployment/backup.sh --upload + +# List backups +rclone lsf gdrive:blog_backups/ + +# Download backup +rclone copy gdrive:blog_backups/FILENAME.tar.gz /tmp/ + +# Extract +tar -xzf /tmp/FILENAME.tar.gz -C /tmp/restore/ + +# Test rclone +rclone lsf gdrive: +``` + +--- + +**See also:** +- [Observability](./OBSERVABILITY.md) - Monitoring and alerts +- [GitHub Actions](../ci-cd/GITHUB_ACTIONS.md) - CI/CD 
workflows +- [Production Guide](../setup/PRODUCTION.md) - Deployment + +--- + +**Last Updated:** January 2026 diff --git a/docs/operations/OBSERVABILITY.md b/docs/operations/OBSERVABILITY.md index e69de29..77f9e75 100644 --- a/docs/operations/OBSERVABILITY.md +++ b/docs/operations/OBSERVABILITY.md @@ -0,0 +1,355 @@ +# Observability & Monitoring + +Complete monitoring guide: health checks, logs, and alerts to keep your production blog running smoothly. + +--- + +## Overview + +### Monitoring Strategy + +Your production blog has **automated health monitoring**: + +1. **Automated Health Checks** (GitHub Actions) + - Runs daily at 10:00 AM UTC + - Checks all services via SSH + - Alerts via Discord if anything is down + +2. **VPS Monitoring** (Your hosting provider) + - CPU, memory, disk usage + - Bandwidth and network stats + - Real-time server metrics + +3. **Application Logs** (Docker) + - Backend, frontend, Redis logs + - Access via SSH + +--- + +## Health Check Monitoring + +### What Gets Checked + +The monitoring script (`deployment/monitoring.sh`) verifies: + +1. **Docker Containers** + - All containers running (`blog-backend`, `blog-frontend`, `cloudflared`, `redis`) + - No containers in "Exited" or "Restarting" state + +2. **Backend API** + - Health endpoint responding + - Returns `{"status": "healthy"}` + +3. **Frontend** + - Accessible and serving content + - No error pages + +4. **Cloudflare Tunnel** + - Tunnel container running + - HTTPS access working + +5. **Redis Cache** + - Accepting connections + - Rate limiting operational + +### Schedule + +- **Automated:** Daily at 10:00 AM UTC via GitHub Actions +- **Manual:** Run anytime from GitHub Actions tab + +### How It Works + +``` +Daily at 10:00 AM UTC: +1. GitHub Actions triggers workflow +2. SSH connection to VPS +3. Runs deployment/monitoring.sh +4. Checks all services +5. If ALL healthy → No notification (silent) +6. 
If ANY down → Discord alert with @everyone ping +``` + +--- + +## Discord Notifications + +### Setup Discord Webhook + +1. In Discord, go to your server +2. Create or select alert channel +3. Click gear icon → "Integrations" → "Create Webhook" +4. Name: `Blog Production Monitor` +5. Copy the webhook URL +6. In GitHub: Settings → Secrets → Actions +7. New secret: `DISCORD_WEBHOOK_URL` +8. Paste webhook URL → "Add secret" + +### Notification Examples + +#### All Services Healthy (Manual Run Only) + +``` +✅ All Services Healthy + +Production health check passed + +Time: YYYY-MM-DD HH:MM:SS UTC +Backend: ✅ Responding +Frontend: ✅ Accessible +Cloudflared: ✅ Running +Redis: ✅ Connected +Containers: ✅ All running + +Blog - Health Monitor +``` + +**Note:** Daily automated runs do NOT send this to prevent spam. + +#### Services Down (Immediate Alert) + +``` +🚨 Production Health Check Failed +@everyone + +One or more services are not responding + +Time: YYYY-MM-DD HH:MM:SS UTC + +⚠️ Immediate Action Required: +1. Check Docker containers +2. Check logs +3. Verify services are running +4. Check server resources + +Blog - Health Monitor +``` + +--- + +## Viewing Logs + +### Via SSH + +```bash +# View all container logs +cd /opt/cpta_blog +docker compose -f docker-compose.prod.yml logs -f + +# View specific container +docker logs blog_backend_prod -f --tail 100 +docker logs blog_frontend_prod -f --tail 100 +docker logs blog_cloudflared_prod -f --tail 100 +docker logs blog_redis_prod -f --tail 100 + +# Search logs for errors +docker logs blog_backend_prod --since 1h | grep -i error +docker logs blog_backend_prod --since 1h | grep -i exception +``` + +### Save Logs to File + +```bash +docker logs blog_backend_prod --since 24h > /tmp/backend-logs.txt +``` + +--- + +## Manual Health Checks + +### Option 1: Via GitHub Actions + +1. GitHub → Actions → "Health Check Monitoring" +2. Click "Run workflow" +3. Select branch: `main` +4. 
View results in Discord and GitHub logs + +### Option 2: SSH to VPS + +```bash +# Run monitoring script +cd /opt/cpta_blog +bash deployment/monitoring.sh + +# Check exit code +echo $? # 0 = healthy, 1 = unhealthy +``` + +### Option 3: Quick Manual Checks + +```bash +# Check container status +docker compose -f docker-compose.prod.yml ps + +# Check backend health +curl http://localhost:8000/api/health + +# Check Redis +docker exec blog_redis_prod redis-cli ping +``` + +--- + +## Troubleshooting + +### Health Check Failing + +**Backend not responding:** +```bash +# Check backend logs +docker logs blog_backend_prod --tail 50 + +# Check if container is running +docker compose -f docker-compose.prod.yml ps blog-backend + +# Restart backend +docker compose -f docker-compose.prod.yml restart blog-backend +``` + +**Frontend not accessible:** +```bash +# Check frontend logs +docker logs blog_frontend_prod --tail 50 + +# Restart frontend +docker compose -f docker-compose.prod.yml restart blog-frontend +``` + +**Redis not responding:** +```bash +# Check Redis logs +docker logs blog_redis_prod --tail 50 + +# Test Redis connection +docker exec blog_redis_prod redis-cli ping + +# Restart Redis +docker compose -f docker-compose.prod.yml restart redis +``` + +**All containers stopped:** +```bash +# Check Docker is running +sudo systemctl status docker + +# Start Docker if needed +sudo systemctl start docker + +# Restart all containers +cd /opt/cpta_blog +docker compose -f docker-compose.prod.yml up -d +``` + +### Discord Notifications Not Sending + +**Verify webhook works:** +```bash +curl -H "Content-Type: application/json" \ + -d '{"content": "Test notification"}' \ + YOUR_WEBHOOK_URL +``` + +**Check GitHub secret:** +- GitHub → Settings → Secrets → Actions +- Verify `DISCORD_WEBHOOK_URL` exists + +--- + +## Monitoring Best Practices + +### Regular Checks + +- ✅ Review GitHub Actions history weekly +- ✅ Check VPS metrics for trends +- ✅ Review Docker logs for errors +- ✅ Monitor 
disk space usage + +### Set Thresholds + +- ✅ CPU > 80% sustained = investigate +- ✅ Memory > 90% = potential issue +- ✅ Disk > 85% = cleanup needed +- ✅ Health check failures > 2 consecutive = urgent + +### Response Plan + +**If you get a Discord alert:** + +1. **Acknowledge:** Respond in Discord +2. **SSH to VPS:** Check what's wrong +3. **Review logs:** `docker logs blog_backend_prod --tail 100` +4. **Check resources:** `htop`, `df -h` +5. **Restart if needed:** `docker compose restart` +6. **Document:** Note what happened + +### Prevent Issues + +- Keep disk space below 80% +- Monitor for memory leaks +- Review error logs weekly +- Update dependencies monthly +- Run health checks before deployments + +--- + +## Metrics to Watch + +### Critical (Act Immediately) + +- 🚨 Health check failures +- 🚨 Disk space > 90% +- 🚨 All containers stopped +- 🚨 Database connection errors + +### Warning (Investigate Soon) + +- ⚠️ CPU > 80% sustained +- ⚠️ Memory > 85% +- ⚠️ Increased error rates in logs +- ⚠️ Slow response times + +### Informational (Monitor) + +- 📊 Normal traffic patterns +- 📊 Average response times +- 📊 Bandwidth usage +- 📊 Cloudflare Tunnel status + +--- + +## Quick Reference + +```bash +# Manual health check +cd /opt/cpta_blog +bash deployment/monitoring.sh + +# Check container status +docker compose -f docker-compose.prod.yml ps + +# View logs +docker logs blog_backend_prod -f --tail 100 + +# Check backend health +curl http://localhost:8000/api/health + +# Check system resources +htop +df -h + +# Restart all services +docker compose -f docker-compose.prod.yml restart + +# Restart specific service +docker compose -f docker-compose.prod.yml restart blog-backend +``` + +--- + +**See also:** +- [Backups](./BACKUPS.md) - Backup strategy with Google Drive +- [GitHub Actions](../ci-cd/GITHUB_ACTIONS.md) - All CI/CD workflows +- [Production Guide](../setup/PRODUCTION.md) - Deployment guide + +--- + +**Last Updated:** January 2026