diff --git a/.github/workflows/build-and-push.yml b/.github/workflows/build-and-push.yml new file mode 100644 index 000000000000..bf851f5b3da3 --- /dev/null +++ b/.github/workflows/build-and-push.yml @@ -0,0 +1,199 @@ +# Build and Push to ECR +# This workflow builds the Docker image and pushes it to AWS ECR +# Triggered on pushes to stage branch (tag-based releases can be enabled later) +# +# On pull_request: build-only (validates Dockerfile, no AWS auth required) +# On push to stage: build + push to ECR (requires OIDC role below) +# +# Authentication: GitHub OIDC +# Prerequisites: +# 1. AWS OIDC provider for token.actions.githubusercontent.com (already exists) +# 2. IAM role with trust policy condition: +# "StringEquals": { "token.actions.githubusercontent.com:aud": "sts.amazonaws.com" } +# "StringLike": { "token.actions.githubusercontent.com:sub": "repo:thunderbird/addons-server:ref:refs/heads/stage" } +# 3. Repository variable: AWS_ROLE_ARN (role ARN from step 2) +# Note: Can later be moved to an environment for stricter controls +# See: https://tinyurl.com/ghAwsOidc +# +# Publishing is gated on BOTH: +# - Event type (push, not pull_request) +# - vars.AWS_ROLE_ARN is set +# If either condition fails, then build succeeds but publish is skipped +# +# Required IAM permissions for the OIDC role: +# - ecr:GetAuthorizationToken +# - ecr:BatchCheckLayerAvailability +# - ecr:BatchGetImage +# - ecr:CompleteLayerUpload +# - ecr:DescribeImages +# - ecr:InitiateLayerUpload +# - ecr:GetDownloadUrlForLayer +# - ecr:ListImages +# - ecr:UploadLayerPart +# - ecr:PutImage + +name: Build and Push to ECR + +on: + push: + branches: + - stage + # tags: + # - 'v*' # Uncomment when tag-based releases are defined + pull_request: + branches: + - stage + - master + +env: + AWS_REGION: us-west-2 + ECR_REPOSITORY: atn-stage-addons-server + AWS_ACCOUNT_ID: "768512802988" + +jobs: + # Build job: always runs, validates Dockerfile, no AWS permissions needed + build: + name: Build + runs-on: 
ubuntu-latest + permissions: + contents: read + # Note: no id-token here - minimum privilege for PR/build-only scenarios + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Extract metadata for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.AWS_ACCOUNT_ID }}.dkr.ecr.${{ env.AWS_REGION }}.amazonaws.com/${{ env.ECR_REPOSITORY }} + tags: | + type=ref,event=branch + type=ref,event=pr + type=sha,prefix= + # type=semver,pattern={{version}} # Enable when tag triggers are added + # type=semver,pattern={{major}}.{{minor}} + + - name: Build Docker image + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile.ecs + push: false + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + OLYMPIA_UID=9500 + OLYMPIA_GID=9500 + + # Informational job: shows why publishing skipped when not configured role + publish-disabled: + name: Publish (skipped - AWS_ROLE_ARN not set) + needs: build + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/stage' && vars.AWS_ROLE_ARN == '' + steps: + - name: Publishing not configured + run: | + echo "::notice::Publish skipped: AWS_ROLE_ARN repo variable not set (OIDC role not configured yet)" + echo "See workflow header comments for IAM role setup instructions" + + # Publish job: only runs on push to stage when OIDC role is configured + publish: + name: Publish to ECR + needs: build + runs-on: ubuntu-latest + if: github.event_name == 'push' && github.ref == 'refs/heads/stage' && vars.AWS_ROLE_ARN != '' + concurrency: + group: ecr-stage-publish + cancel-in-progress: true + permissions: + contents: read + id-token: write # Required for OIDC authn - only granted when actually publishing + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Set up 
Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Configure AWS credentials (OIDC) + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ vars.AWS_ROLE_ARN }} + aws-region: ${{ env.AWS_REGION }} + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v2 + + - name: Extract metadata for Docker + id: meta + uses: docker/metadata-action@v5 + with: + images: ${{ env.AWS_ACCOUNT_ID }}.dkr.ecr.${{ env.AWS_REGION }}.amazonaws.com/${{ env.ECR_REPOSITORY }} + tags: | + type=ref,event=branch + type=sha,prefix= + type=raw,value=stage-latest + + - name: Build and push Docker image + id: build-image + uses: docker/build-push-action@v5 + with: + context: . + file: ./Dockerfile.ecs + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + OLYMPIA_UID=9500 + OLYMPIA_GID=9500 + + # Generate build metadata (future: bake into image or upload to S3 for traceability) + - name: Generate version.json + run: | + echo '{ + "commit": "${{ github.sha }}", + "version": "${{ github.ref_name }}", + "source": "https://github.com/${{ github.repository }}", + "build": "https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" + }' > version.json + cat version.json + + - name: Image digest + run: echo "Image pushed with digest ${{ steps.build-image.outputs.digest }}" + + # Deploy to ECS (optional - we would uncomment this when ready, or move to separate deploy.yml) + # deploy: + # name: Deploy to ECS + # needs: publish + # runs-on: ubuntu-latest + # permissions: + # contents: read + # id-token: write + # + # steps: + # - name: Configure AWS credentials (OIDC) + # uses: aws-actions/configure-aws-credentials@v4 + # with: + # role-to-assume: ${{ vars.AWS_ROLE_ARN }} + # aws-region: ${{ env.AWS_REGION }} + # + # - name: Update ECS services + # run: | + # for service in web worker versioncheck; do + # aws ecs 
update-service \ + # --cluster thunderbird-addons-stage-${service}-cluster \ + # --service thunderbird-addons-stage-${service}-service \ + # --force-new-deployment + # done diff --git a/.github/workflows/deploy-stage.yml b/.github/workflows/deploy-stage.yml new file mode 100644 index 000000000000..393be635b349 --- /dev/null +++ b/.github/workflows/deploy-stage.yml @@ -0,0 +1,171 @@ +--- +# Deploy to Stage +# +# Runs `pulumi up` against the stage stack after a successful image push to ECR +# This requires manual approval via the _staging_ environment protection rule +# +# Trigger options +# - workflow_dispatch: manual trigger from GH Actions UI +# - workflow_run: automatically after build-and-push succeeds on stage +# +# Authentication: GH OIDC -> AWS IAM role +# Required secrets: PULUMI_ACCESS_TOKEN, PULUMI_PASSPHRASE +# Required variables: AWS_ROLE_ARN + +name: deploy-stage + +concurrency: + group: deploy-stage + cancel-in-progress: false # To avoid cancelling in-progress deploys + +on: + workflow_dispatch: + inputs: + preview_only: + description: "Run pulumi preview only (no apply)" + required: false + default: "false" + type: choice + options: + - "false" + - "true" + + workflow_run: + workflows: ["Build and Push to ECR"] + types: [completed] + branches: [stage] + +permissions: + contents: read + id-token: write # OIDC authn + +env: + AWS_REGION: us-west-2 + PULUMI_STACK: thunderbird/thunderbird-addons/stage + +jobs: + # Gate: only proceed if the triggering workflow succeeded (or manual) + check-trigger: + runs-on: ubuntu-latest + outputs: + should-deploy: ${{ steps.check.outputs.result }} + steps: + - name: Evaluate trigger + id: check + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + echo "result=true" >> $GITHUB_OUTPUT + elif [[ "${{ github.event.workflow_run.conclusion }}" == "success" ]]; then + echo "result=true" >> $GITHUB_OUTPUT + else + echo "result=false" >> $GITHUB_OUTPUT + echo "Skipping: triggering workflow did not 
succeed" + fi + + deploy: + needs: check-trigger + if: needs.check-trigger.outputs.should-deploy == 'true' + runs-on: ubuntu-latest + timeout-minutes: 30 + environment: staging # Would require approval if environment protection rules are set + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: ${{ vars.AWS_ROLE_ARN }} + aws-region: ${{ env.AWS_REGION }} + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: "3.13" + + - name: Install Pulumi CLI + uses: pulumi/actions@v6 + with: + command: version + + - name: Install dependencies + working-directory: infra/pulumi + run: | + python -m venv .venv + source .venv/bin/activate + pip install -r requirements.txt + + - name: Pulumi preview + working-directory: infra/pulumi + env: + PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }} + PULUMI_CONFIG_PASSPHRASE: ${{ secrets.PULUMI_PASSPHRASE }} + run: | + source .venv/bin/activate + pulumi stack select ${{ env.PULUMI_STACK }} + pulumi preview --diff + + - name: Pulumi up + if: github.event.inputs.preview_only != 'true' + working-directory: infra/pulumi + env: + PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }} + PULUMI_CONFIG_PASSPHRASE: ${{ secrets.PULUMI_PASSPHRASE }} + run: | + source .venv/bin/activate + pulumi stack select ${{ env.PULUMI_STACK }} + pulumi up --yes --diff + + - name: "Post-deploy: scale services to 0" + if: github.event.inputs.preview_only != 'true' + run: | + echo "Scaling ECS services to 0 (safety: prevents writes to shared stage DB)" + echo "Scale up manually after RO healthcheck validation" + PREFIX="thunderbird-addons-stage" + for svc in web worker versioncheck; do + CLUSTER="${PREFIX}-${svc}" + SERVICE="${PREFIX}-${svc}" + RESOURCE_ID="service/${CLUSTER}/${SERVICE}" + + echo " Suspending autoscaling for ${svc}..." 
+ aws application-autoscaling register-scalable-target \ + --service-namespace ecs \ + --scalable-dimension ecs:service:DesiredCount \ + --resource-id "${RESOURCE_ID}" \ + --suspended-state '{"DynamicScalingInSuspended":true,"DynamicScalingOutSuspended":true,"ScheduledScalingSuspended":true}' \ + --region ${{ env.AWS_REGION }} 2>/dev/null || echo " (no autoscaling target for ${svc}, skipping)" + + echo " Scaling ${svc} to 0..." + aws ecs update-service \ + --cluster "${CLUSTER}" \ + --service "${SERVICE}" \ + --desired-count 0 \ + --region ${{ env.AWS_REGION }} \ + --query 'service.[serviceName,desiredCount]' \ + --output text + done + + - name: Post-deploy summary + if: github.event.inputs.preview_only != 'true' + working-directory: infra/pulumi + env: + PULUMI_ACCESS_TOKEN: ${{ secrets.PULUMI_ACCESS_TOKEN }} + PULUMI_CONFIG_PASSPHRASE: ${{ secrets.PULUMI_PASSPHRASE }} + run: | + source .venv/bin/activate + echo "## Deploy Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Stack: ${{ env.PULUMI_STACK }}" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + pulumi stack output --json | python -c " + import sys, json + outputs = json.load(sys.stdin) + print('| Output | Value |') + print('|--------|-------|') + for k, v in sorted(outputs.items()): + if isinstance(v, list): + v = ', '.join(str(i) for i in v) + print(f'| {k} | \`{v}\` |') + " >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "Services scaled to 0. 
Run RO healthcheck before scaling up" >> $GITHUB_STEP_SUMMARY diff --git a/.github/workflows/validate.yml b/.github/workflows/validate.yml new file mode 100644 index 000000000000..c46f0a0b78b0 --- /dev/null +++ b/.github/workflows/validate.yml @@ -0,0 +1,86 @@ +--- +name: validate + +concurrency: + group: validate-${{ github.ref }} + cancel-in-progress: true + +on: + push: + branches: + - stage + - master + pull_request: + branches: + - stage + - master + workflow_dispatch: + +permissions: + contents: read + +jobs: + detect-changes: + runs-on: ubuntu-latest + outputs: + iac-changed: ${{ steps.check.outputs.iac-changed }} + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - uses: dorny/paths-filter@v3 + id: check + with: + filters: | + iac-changed: + - 'infra/pulumi/**' + - 'infra/tests/**' + - 'infra/scripts/**' + - '.github/workflows/*.yml' + + validate-iac: + needs: detect-changes + if: needs.detect-changes.outputs.iac-changed == 'true' + runs-on: ubuntu-latest + timeout-minutes: 10 + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Ruff format check + uses: chartboost/ruff-action@v1 + with: + src: "infra/pulumi" + args: "format --check" + + - name: Ruff lint check + uses: chartboost/ruff-action@v1 + with: + src: "infra/pulumi" + + - name: Validate YAML config + run: | + pip install --quiet pyyaml + python3 -c " + import yaml, sys + for f in ['infra/pulumi/config.stage.yaml']: + try: + yaml.safe_load(open(f)) + print(f' [OK] {f}') + except Exception as e: + print(f' [FAIL] {f}: {e}') + sys.exit(1) + " + + - name: Validate Python syntax + run: | + python3 -c " + import ast, sys + for f in ['infra/pulumi/__main__.py', 'infra/tests/smoke_test.py']: + try: + ast.parse(open(f).read()) + print(f' [OK] {f}') + except SyntaxError as e: + print(f' [FAIL] {f}: {e}') + sys.exit(1) + " diff --git a/.gitignore b/.gitignore index 56b4bc597499..f69500f2fdf0 100644 --- a/.gitignore +++ b/.gitignore @@ -46,4 +46,10 @@ 
storage/guarded-addons/* storage/shared_storage/* supervisord.pid tmp/* -venv* \ No newline at end of file +venv* + +# Pulumi local artefacts (outputs, notes, analysis) +infra/pulumi/pulumi-*.txt +infra/pulumi/preview-output-*.txt +infra/pulumi/analysis.md +infra/pulumi/infrastructure-inventory.md \ No newline at end of file diff --git a/Dockerfile.ecs b/Dockerfile.ecs new file mode 100644 index 000000000000..18ec3f4f2df0 --- /dev/null +++ b/Dockerfile.ecs @@ -0,0 +1,196 @@ +# Dockerfile.ecs +# ECS Fargate-optimised Dockerfile for Thunderbird Add-ons Server +# +# This Dockerfile is intended for production deployment on AWS ECS Fargate +# Key features: +# - Non-root user execution (olympia) +# - Environment-driven configuration +# - Built-in health check +# - Predictable entrypoint supporting multiple service modes +# +# Service modes (via ENTRYPOINT): +# web - Django application via uWSGI (default) +# worker - Celery background workers +# versioncheck - Versioncheck API service +# scheduler - Celery beat scheduler +# manage - Django management commands +# +# Required environment variables: +# DJANGO_SETTINGS_MODULE - Django settings module (default: settings) +# DATABASE_URL - MySQL connection string +# CELERY_BROKER_URL - RabbitMQ connection string +# CELERY_RESULT_BACKEND - Redis connection string +# ELASTICSEARCH_LOCATION - OpenSearch/Elasticsearch host:port +# MEMCACHE_LOCATION - Memcached host:port +# +# Optional environment variables: +# UWSGI_PROCESSES - Number of uWSGI worker processes (default: 4) +# UWSGI_THREADS - Threads per process (default: 4) +# UWSGI_PORT - HTTP port (default: 8000) +# CELERY_CONCURRENCY - Celery worker concurrency (default: 4) +# CELERY_QUEUES - Comma-separated queue names +# CELERY_LOGLEVEL - Celery log level (default: info) +# SENTRY_DSN - Sentry error reporting DSN +# +# Build: +# docker build -f Dockerfile.ecs -t addons-server:latest . +# +# Run examples: +# docker run -e DATABASE_URL=... 
addons-server:latest web +# docker run -e DATABASE_URL=... addons-server:latest worker +# docker run -e DATABASE_URL=... addons-server:latest versioncheck + +FROM python:3.6-slim-stretch + +LABEL maintainer="Thunderbird Team" +LABEL org.opencontainers.image.title="Thunderbird Add-ons Server" +LABEL org.opencontainers.image.description="ECS Fargate deployment image for addons.thunderbird.net" + +# Build arguments +ARG OLYMPIA_UID=9500 +ARG OLYMPIA_GID=9500 + +ENV PYTHON_VERSION_MAJOR=3 +ENV SWIG_FEATURES="-D__x86_64__" +ENV PYTHONDONTWRITEBYTECODE=1 +ENV PYTHONUNBUFFERED=1 +ENV LANG=en_US.UTF-8 +ENV LC_ALL=en_US.UTF-8 + +# Default runtime environment (can be overridden at container start) +ENV DJANGO_SETTINGS_MODULE=settings +ENV UWSGI_PROCESSES=4 +ENV UWSGI_THREADS=4 +ENV UWSGI_PORT=8000 +ENV CELERY_CONCURRENCY=4 +ENV CELERY_LOGLEVEL=info + +# Create olympia user and group (non-root execution) +RUN groupadd -g ${OLYMPIA_GID} olympia && \ + useradd -u ${OLYMPIA_UID} -g ${OLYMPIA_GID} -s /bin/bash -m olympia + +# Update the main repositories to the archived repository (Stretch is EOL) +RUN echo "deb http://archive.debian.org/debian stretch main contrib non-free" > /etc/apt/sources.list + +# Add nodesource repository and requirements +ADD docker/nodesource.gpg.key /etc/pki/gpg/GPG-KEY-nodesource +RUN apt-get update && apt-get install -y \ + apt-transport-https \ + gnupg2 \ + && rm -rf /var/lib/apt/lists/* +RUN cat /etc/pki/gpg/GPG-KEY-nodesource | apt-key add - +ADD docker/debian-stretch-nodesource-repo /etc/apt/sources.list.d/nodesource.list +ADD docker/debian-stretch-backports-repo /etc/apt/sources.list.d/backports.list + +# Install system dependencies +# Note: Debian Stretch is EOL, some packages have dependency issues +# We skip libssl-dev as it has broken deps in the archive; cryptography is pre-built in pip +RUN apt-get update && apt-get install -y \ + # General dependencies + bash-completion \ + build-essential \ + curl \ + libjpeg-dev \ + libsasl2-dev \ + 
libxml2-dev \ + libxslt-dev \ + locales \ + zlib1g-dev \ + libffi-dev \ + libmagic-dev \ + nodejs \ + # Git for git-checkout dependencies + git \ + # MySQL client and development headers + mysql-client \ + default-libmysqlclient-dev \ + swig \ + gettext \ + # SVG rendering for theme previews + librsvg2-bin \ + # PNG optimisation for uploaded images + pngcrush \ + # Makefile and UI tests require uuid + uuid \ + # GeoIP lookups + libmaxminddb0 \ + libmaxminddb-dev \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get clean + +# Install tini for proper signal handling in containers +# tini is not available in Debian Stretch apt repos, so we download from GitHub +ARG TINI_VERSION=v0.19.0 +RUN curl -fsSL https://github.com/krallin/tini/releases/download/${TINI_VERSION}/tini -o /usr/bin/tini \ + && chmod +x /usr/bin/tini + +# Compile required locale +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# Create application directories with correct ownership +RUN mkdir -p /data/olympia /app /var/log/olympia /var/run/olympia && \ + chown -R olympia:olympia /data /app /var/log/olympia /var/run/olympia + +# Copy version.json first (for cache invalidation) +COPY --chown=olympia:olympia version.json /app/version.json + +# Copy application code +COPY --chown=olympia:olympia . /data/olympia +WORKDIR /data/olympia + +# Install Python dependencies (as root for system packages then we fix ownership) +RUN pip3 install --no-cache-dir --exists-action=w --no-deps -r requirements/system.txt && \ + pip3 install --no-cache-dir --exists-action=w --no-deps -r requirements/prod_py3.txt && \ + pip3 install --no-cache-dir --exists-action=w --no-deps -e . 
+ +# Link uwsgi to expected paths +RUN ln -s /usr/local/bin/uwsgi /usr/bin/uwsgi && \ + ln -s /usr/bin/uwsgi /usr/sbin/uwsgi + +# Install uwsgi dogstatsd plugin for metrics +WORKDIR /usr/lib/uwsgi/plugins +RUN uwsgi --build-plugin https://github.com/Datadog/uwsgi-dogstatsd && \ + rm -rf uwsgi-dogstatsd + +# Build static assets +WORKDIR /data/olympia +RUN echo "from olympia.lib.settings_base import *\n\ +LESS_BIN = 'node_modules/less/bin/lessc'\n\ +CLEANCSS_BIN = 'node_modules/clean-css-cli/bin/cleancss'\n\ +UGLIFY_BIN = 'node_modules/uglify-js/bin/uglifyjs'\n\ +FXA_CONFIG = {'default': {}, 'internal': {}}\n"\ +> settings_local.py + +RUN DJANGO_SETTINGS_MODULE='settings_local' locale/compile-mo.sh locale + +RUN npm install \ + && make -f Makefile-docker copy_node_js \ + && DJANGO_SETTINGS_MODULE='settings_local' python manage.py compress_assets \ + && DJANGO_SETTINGS_MODULE='settings_local' python manage.py generate_jsi18n_files \ + && DJANGO_SETTINGS_MODULE='settings_local' python manage.py collectstatic --noinput + +# Clean up build-time files +RUN rm -f settings_local.py settings_local.pyc && \ + rm -rf /root/.npm /root/.cache && \ + find /data/olympia -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true && \ + find /data/olympia -type f -name "*.pyc" -delete 2>/dev/null || true + +# Make entrypoint executable +RUN chmod +x /data/olympia/docker/docker-entrypoint.sh + +# Switch to non-root user +USER olympia + +# Expose the application port +EXPOSE 8000 + +# Health check - calls the Django monitor endpoint +# Adjust interval/timeout based on application startup time +HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \ + CMD curl -f http://localhost:${UWSGI_PORT}/services/monitor.json || exit 1 + +# Use tini as init system for proper signal handling +ENTRYPOINT ["/usr/bin/tini", "--", "/data/olympia/docker/docker-entrypoint.sh"] + +# Default command: run web server +CMD ["web"] diff --git a/docker/docker-entrypoint.sh 
b/docker/docker-entrypoint.sh new file mode 100755 index 000000000000..0e890116c7c6 --- /dev/null +++ b/docker/docker-entrypoint.sh @@ -0,0 +1,196 @@ +#!/bin/bash +# docker-entrypoint.sh +# Entrypoint script for ECS Fargate containers +# +# Service modes: +# web -- Run uWSGI Django application (default) +# worker -- Run Celery worker +# versioncheck -- Run versioncheck uWSGI service +# scheduler -- Run Celery beat scheduler +# shell -- Drop into bash shell +# manage -- Run Django management command +# +# Usage examples: +# docker run image web +# docker run image worker --queues default,priority +# docker run image manage migrate +# +set -e + +# Colour codes for logging (disabled if not a TTY) +if [ -t 1 ]; then + RED='\033[0;31m' + GREEN='\033[0;32m' + YELLOW='\033[1;33m' + NC='\033[0m' +else + RED='' + GREEN='' + YELLOW='' + NC='' +fi + +log_info() { + echo -e "${GREEN}[INFO]${NC} $1" +} + +log_warn() { + echo -e "${YELLOW}[WARN]${NC} $1" +} + +log_error() { + echo -e "${RED}[ERROR]${NC} $1" +} + +# Default values (can be overridden by environment variables) +: "${UWSGI_PROCESSES:=4}" +: "${UWSGI_THREADS:=4}" +: "${UWSGI_PORT:=8000}" +: "${UWSGI_HARAKIRI:=30}" +: "${UWSGI_MAX_REQUESTS:=5000}" +: "${CELERY_CONCURRENCY:=4}" +: "${CELERY_QUEUES:=default,devhub,images,limited,priority,reviews}" +: "${CELERY_LOGLEVEL:=info}" +: "${DJANGO_SETTINGS_MODULE:=settings}" + +export DJANGO_SETTINGS_MODULE + +# Working directory +cd /data/olympia + +# Function to run Django management commands +run_manage() { + log_info "Running management command: $*" + exec python3 manage.py "$@" +} + +# Function to start uWSGI for web service +start_web() { + log_info "Starting uWSGI web server..." 
+ log_info "Processes: ${UWSGI_PROCESSES}, Threads: ${UWSGI_THREADS}, Port: ${UWSGI_PORT}" + + # Update product details before starting + python3 manage.py update_product_details || log_warn "Failed to update product details" + + exec uwsgi \ + --module=olympia.wsgi:django_app \ + --master \ + --http=0.0.0.0:${UWSGI_PORT} \ + --processes=${UWSGI_PROCESSES} \ + --threads=${UWSGI_THREADS} \ + --enable-threads \ + --offload-threads=2 \ + --harakiri=${UWSGI_HARAKIRI} \ + --max-requests=${UWSGI_MAX_REQUESTS} \ + --buffer-size=32768 \ + --limit-post=100000000 \ + --post-buffering=8192 \ + --http-timeout=20 \ + --http-connect-timeout=20 \ + --http-keepalive=1 \ + --thunder-lock \ + --single-interpreter \ + --need-app \ + --die-on-term \ + --vacuum \ + --ignore-sigpipe \ + --ignore-write-errors \ + --disable-write-exception \ + --log-5xx \ + --log-slow=1000 \ + --stats=:9191 \ + --stats-http \ + "$@" +} + +# Function to start uWSGI for versioncheck service +start_versioncheck() { + log_info "Starting uWSGI versioncheck server..." + log_info "Processes: ${UWSGI_PROCESSES}, Threads: ${UWSGI_THREADS}, Port: ${UWSGI_PORT}" + + exec uwsgi \ + --module=services.wsgi.versioncheck:application \ + --master \ + --http-socket=0.0.0.0:${UWSGI_PORT} \ + --processes=${UWSGI_PROCESSES} \ + --threads=${UWSGI_THREADS} \ + --enable-threads \ + --offload-threads=2 \ + --max-requests=${UWSGI_MAX_REQUESTS} \ + --need-app \ + --die-on-term \ + --vacuum \ + --ignore-sigpipe \ + --ignore-write-errors \ + --disable-write-exception \ + --stats=:9191 \ + --stats-http \ + "$@" +} + +# Function to start Celery worker +start_worker() { + log_info "Starting Celery worker..." 
+ log_info "Concurrency: ${CELERY_CONCURRENCY}, Queues: ${CELERY_QUEUES}, Loglevel: ${CELERY_LOGLEVEL}" + + exec celery \ + -A olympia.amo.celery \ + worker \ + --loglevel=${CELERY_LOGLEVEL} \ + --concurrency=${CELERY_CONCURRENCY} \ + -Q "${CELERY_QUEUES}" \ + "$@" +} + +# Function to start Celery beat scheduler +start_scheduler() { + log_info "Starting Celery beat scheduler..." + + exec celery \ + -A olympia.amo.celery \ + beat \ + --loglevel=${CELERY_LOGLEVEL} \ + "$@" +} + +# Main entrypoint logic +main() { + SERVICE_MODE="${1:-web}" + + log_info "Service mode: ${SERVICE_MODE}" + log_info "Django settings: ${DJANGO_SETTINGS_MODULE}" + + case "${SERVICE_MODE}" in + web) + shift || true + start_web "$@" + ;; + versioncheck) + shift || true + start_versioncheck "$@" + ;; + worker) + shift || true + start_worker "$@" + ;; + scheduler|beat) + shift || true + start_scheduler "$@" + ;; + shell) + log_info "Starting shell..." + exec /bin/bash + ;; + manage) + shift + run_manage "$@" + ;; + *) + # If the command doesn't match a known service, run it directly + log_info "Running command: $*" + exec "$@" + ;; + esac +} + +main "$@" diff --git a/infra/pulumi/Pulumi.stage.yaml b/infra/pulumi/Pulumi.stage.yaml new file mode 100644 index 000000000000..16f12c3f27e4 --- /dev/null +++ b/infra/pulumi/Pulumi.stage.yaml @@ -0,0 +1,2 @@ +config: + aws:region: us-west-2 diff --git a/infra/pulumi/Pulumi.yaml b/infra/pulumi/Pulumi.yaml new file mode 100644 index 000000000000..b4a3b6b67e8d --- /dev/null +++ b/infra/pulumi/Pulumi.yaml @@ -0,0 +1,12 @@ +name: thunderbird-addons +runtime: + name: python + options: + virtualenv: .venv +description: Infrastructure for Thunderbird Add-ons (addons.thunderbird.net) + +# Project configuration +config: + pulumi:tags: + value: + pulumi:template: aws-python diff --git a/infra/pulumi/README.md b/infra/pulumi/README.md new file mode 100644 index 000000000000..ccd0c801d522 --- /dev/null +++ b/infra/pulumi/README.md @@ -0,0 +1,156 @@ +# Thunderbird 
Add-ons Infra (Pulumi) + +ECS Fargate infrastructure for addons-server + +## Prerequisites + +- Python 3.13+ +- Pulumi CLI +- AWS credentials (local credentials only; CI uses OIDC) + +## Setup + +```bash +python3.13 -m venv .venv +source .venv/bin/activate +pip install -r requirements.txt +pulumi login # browser-based authn flow Pulumi Cloud + +# Select the staging stack (name may vary depending on org setup) +pulumi stack select thunderbird/thunderbird-addons/stage +``` + +## Preview Changes + +```bash +pulumi preview +``` + +## CI/CD + +GitHub Actions workflow (`.github/workflows/build-and-push.yml`) handles image builds. + +- **Pull requests**: Build validation only (no AWS auth) +- **Push to stage**: Build + push to ECR via OIDC + +### Enabling ECR Publishing + +1. Ensure AWS OIDC provider exists for `token.actions.githubusercontent.com` +2. IAM role is created by Pulumi with trust policy scoped to `refs/heads/stage` +3. Set repository variable: `AWS_ROLE_ARN` (from Pulumi output `gha_ecr_publish_role_arn`) + +## Scheduled Tasks + +Scheduled tasks mirror the existing cron workload from the legacy environment and are executed as ECS tasks via EventBridge Scheduler. 
+ +16 cron jobs run via EventBridge Scheduler: + +| Task | Schedule | Command | +|------|----------|---------| +| auto-approve | Every 5 min | `manage auto_approve` | +| addon-last-updated | Hourly (:20) | `manage addon_last_updated` | +| info-request-warning | Hourly (:15) | `manage send_info_request_warning` | +| update-addon-appsupport | Hourly (:45) | `manage update_addon_appsupport` | +| cleanup-extracted-file | Hourly (:50) | `manage cleanup_extracted_file` | +| unhide-disabled-files | Hourly (:55) | `manage unhide_disabled_files` | +| hide-disabled-files | 05:25, 17:25 UTC | `manage hide_disabled_files` | +| cleanup-image-files | 06:25, 18:25 UTC | `manage cleanup_image_files` | +| update-user-ratings | 01:00 UTC | `manage update_user_ratings` | +| gc | 22:00 UTC | `manage gc` | +| dump-apps | 01:30 UTC | `manage dump_apps` | +| update-product-details | 01:45 UTC | `manage update_product_details` | +| add-latest-appversion | 02:00 UTC | `manage add_latest_appversion` | +| category-totals | 14:30 UTC | `manage category_totals` | +| update-global-totals | 00:40 UTC | `manage update_addon_hotness` | +| update-addon-daily-users | 00:20 UTC | `manage update_addon_daily_users` | + +## Image Tagging + +- `atn-stage-addons-server:stage-latest` - current stage build +- ECR lifecycle: keep 50 tagged images, expire untagged after 7 days + +## Secrets + +No secrets are stored in the repository. + +Application expects Secrets Manager paths under `atn/stage/*`: +- Database credentials +- Django secret key +- External service API keys + +See `settings_local_stage.py` for full mapping. 
+ +## Post-Deployment Verification + +All commands are read-only + +### ECR Repository + +```bash +aws ecr describe-images \ + --repository-name atn-stage-addons-server \ + --region us-west-2 \ + --query 'imageDetails[*].[imageTags,imagePushedAt]' \ + --output table +``` + +### ECS Services + +```bash +# List services +aws ecs list-services --cluster atn-stage-web-cluster --region us-west-2 +aws ecs list-services --cluster atn-stage-worker-cluster --region us-west-2 + +# Check service status +aws ecs describe-services \ + --cluster atn-stage-web-cluster \ + --services atn-stage-web \ + --region us-west-2 \ + --query 'services[*].[serviceName,runningCount,desiredCount,status]' +``` + +### Scheduled Tasks + +```bash +aws scheduler list-schedules \ + --group-name thunderbird-addons-stage-cron \ + --region us-west-2 \ + --query 'Schedules[*].[Name,State,ScheduleExpression]' \ + --output table +``` + +### CloudWatch Logs + +```bash +# Recent web logs +aws logs tail /ecs/thunderbird-addons-stage-web --since 5m --region us-west-2 + +# Recent cron logs +aws logs tail /ecs/thunderbird-addons-stage-cron --since 5m --region us-west-2 +``` + +### ALB Health Check + +```bash +# Get ALB DNS (after deployment) +pulumi stack output --json | jq -r '.web_alb_dns' + +# Test health endpoint +curl -I https://{alb-dns}/services/monitor +``` + +## Resources Created + +- New VPC with public/private subnets across 3 AZs (connectivity to existing RDS may require VPC peering - confirm with Andrei) +- ECR repository with lifecycle policy +- ECS clusters (web, worker) +- Fargate services (web, worker, versioncheck) +- ElastiCache Redis cluster +- 16 EventBridge scheduled tasks +- ALB with HTTPS listener +- IAM roles (task execution, task, scheduler, OIDC) +- CloudWatch log groups + +## Workflow + +All infrastructure changes are proposed via pull requests and reviewed before deployment. Direct `pulumi up` execution is restricted to approved paths. 
diff --git a/infra/pulumi/__main__.py b/infra/pulumi/__main__.py new file mode 100755 index 000000000000..6cbf83b6a06b --- /dev/null +++ b/infra/pulumi/__main__.py @@ -0,0 +1,966 @@ +#!/usr/bin/env python3 +""" +Thunderbird Add-ons Server Infra + +This Pulumi program aims to define the AWS infra for the Thunderbird Add-ons +server (ATN), migrating from EC2/Ansible to ECS Fargate + +Architecture: + - VPC with public/private subnets + - ECR repository for container images + - Fargate services: web, worker, versioncheck + - ElastiCache Redis for Celery + - (Future) RDS MySQL, OpenSearch, EFS + +Usage: + pulumi preview # See planned changes + pulumi up # Apply changes + +Configuration is defined in config.{stack}.yaml files +""" + +import json + +import pulumi +import pulumi_aws as aws +import tb_pulumi +import tb_pulumi.autoscale +import tb_pulumi.elasticache +import tb_pulumi.fargate +import tb_pulumi.network + + +def main(): + # Create a ThunderbirdPulumiProject to aggregate resources + # This loads config.{stack}.yaml automatically + project = tb_pulumi.ThunderbirdPulumiProject() + + # ========================================================================= + # Extended resource tags + # ========================================================================= + # tb_pulumi sets 4 default tags: environment, project, pulumi_project, + # pulumi_stack. We extend with operational and FinOps tags BEFORE creating + # any resources so all ThunderbirdComponentResources inherit them via + # their __init__ copy of common_tags. 
Resources created directly via + # aws.* also pick these up through project.common_tags spread + project.common_tags.update( + { + "managed_by": "pulumi", + "repository": "thunderbird/addons-server", + "repository_url": "https://github.com/thunderbird/addons-server", + "owner": "thunderbird", + "service": "addons", + "lifecycle": "ephemeral" if project.stack == "stage" else "persistent", + } + ) + + # Pull the resources configuration + resources = project.config.get("resources", {}) + + # ========================================================================= + # VPC - Multi-tier network with public/private subnets + # ========================================================================= + vpc_config = resources.get("tb:network:MultiTierVpc", {}).get("vpc", {}) + + if vpc_config: + vpc = tb_pulumi.network.MultiTierVpc( + name=f"{project.name_prefix}-vpc", + project=project, + **vpc_config, + ) + + # Extract subnets for use by other resources + private_subnets = vpc.resources.get("private_subnets", []) + public_subnets = vpc.resources.get("public_subnets", []) + vpc_resource = vpc.resources.get("vpc") + + # ----------------------------------------------------------------- + # VPC Peering to default VPC (RDS, Redis, RabbitMQ, ES, EFS) + # ----------------------------------------------------------------- + # We handle peering manually (not via MultiTierVpc config) because + # MultiTierVpc places peering routes on vpc.default_route_table_id, + # but our subnets use custom public/private route tables created by + # egress_via_internet_gateway / egress_via_nat_gateway. 
Routes on + # the default table would/should be unreachable + + # Create the peering connection + default_vpc_peer = aws.ec2.VpcPeeringConnection( + f"{project.name_prefix}-pcx-default-vpc", + vpc_id=vpc_resource.id, + peer_vpc_id="vpc-441e5e22", + auto_accept=True, + tags={ + **project.common_tags, + "Name": f"{project.name_prefix}-to-default-vpc", + }, + opts=pulumi.ResourceOptions(depends_on=[vpc_resource]), + ) + + # Enable DNS resolution across the peering connection + aws.ec2.PeeringConnectionOptions( + f"{project.name_prefix}-pcx-requester-dns", + vpc_peering_connection_id=default_vpc_peer.id, + requester=aws.ec2.PeeringConnectionOptionsRequesterArgs( + allow_remote_vpc_dns_resolution=True, + ), + opts=pulumi.ResourceOptions(depends_on=[default_vpc_peer]), + ) + + # Add peering route to the PRIVATE route table (ECS tasks need + # to reach RDS/Redis/RabbitMQ/ES/EFS in 172.31.0.0/16) + # Extract route table ID from the route table associations that + # MultiTierVpc exposes (the actual RouteTable is a local variable + # inside the component and not directly accessible) + private_rt_assocs = vpc.resources.get( + "private_route_table_subnet_associations", [] + ) + if private_rt_assocs: + aws.ec2.Route( + f"{project.name_prefix}-private-rt-pcx-route", + route_table_id=private_rt_assocs[0].route_table_id, + destination_cidr_block="172.31.0.0/16", + vpc_peering_connection_id=default_vpc_peer.id, + opts=pulumi.ResourceOptions( + depends_on=[default_vpc_peer, private_rt_assocs[0]] + ), + ) + + # Add peering route to the PUBLIC route table (ALB health checks + # may need to reach backends in the default VPC) + public_rt_assocs = vpc.resources.get( + "public_route_table_subnet_associations", [] + ) + if public_rt_assocs: + aws.ec2.Route( + f"{project.name_prefix}-public-rt-pcx-route", + route_table_id=public_rt_assocs[0].route_table_id, + destination_cidr_block="172.31.0.0/16", + vpc_peering_connection_id=default_vpc_peer.id, + opts=pulumi.ResourceOptions( + 
depends_on=[default_vpc_peer, public_rt_assocs[0]] + ), + ) + + # Return route: default VPC -> our VPC via peering + aws.ec2.Route( + f"{project.name_prefix}-default-vpc-return-route", + # Default VPC's sole route table (overrideable via config below) + route_table_id=resources.get("tb:network:DefaultVpcIngressRules", {}).get( + "default_vpc_route_table_id", + "rtb-0657e07f", + ), + destination_cidr_block="10.100.0.0/16", + vpc_peering_connection_id=default_vpc_peer.id, + opts=pulumi.ResourceOptions(depends_on=[default_vpc_peer]), + ) + + # SG rules on existing security groups in the default VPC + # Smoke test revealed that different services use different SGs: + # + # sg-d5539ea9 (amo-services-prod-tb): + # Redis, Memcached, ES/OpenSearch, EFS + # sg-5133b52c (default VPC SG): + # RDS MySQL, RabbitMQ (and self-referencing for internal comms) + # + # We add our VPC CIDR to both SGs for the relevant ports + + # --- sg-d5539ea9: services SG (Redis, Memcached, ES, EFS) --- + default_vpc_ingress_cfg = resources.get("tb:network:DefaultVpcIngressRules", {}) + stage_vpc_cidr = default_vpc_ingress_cfg.get("stage_vpc_cidr", "10.100.0.0/16") + + services_sg_ids = default_vpc_ingress_cfg.get( + "services_sg_ids", + ["sg-d5539ea9"], + ) + services_sg_ports = { + "redis": 6379, + "memcached": 11211, + "elasticsearch": 9200, + "elasticsearch-https": 443, # Managed AWS ES speaks HTTPS + "efs": 2049, + } + for sg_id in services_sg_ids: + for svc_name, port in services_sg_ports.items(): + aws.ec2.SecurityGroupRule( + f"{project.name_prefix}-default-vpc-sg-{svc_name}-{sg_id[-4:]}", + type="ingress", + security_group_id=sg_id, + from_port=port, + to_port=port, + protocol="tcp", + cidr_blocks=[stage_vpc_cidr], + description=f"Allow {svc_name} from ATN stage VPC", + opts=pulumi.ResourceOptions(depends_on=[default_vpc_peer]), + ) + + # --- sg-5133b52c: default VPC SG (RDS, RabbitMQ) --- + default_sg_ids = default_vpc_ingress_cfg.get( + "default_sg_ids", + ["sg-5133b52c"], + ) + 
default_sg_ports = { + "mysql": 3306, + "rabbitmq": 5672, + } + for sg_id in default_sg_ids: + for svc_name, port in default_sg_ports.items(): + aws.ec2.SecurityGroupRule( + f"{project.name_prefix}-default-vpc-defsg-{svc_name}-{sg_id[-4:]}", + type="ingress", + security_group_id=sg_id, + from_port=port, + to_port=port, + protocol="tcp", + cidr_blocks=[stage_vpc_cidr], + description=f"Allow {svc_name} from ATN stage VPC", + opts=pulumi.ResourceOptions(depends_on=[default_vpc_peer]), + ) + + else: + private_subnets = [] + public_subnets = [] + vpc_resource = None + + # ========================================================================= + # ECR Repository + # ========================================================================= + # ECR is not part of tb_pulumi, so we use the AWS provider directly + # This creates a private repository for the addons-server container images + ecr_config = resources.get("aws:ecr:Repository", {}) + ecr_repositories = {} + + for repo_name, repo_config in ecr_config.items(): + # Create ECR repository + # force_delete allows pulumi destroy to succeed even if images exist + # (safe for staging; prod should set this to False) + ecr_repo = aws.ecr.Repository( + f"{project.name_prefix}-{repo_name}", + name=repo_config.get("name", f"{project.name_prefix}-{repo_name}"), + image_tag_mutability=repo_config.get("image_tag_mutability", "MUTABLE"), + force_delete=repo_config.get("force_delete", False), + image_scanning_configuration=aws.ecr.RepositoryImageScanningConfigurationArgs( + scan_on_push=repo_config.get("scan_on_push", True), + ), + encryption_configurations=[ + aws.ecr.RepositoryEncryptionConfigurationArgs( + encryption_type=repo_config.get("encryption_type", "AES256"), + ) + ], + tags={ + **project.common_tags, + "Name": f"{project.name_prefix}-{repo_name}", + }, + ) + + # Lifecycle policy to manage image retention + lifecycle_policy = repo_config.get("lifecycle_policy") + if lifecycle_policy: + aws.ecr.LifecyclePolicy( + 
f"{project.name_prefix}-{repo_name}-lifecycle", + repository=ecr_repo.name, + policy=lifecycle_policy, + opts=pulumi.ResourceOptions(parent=ecr_repo), + ) + + ecr_repositories[repo_name] = ecr_repo + + # Export repository URL for CI/CD pipelines + pulumi.export(f"ecr_{repo_name}_url", ecr_repo.repository_url) + + # ========================================================================= + # GitHub Actions OIDC Role for ECR publishing + # ========================================================================= + # This role allows GH Actions to push images to ECR via OIDC + # + # Prerequisites + # - OIDC provider exists: token.actions.githubusercontent.com + # - After deployment we set AWS_ROLE_ARN as GitHub repo variable + # + # Trust policy restricts to + # - this specific repository + # - the stage branch only + # - only the build-and-push.yml workflow + gha_oidc_config = resources.get("aws:iam:GitHubActionsOIDCRole", {}) + addons_repo = ecr_repositories.get("addons-server") + + if gha_oidc_config and not addons_repo: + pulumi.log.warn( + "OIDC role config present but aws:ecr:Repository.addons-server not defined " + "in this stack; so skipping OIDC role creation" + ) + + if gha_oidc_config and addons_repo: + github_org = gha_oidc_config.get("github_org", "thunderbird") + github_repo = gha_oidc_config.get("github_repo", "addons-server") + allowed_branches = gha_oidc_config.get("allowed_branches", ["stage"]) + workflow_file = gha_oidc_config.get( + "workflow_file", ".github/workflows/build-and-push.yml" + ) + + # Build the subject conditions for allowed branches + sub_conditions = [ + f"repo:{github_org}/{github_repo}:ref:refs/heads/{branch}" + for branch in allowed_branches + ] + + # Build workflow ref conditions (job_workflow_ref hardening) + workflow_ref_conditions = [ + f"{github_org}/{github_repo}/{workflow_file}@refs/heads/{branch}" + for branch in allowed_branches + ] + + gha_trust_policy = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + 
"Effect": "Allow", + "Principal": { + "Federated": f"arn:aws:iam::{project.aws_account_id}:oidc-provider/token.actions.githubusercontent.com" + }, + "Action": "sts:AssumeRoleWithWebIdentity", + "Condition": { + "StringEquals": { + "token.actions.githubusercontent.com:aud": "sts.amazonaws.com", + "token.actions.githubusercontent.com:iss": "https://token.actions.githubusercontent.com", + }, + "StringLike": { + "token.actions.githubusercontent.com:sub": sub_conditions + if len(sub_conditions) > 1 + else sub_conditions[0], + "token.actions.githubusercontent.com:job_workflow_ref": workflow_ref_conditions + if len(workflow_ref_conditions) > 1 + else workflow_ref_conditions[0], + }, + }, + } + ], + } + ) + + gha_ecr_publish_role = aws.iam.Role( + f"{project.name_prefix}-gha-ecr-publish", + name=f"{project.name_prefix}-gha-ecr-publish", + description=f"GitHub Actions OIDC role for ECR publishing ({github_org}/{github_repo})", + assume_role_policy=gha_trust_policy, + tags=project.common_tags, + ) + + # ECR push permissions derive ARN from actual repo to avoid drifts + gha_ecr_policy_doc = addons_repo.arn.apply( + lambda arn: json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "ECRAuth", + "Effect": "Allow", + "Action": "ecr:GetAuthorizationToken", + "Resource": "*", + }, + { + "Sid": "ECRPush", + "Effect": "Allow", + "Action": [ + "ecr:BatchCheckLayerAvailability", + "ecr:BatchGetImage", + "ecr:CompleteLayerUpload", + "ecr:DescribeImages", + "ecr:DescribeRepositories", + "ecr:GetDownloadUrlForLayer", + "ecr:InitiateLayerUpload", + "ecr:ListImages", + "ecr:PutImage", + "ecr:UploadLayerPart", + ], + "Resource": arn, + }, + ], + } + ) + ) + + gha_ecr_policy = aws.iam.Policy( + f"{project.name_prefix}-gha-ecr-push-policy", + name=f"{project.name_prefix}-gha-ecr-push", + description="Allows GitHub Actions to push images to ECR", + policy=gha_ecr_policy_doc, + tags=project.common_tags, + ) + + aws.iam.RolePolicyAttachment( + 
f"{project.name_prefix}-gha-ecr-policy-attachment", + role=gha_ecr_publish_role.name, + policy_arn=gha_ecr_policy.arn, + ) + + # Export the role ARN for GitHub repo variable setup + pulumi.export("gha_ecr_publish_role_arn", gha_ecr_publish_role.arn) + + # ========================================================================= + # Security Groups (accounts-repo pattern) + # ========================================================================= + # Pattern is: separate load_balancers and containers sections + # For each service, matching entries in both. Workers with no LB set to + # null. Code dynamically wires source_security_group_id from LB SG to + # container ingress + sg_configs = resources.get("tb:network:SecurityGroupWithRules", {}) + lb_sg_configs = sg_configs.get("load_balancers", {}) + container_sg_configs = sg_configs.get("containers", {}) + + # Build security groups for load balancers + lb_sgs = {} + for service, sg_config in lb_sg_configs.items(): + if sg_config is None: + lb_sgs[service] = None + continue + if vpc_resource: + sg_config["vpc_id"] = vpc_resource.id + lb_sgs[service] = tb_pulumi.network.SecurityGroupWithRules( + name=f"{project.name_prefix}-sg-lb-{service}", + project=project, + opts=pulumi.ResourceOptions(depends_on=[vpc] if vpc_config else None), + **sg_config, + ) + + # Build security groups for containers + # Wire source_security_group_id from LB SG to container ingress rules + container_sgs = {} + for service, sg_config in container_sg_configs.items(): + if service not in lb_sg_configs: + pulumi.log.warn( + f"Container SG '{service}' has no matching load_balancers entry" + ) + # Dynamically set source_security_group_id for ingress rules + if lb_sgs.get(service) is not None: + for rule in sg_config.get("rules", {}).get("ingress", []): + if "self" not in rule or not rule.get("self"): + rule["source_security_group_id"] = ( + lb_sgs[service].resources["sg"].id + ) + if vpc_resource: + sg_config["vpc_id"] = vpc_resource.id + 
depends_on = [] + if lb_sgs.get(service): + depends_on.append(lb_sgs[service].resources["sg"]) + if vpc_config: + depends_on.append(vpc) + container_sgs[service] = tb_pulumi.network.SecurityGroupWithRules( + name=f"{project.name_prefix}-sg-cont-{service}", + project=project, + opts=pulumi.ResourceOptions(depends_on=depends_on) if depends_on else None, + **sg_config, + ) + + # ========================================================================= + # Fargate App Task Role + # ========================================================================= + # tb_pulumi creates a task_role per FargateClusterWithLogging but only + # sets it as execution_role_arn (image pulls, log writes, ECS-injected + # secrets). It does NOT set task_role_arn on the ECS task definition, so + # the container has no IAM identity at runtime -- boto3 calls (e.g., the + # app fetching secrets directly from Secrets Manager) would fail + # + # Approach: create a shared app-level task role with runtime permissions, + # inject its ARN into each service task_definition config dict before + # passing to FargateClusterWithLogging. 
The dict gets splatted into + # aws.ecs.TaskDefinition(**task_def), so task_role_arn propagates cleanly + fargate_app_task_role = None + if vpc_resource: + app_task_assume_role = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "ecs-tasks.amazonaws.com"}, + "Action": "sts:AssumeRole", + } + ], + } + ) + + fargate_app_task_role = aws.iam.Role( + f"{project.name_prefix}-fargate-app-task-role", + name=f"{project.name_prefix}-fargate-app-task-role", + description="Runtime IAM role for Fargate containers (boto3 / SDK calls)", + assume_role_policy=app_task_assume_role, + tags=project.common_tags, + ) + + # Attach the atn/{stack}/* secrets policy so the app can fetch secrets + # at runtime via boto3 (settings_local.py reads from Secrets Manager) + # NOTE: here if any secret uses a customer-managed KMS key, kms:Decrypt + # will also be needed here -- add as a follow-up if GetSecretValue + # returns AccessDenied + app_task_secrets_policy_doc = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "AllowATNSecretsAccess", + "Effect": "Allow", + "Action": "secretsmanager:GetSecretValue", + "Resource": f"arn:aws:secretsmanager:{project.aws_region}:{project.aws_account_id}:secret:atn/{project.stack}/*", + } + ], + } + ) + + app_task_secrets_policy = aws.iam.Policy( + f"{project.name_prefix}-app-task-secrets-policy", + name=f"{project.name_prefix}-app-task-secrets", + description="Allows Fargate app containers to read atn secrets at runtime", + policy=app_task_secrets_policy_doc, + tags=project.common_tags, + ) + + aws.iam.RolePolicyAttachment( + f"{project.name_prefix}-app-task-secrets-attachment", + role=fargate_app_task_role.name, + policy_arn=app_task_secrets_policy.arn, + ) + + # ========================================================================= + # Fargate Services + # ========================================================================= + fargate_configs = 
resources.get("tb:fargate:FargateClusterWithLogging", {}) + fargate_services = {} + + for service_name, service_config in fargate_configs.items(): + # Inject subnet IDs based on whether service is internal or external + is_internal = service_config.get("internal", True) + subnets = private_subnets if is_internal else public_subnets + + if subnets: + # Get security groups for this service + lb_sg = lb_sgs.get(service_name) + container_sg = container_sgs.get(service_name) + + # Extract SG IDs + lb_sg_ids = [lb_sg.resources["sg"].id] if lb_sg else [] + container_sg_ids = [container_sg.resources["sg"].id] if container_sg else [] + + # Inject task_role_arn into the task definition so containers + # have an IAM identity at runtime (cf. Fargate App Task Role + # section above for why this is needed) + # setdefault ensures the dict is on service_config even if + # task_definition was absent, so the ARN isn't dropped when + # **service_config is spread into the constructor. + task_def = service_config.setdefault("task_definition", {}) + if fargate_app_task_role and "task_role_arn" not in task_def: + task_def["task_role_arn"] = fargate_app_task_role.arn + + # Build depends_on list + depends_on = [*subnets] + if container_sg: + depends_on.append(container_sg.resources["sg"]) + if lb_sg: + depends_on.append(lb_sg.resources["sg"]) + if fargate_app_task_role: + depends_on.append(fargate_app_task_role) + + fargate_services[service_name] = ( + tb_pulumi.fargate.FargateClusterWithLogging( + name=f"{project.name_prefix}-{service_name}", + project=project, + subnets=subnets, # Pass subnet objects; tb_pulumi extracts .id internally + container_security_groups=container_sg_ids, + load_balancer_security_groups=lb_sg_ids if not is_internal else [], + opts=pulumi.ResourceOptions(depends_on=depends_on), + **service_config, + ) + ) + + # ========================================================================= + # Additional Secrets Manager access for Fargate execution roles + # 
========================================================================= + # tb_pulumi scopes its auto-created secrets policy to + # {project}/{stack}/* = thunderbird-addons/stage/*, but the app expects + # atn/stage/* (existing convention). We attach an additional policy to + # each tb_pulumi-managed execution role so the ECS agent can inject + # atn secrets into containers at launch time. + # + # Note: runtime boto3 access is here handled by the separate app task role + # (fargate_app_task_role) created above, which has its own secrets policy. + atn_exec_secrets_policy_doc = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "AllowATNSecretsAccess", + "Effect": "Allow", + "Action": "secretsmanager:GetSecretValue", + "Resource": f"arn:aws:secretsmanager:{project.aws_region}:{project.aws_account_id}:secret:atn/{project.stack}/*", + } + ], + } + ) + + atn_exec_secrets_policy = aws.iam.Policy( + f"{project.name_prefix}-atn-secrets-policy", + name=f"{project.name_prefix}-atn-secrets", + description=f"Allows ECS execution role to access atn/{project.stack}/* secrets", + policy=atn_exec_secrets_policy_doc, + tags=project.common_tags, + ) + + for service_name, fargate_service in fargate_services.items(): + task_role = fargate_service.resources.get("task_role") + if task_role: + aws.iam.RolePolicyAttachment( + f"{project.name_prefix}-{service_name}-atn-secrets", + role=task_role.name, + policy_arn=atn_exec_secrets_policy.arn, + ) + + # ========================================================================= + # ECS Service Autoscaling + # ========================================================================= + # Target-tracking policies for CPU and memory. Thresholds are sensible + # defaults based on the thunderbird-accounts pattern; to be tuned after + # observing real workload performance + # + # Config-driven: each service can optionally have an "autoscaling" key in + # config.stage.yaml. 
If absent, no autoscaler is created for that service + autoscaling_configs = resources.get("tb:autoscale:EcsServiceAutoscaler", {}) + + for service_name, scaling_config in autoscaling_configs.items(): + fargate_svc = fargate_services.get(service_name) + if not fargate_svc: + pulumi.log.warn( + f"Autoscaling config for '{service_name}' but no matching Fargate service" + ) + continue + + ecs_service = fargate_svc.resources.get("service") + if ecs_service: + tb_pulumi.autoscale.EcsServiceAutoscaler( + name=f"{project.name_prefix}-{service_name}-autoscaler", + project=project, + service=ecs_service, + **scaling_config, + ) + + # ========================================================================= + # ElastiCache - Redis + # ========================================================================= + elasticache_configs = resources.get( + "tb:elasticache:ElastiCacheReplicationGroup", {} + ) + elasticache_clusters = {} + + for cluster_name, cluster_config in elasticache_configs.items(): + if private_subnets: + # Add source access from private subnets + if "source_cidrs" not in cluster_config: + cluster_config["source_cidrs"] = ["10.100.0.0/16"] # VPC CIDR + + elasticache_clusters[cluster_name] = ( + tb_pulumi.elasticache.ElastiCacheReplicationGroup( + name=f"{project.name_prefix}-{cluster_name}", + project=project, + subnets=private_subnets, + **cluster_config, + ) + ) + + # ========================================================================= + # ECS Scheduled Tasks (Cron Jobs) + # ========================================================================= + # Uses EventBridge Scheduler to run management commands on schedule + # Each scheduled task runs as a Fargate task with command override + scheduled_tasks_config = resources.get("aws:scheduler:ScheduledTasks", {}) + addons_ecr_repo = ecr_repositories.get("addons-server") + + if scheduled_tasks_config and private_subnets and addons_ecr_repo: + # 
--------------------------------------------------------------------- + # Task Execution Role (ECS to pull images and write logs) + # --------------------------------------------------------------------- + task_execution_assume_role = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "ecs-tasks.amazonaws.com"}, + "Action": "sts:AssumeRole", + } + ], + } + ) + + cron_execution_role = aws.iam.Role( + f"{project.name_prefix}-cron-execution-role", + name=f"{project.name_prefix}-cron-execution-role", + assume_role_policy=task_execution_assume_role, + tags=project.common_tags, + ) + + # Attach AWS managed policy for ECS task execution + aws.iam.RolePolicyAttachment( + f"{project.name_prefix}-cron-execution-policy", + role=cron_execution_role.name, + policy_arn="arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy", + ) + + # Additional policy for Secrets Manager access + cron_secrets_policy_doc = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["secretsmanager:GetSecretValue"], + "Resource": [ + f"arn:aws:secretsmanager:{project.aws_region}:{project.aws_account_id}:secret:atn/{project.stack}/*" + ], + } + ], + } + ) + + cron_secrets_policy = aws.iam.Policy( + f"{project.name_prefix}-cron-secrets-policy", + name=f"{project.name_prefix}-cron-secrets-policy", + policy=cron_secrets_policy_doc, + tags=project.common_tags, + ) + + aws.iam.RolePolicyAttachment( + f"{project.name_prefix}-cron-secrets-attachment", + role=cron_execution_role.name, + policy_arn=cron_secrets_policy.arn, + ) + + # --------------------------------------------------------------------- + # Task Role (container to access AWS resources) + # --------------------------------------------------------------------- + cron_task_role = aws.iam.Role( + f"{project.name_prefix}-cron-task-role", + name=f"{project.name_prefix}-cron-task-role", + assume_role_policy=task_execution_assume_role, 
+ tags=project.common_tags, + ) + + # Also attach secrets policy to the cron TASK role (not just + # execution role). The execution role is used by ECS to pull + # images/inject secrets; the task role is used by the container + # at runtime for boto3 calls (e.g. fetching secrets directly). + aws.iam.RolePolicyAttachment( + f"{project.name_prefix}-cron-task-secrets-attachment", + role=cron_task_role.name, + policy_arn=cron_secrets_policy.arn, + ) + + # --------------------------------------------------------------------- + # CloudWatch Log Group for cron tasks + # --------------------------------------------------------------------- + aws.cloudwatch.LogGroup( + f"{project.name_prefix}-cron-logs", + name=f"/ecs/{project.name_prefix}-cron", + retention_in_days=30, + tags=project.common_tags, + ) + + # --------------------------------------------------------------------- + # Cron Task Definition + # --------------------------------------------------------------------- + # Lightweight task definition for management commands + # Command here is overridden per schedule via container overrides + cron_container_def = addons_ecr_repo.repository_url.apply( + lambda url: json.dumps( + [ + { + "name": "cron", + "image": f"{url}:stage-latest", + "essential": True, + "command": [ + "manage", + "help", + ], # Default; again overridden per schedule + "environment": [ + {"name": "DJANGO_SETTINGS_MODULE", "value": "settings"} + ], + "logConfiguration": { + "logDriver": "awslogs", + "options": { + "awslogs-group": f"/ecs/{project.name_prefix}-cron", + "awslogs-region": project.aws_region, + "awslogs-stream-prefix": "cron", + }, + }, + } + ] + ) + ) + + cron_task_definition = aws.ecs.TaskDefinition( + f"{project.name_prefix}-cron", + family=f"{project.name_prefix}-cron", + cpu="512", # 0.5 vCPU - probably sufficient for management commands + memory="1024", # 1 GB + network_mode="awsvpc", + requires_compatibilities=["FARGATE"], + execution_role_arn=cron_execution_role.arn, + 
task_role_arn=cron_task_role.arn, + container_definitions=cron_container_def, + tags=project.common_tags, + ) + + # --------------------------------------------------------------------- + # EventBridge Scheduler IAM Role + # --------------------------------------------------------------------- + scheduler_assume_role_policy = json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Principal": {"Service": "scheduler.amazonaws.com"}, + "Action": "sts:AssumeRole", + } + ], + } + ) + + scheduler_role = aws.iam.Role( + f"{project.name_prefix}-scheduler-role", + name=f"{project.name_prefix}-scheduler-role", + assume_role_policy=scheduler_assume_role_policy, + tags=project.common_tags, + ) + + # Policy for Scheduler to run ECS tasks and pass roles. + # PassRole is scoped to only the cron execution and task roles + # (not Resource: * which would allow privilege escalation) + scheduler_policy_doc = pulumi.Output.all( + cron_task_definition.arn, + cron_execution_role.arn, + cron_task_role.arn, + ).apply( + lambda args: json.dumps( + { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "RunTask", + "Effect": "Allow", + "Action": ["ecs:RunTask"], + "Resource": [args[0]], + "Condition": { + "ArnLike": { + "ecs:cluster": f"arn:aws:ecs:{project.aws_region}:{project.aws_account_id}:cluster/{project.name_prefix}-worker-cluster" + } + }, + }, + { + "Sid": "PassRole", + "Effect": "Allow", + "Action": ["iam:PassRole"], + "Resource": [args[1], args[2]], + "Condition": { + "StringLike": { + "iam:PassedToService": "ecs-tasks.amazonaws.com" + } + }, + }, + ], + } + ) + ) + + scheduler_policy = aws.iam.Policy( + f"{project.name_prefix}-scheduler-policy", + name=f"{project.name_prefix}-scheduler-policy", + policy=scheduler_policy_doc, + tags=project.common_tags, + ) + + aws.iam.RolePolicyAttachment( + f"{project.name_prefix}-scheduler-policy-attachment", + role=scheduler_role.name, + policy_arn=scheduler_policy.arn, + ) + + # 
--------------------------------------------------------------------- + # Schedule Group (organises all cron schedules) + # --------------------------------------------------------------------- + schedule_group = aws.scheduler.ScheduleGroup( + f"{project.name_prefix}-cron-group", + name=f"{project.name_prefix}-cron", + tags=project.common_tags, + ) + + # --------------------------------------------------------------------- + # Create EventBridge Schedules per each cron job + # --------------------------------------------------------------------- + # Get worker security group for network config + worker_sg = container_sgs.get("worker") + worker_sg_id = worker_sg.resources["sg"].id if worker_sg else None + + # Get private subnet IDs + private_subnet_ids = [s.id for s in private_subnets] + + for task_name, task_config in scheduled_tasks_config.items(): + schedule_expr = task_config.get("schedule_expression", "rate(1 day)") + command = task_config.get("command", ["manage", "help"]) + description = task_config.get("description", f"Scheduled task: {task_name}") + + # Create the schedule + aws.scheduler.Schedule( + f"{project.name_prefix}-{task_name}", + name=f"{project.name_prefix}-{task_name}", + group_name=schedule_group.name, + schedule_expression=schedule_expr, + schedule_expression_timezone="UTC", + description=description, + flexible_time_window=aws.scheduler.ScheduleFlexibleTimeWindowArgs( + mode="OFF" # Would run exactly at scheduled time + ), + target=aws.scheduler.ScheduleTargetArgs( + arn=f"arn:aws:ecs:{project.aws_region}:{project.aws_account_id}:cluster/{project.name_prefix}-worker-cluster", + role_arn=scheduler_role.arn, + ecs_parameters=aws.scheduler.ScheduleTargetEcsParametersArgs( + task_definition_arn=cron_task_definition.arn, + task_count=1, + launch_type="FARGATE", + platform_version="LATEST", + network_configuration=aws.scheduler.ScheduleTargetEcsParametersNetworkConfigurationArgs( + subnets=private_subnet_ids, + security_groups=[worker_sg_id] if 
worker_sg_id else [], + assign_public_ip=False, + ), + ), + input=json.dumps( + {"containerOverrides": [{"name": "cron", "command": command}]} + ), + ), + state="ENABLED", + opts=pulumi.ResourceOptions( + parent=schedule_group, + depends_on=[cron_task_definition, scheduler_role], + ), + ) + + pulumi.log.info(f"Scheduled task: {task_name} - {schedule_expr}") + + # Export scheduled task info + pulumi.export("scheduled_tasks_count", len(scheduled_tasks_config)) + pulumi.export("cron_task_definition_arn", cron_task_definition.arn) + pulumi.export("cron_schedule_group", schedule_group.name) + + # ========================================================================= + # Outputs + # ========================================================================= + # Export useful values for reference + if vpc_resource: + pulumi.export("vpc_id", vpc_resource.id) + if private_subnets: + pulumi.export("private_subnet_ids", [s.id for s in private_subnets]) + if public_subnets: + pulumi.export("public_subnet_ids", [s.id for s in public_subnets]) + + +if __name__ == "__main__": + main() diff --git a/infra/pulumi/config.stage.yaml b/infra/pulumi/config.stage.yaml new file mode 100644 index 000000000000..8a04967f89aa --- /dev/null +++ b/infra/pulumi/config.stage.yaml @@ -0,0 +1,564 @@ +--- +# Thunderbird Add-ons Server - Stage Environment +# This configuration mirrors the current EC2 setup, targeting ECS Fargate +# +# Current architecture (EC2/Ansible) - based on our analysis: +# - Web servers -> Fargate web service +# - 2x celery workers -> Fargate worker service +# - 1x versioncheck server -> Fargate versioncheck service +# - 1x admin/cron server -> Fargate scheduled tasks +# +# Reference: thundernest-ansible/addons/env/*.yml + +resources: + # ============================================================================= + # ECR Repository + # ============================================================================= + # Container registry for addons-server images + # Images are 
pushed by CI/CD pipeline (CircleCI/GitHub Actions)
+  aws:ecr:Repository:
+    addons-server:
+      name: atn-stage-addons-server
+      image_tag_mutability: MUTABLE
+      force_delete: true  # Stage only: allows pulumi destroy with images present
+      scan_on_push: true
+      encryption_type: AES256
+      # Lifecycle policy: keep the last 50 images tagged with a stage- or sha- prefix; expire untagged after 7 days
+      # Rule 1 matches only those prefixes (covers SHA tags and stage-latest); other tag patterns are not retained by it
+      lifecycle_policy: |
+        {
+          "rules": [
+            {
+              "rulePriority": 1,
+              "description": "Keep last 50 images tagged stage-/sha-",
+              "selection": {
+                "tagStatus": "tagged",
+                "tagPrefixList": ["stage-", "sha-"],
+                "countType": "imageCountMoreThan",
+                "countNumber": 50
+              },
+              "action": {
+                "type": "expire"
+              }
+            },
+            {
+              "rulePriority": 2,
+              "description": "Delete untagged images older than 7 days",
+              "selection": {
+                "tagStatus": "untagged",
+                "countType": "sinceImagePushed",
+                "countUnit": "days",
+                "countNumber": 7
+              },
+              "action": {
+                "type": "expire"
+              }
+            }
+          ]
+        }
+
+  # =============================================================================
+  # GitHub Actions OIDC Role
+  # =============================================================================
+  # Allows GH Actions to push images to ECR via OIDC
+  #
+  # After deployment
+  #   1. we copy the exported 'gha_ecr_publish_role_arn' value
+  #   2. we set it as repository variable AWS_ROLE_ARN in GitHub
+  #   3. 
this pushes to stage branch will then publish to ECR + # + # Trust policy restricts assumption to: + # - repo:thunderbird/addons-server:ref:refs/heads/stage + # - job_workflow_ref: .github/workflows/build-and-push.yml@refs/heads/stage + aws:iam:GitHubActionsOIDCRole: + github_org: thunderbird + github_repo: addons-server + allowed_branches: + - stage + workflow_file: .github/workflows/build-and-push.yml + + # ============================================================================= + # VPC Configuration + # ============================================================================= + # Using MultiTierVpc for public/private subnet separation + # Public: Load balancers + # Private: Fargate tasks, RDS, ElastiCache, OpenSearch + tb:network:MultiTierVpc: + vpc: + cidr_block: 10.100.0.0/16 + enable_dns_hostnames: true + enable_internet_gateway: true + enable_nat_gateway: true + egress_via_internet_gateway: true + egress_via_nat_gateway: true + + # VPC Endpoints for AWS services (reduces NAT Gateway costs) + endpoint_interfaces: + - ecr.api + - ecr.dkr + - logs + - secretsmanager + - ssm + endpoint_gateways: + - s3 + + # Public subnets for ALBs + public_subnets: + us-west-2a: + - 10.100.1.0/24 + us-west-2b: + - 10.100.2.0/24 + us-west-2c: + - 10.100.3.0/24 + + # Private subnets for Fargate tasks + private_subnets: + us-west-2a: + - 10.100.101.0/24 + us-west-2b: + - 10.100.102.0/24 + us-west-2c: + - 10.100.103.0/24 + + # VPC Peering is handled in __main__.py (not here) because + # MultiTierVpc places peering routes on the default route table, + # but our subnets use custom public/private route tables. + # See __main__.py for peering connection, routes, and SG rules. 
+ + # ============================================================================= + # Default VPC ingress (peering) rules + # ============================================================================= + # These control which existing default VPC security groups accept inbound + # connections from the ATN stage VPC (10.100.0.0/16) for legacy backends + # + # If Secrets Manager points at a different RDS / ElastiCache than expected, + # we would add the relevant SG IDs here so ECS tasks can reach them + tb:network:DefaultVpcIngressRules: + stage_vpc_cidr: 10.100.0.0/16 + default_vpc_route_table_id: rtb-0657e07f + # Redis / Memcached / ES/OpenSearch / EFS + services_sg_ids: + - sg-d5539ea9 + # RDS MySQL / RabbitMQ + default_sg_ids: + - sg-5133b52c + + # ============================================================================= + # Web Service - Fargate (intended to replace current web tier) + # ============================================================================= + # uwsgi with 4 processes, 4 threads per process + # Image: 768512802988.dkr.ecr.us-west-2.amazonaws.com/atn-stage-addons-server:stage-latest + tb:fargate:FargateClusterWithLogging: + web: + # desired_count intentionally omitted: autoscaling owns the count. + # tb_pulumi sets ignore_changes on desired_count when not specified, + # preventing Pulumi from fighting the autoscaler. 
min_capacity in + # the autoscaling config acts as the effective baseline + assign_public_ip: false + internal: false # Public-facing ALB + enable_container_insights: true + health_check_grace_period_seconds: 120 + + services: + web: + name: atn-stage-web + container_name: web + container_port: 8000 + listener_port: 443 + listener_proto: HTTPS + listener_cert_arn: arn:aws:acm:us-west-2:768512802988:certificate/2cff184f-31a3-4e9e-b478-eff82076f06f + health_check: + path: /services/monitor.json + interval: 30 + timeout: 10 + healthy_threshold: 2 + unhealthy_threshold: 3 + matcher: '200' + + task_definition: + cpu: '1024' # 1 vCPU + memory: '2048' # 2 GB + network_mode: awsvpc + requires_compatibilities: + - FARGATE + + container_definitions: + web: + # ECR image URL - updated by CI/CD pipeline + image: 768512802988.dkr.ecr.us-west-2.amazonaws.com/atn-stage-addons-server:stage-latest + essential: true + # Entrypoint uses docker-entrypoint.sh, command specifies service mode + command: + - web + portMappings: + - containerPort: 8000 + protocol: tcp + environment: + - name: DJANGO_SETTINGS_MODULE + value: settings_local_stage + - name: UWSGI_PROCESSES + value: '4' + - name: UWSGI_THREADS + value: '4' + - name: UWSGI_PORT + value: '8000' + # logConfiguration: omitted; tb_pulumi injects defaults targeting + # the log group it creates (thunderbird-addons-stage-web-fargate-logs) + + # =========================================================================== + # Celery Worker Service - Fargate (intended to replace current worker tier) + # =========================================================================== + # t3a.large has 8GB RAM - needed for addons-linter memory requirements + # Multiple queue groups for different workloads + worker: + # desired_count omitted: autoscaling owns the count (see web comment above) + assign_public_ip: false + internal: true + build_load_balancer: false # Workers don't need ALB + enable_container_insights: true + + task_definition: + 
cpu: '2048' # 2 vCPU (match t3a.large) + memory: '8192' # 8 GB (addons-linter needs significant memory) + network_mode: awsvpc + requires_compatibilities: + - FARGATE + + container_definitions: + worker: + image: 768512802988.dkr.ecr.us-west-2.amazonaws.com/atn-stage-addons-server:stage-latest + essential: true + # Uses docker-entrypoint.sh worker mode + command: + - worker + environment: + - name: DJANGO_SETTINGS_MODULE + value: settings_local_stage + - name: CELERY_CONCURRENCY + value: '4' + - name: CELERY_QUEUES + value: default,devhub,images,limited,priority,reviews,celery,api,files,search,tags,cron,ratings,reviewers,zadmin,stats,crypto + - name: CELERY_LOGLEVEL + value: info + # logConfiguration: omitted; tb_pulumi injects defaults targeting + # the log group it creates (thunderbird-addons-stage-worker-fargate-logs) + + # =========================================================================== + # Versioncheck Service - Fargate (intended to replace current versioncheck) + # =========================================================================== + # Separate ALB endpoint for version checking API (versioncheck.addons.thunderbird.net) + # Lightweight service - c7a.medium equivalent + versioncheck: + # desired_count omitted: autoscaling owns the count (see web comment) + assign_public_ip: false + internal: false + enable_container_insights: true + health_check_grace_period_seconds: 60 + + services: + versioncheck: + name: atn-stage-vc + container_name: versioncheck + container_port: 8000 + listener_port: 443 + listener_proto: HTTPS + listener_cert_arn: arn:aws:acm:us-west-2:768512802988:certificate/2cff184f-31a3-4e9e-b478-eff82076f06f + health_check: + # Versioncheck uses a simpler health endpoint + path: /services/monitor.json + interval: 30 + timeout: 5 + healthy_threshold: 2 + unhealthy_threshold: 3 + matcher: '200' + + task_definition: + cpu: '512' # 0.5 vCPU (c7a.medium has 1 vCPU) + memory: '1024' # 1 GB (c7a.medium has 2 GB) + network_mode: awsvpc + 
requires_compatibilities: + - FARGATE + + container_definitions: + versioncheck: + image: 768512802988.dkr.ecr.us-west-2.amazonaws.com/atn-stage-addons-server:stage-latest + essential: true + # Uses docker-entrypoint.sh versioncheck mode + command: + - versioncheck + portMappings: + - containerPort: 8000 + protocol: tcp + environment: + - name: DJANGO_SETTINGS_MODULE + value: settings_local_stage + - name: UWSGI_PROCESSES + value: '4' + - name: UWSGI_THREADS + value: '4' + - name: UWSGI_PORT + value: '8000' + # logConfiguration: omitted; tb_pulumi injects defaults targeting + # the log group it creates (thunderbird-addons-stage-versioncheck-fargate-logs) + + # ============================================================================= + # ElastiCache - Redis (intended to replace current Redis setup) + # ============================================================================= + # Used for: Celery result backend + tb:elasticache:ElastiCacheReplicationGroup: + redis: + description: ATN Stage Redis cluster for Celery + engine: redis + engine_version: '7.1' + node_type: cache.t3.small + num_cache_nodes: 1 + port: 6379 + # source_cidrs will be populated from VPC CIDR + + # ============================================================================= + # ECS Service Autoscaling + # ============================================================================= + # Target-tracking policies for CPU and memory utilisation + # Thresholds are here also sensible defaults; to be tuned + # after observing real workload performance + # Workers have higher memory threshold (addons-linter is memory-intensive) + tb:autoscale:EcsServiceAutoscaler: + web: + cpu_threshold: 70 + ram_threshold: 70 + min_capacity: 2 + max_capacity: 8 + cooldown: 300 + + worker: + cpu_threshold: 70 + ram_threshold: 80 # Higher: addons-linter memory spikes are normal + min_capacity: 2 + max_capacity: 6 + cooldown: 300 + + versioncheck: + cpu_threshold: 70 + ram_threshold: 70 + min_capacity: 1 + 
max_capacity: 4 + cooldown: 300 + + # ============================================================================= + # ElastiCache - Memcached (intended to replace current Memcached setup) + # ============================================================================= + # Note: tb_pulumi has ElastiCacheReplicationGroup for Redis + # Memcached may need custom implementation or AWS provider direct use + # For stage, we might use Redis for both caching and Celery + + # ============================================================================= + # Security Groups + # ============================================================================= + # Pattern from thunderbird-accounts: separate load_balancers and containers + # sections. For each Fargate service, matching entries in both. Workers with + # no ALB set load_balancers entry to null. Code dynamically wires + # source_security_group_id from LB SG to container ingress. + tb:network:SecurityGroupWithRules: + load_balancers: + web: + description: ALB security group for web service + rules: + ingress: + - description: HTTPS from internet + from_port: 443 + to_port: 443 + protocol: tcp + cidr_blocks: + - 0.0.0.0/0 + - description: HTTP redirect + from_port: 80 + to_port: 80 + protocol: tcp + cidr_blocks: + - 0.0.0.0/0 + egress: + - description: Allow all outbound + from_port: 0 + to_port: 0 + protocol: '-1' + cidr_blocks: + - 0.0.0.0/0 + versioncheck: + description: ALB security group for versioncheck service + rules: + ingress: + - description: HTTPS from internet + from_port: 443 + to_port: 443 + protocol: tcp + cidr_blocks: + - 0.0.0.0/0 + egress: + - description: Allow all outbound + from_port: 0 + to_port: 0 + protocol: '-1' + cidr_blocks: + - 0.0.0.0/0 + worker: null # Workers have no ALB + + containers: + web: + description: Container security group for web Fargate tasks + rules: + ingress: + - description: From ALB to container port + from_port: 8000 + to_port: 8000 + protocol: tcp + # 
source_security_group_id wired dynamically in __main__.py + egress: + - description: Allow all outbound + from_port: 0 + to_port: 0 + protocol: '-1' + cidr_blocks: + - 0.0.0.0/0 + versioncheck: + description: Container security group for versioncheck Fargate tasks + rules: + ingress: + - description: From ALB to container port + from_port: 8000 + to_port: 8000 + protocol: tcp + # source_security_group_id wired dynamically in __main__.py + egress: + - description: Allow all outbound + from_port: 0 + to_port: 0 + protocol: '-1' + cidr_blocks: + - 0.0.0.0/0 + worker: + description: Container security group for worker Fargate tasks + rules: + ingress: [] # Workers have no inbound traffic + egress: + - description: Allow all outbound + from_port: 0 + to_port: 0 + protocol: '-1' + cidr_blocks: + - 0.0.0.0/0 + + # ============================================================================= + # ECS Scheduled Tasks (Cron Jobs) + # ============================================================================= + # Intended to replace the cron jobs from the current admin/cron setup + # Uses EventBridge Scheduler to trigger ECS tasks on schedule + aws:scheduler:ScheduledTasks: + # Every 5 minutes + auto-approve: + schedule_expression: rate(5 minutes) + command: [manage, auto_approve] + description: Auto-approve add-ons that meet criteria + + # Hourly tasks + info-request-warning: + schedule_expression: cron(15 * * * ? *) + command: [manage, send_info_request_last_warning_notifications] + description: Send info request last warning notifications + + addon-last-updated: + schedule_expression: cron(20 * * * ? *) + command: [manage, cron, addon_last_updated] + description: Update addon last_updated timestamps + + update-addon-appsupport: + schedule_expression: cron(45 * * * ? *) + command: [manage, cron, update_addon_appsupport] + description: Update addon application support info + + cleanup-extracted-file: + schedule_expression: cron(50 * * * ? 
*) + command: [manage, cron, cleanup_extracted_file] + description: Clean up extracted files + + unhide-disabled-files: + schedule_expression: cron(55 * * * ? *) + command: [manage, cron, unhide_disabled_files] + description: Unhide disabled files + + # Twice daily (5am and 5pm, 6am and 6pm UTC) + hide-disabled-files: + schedule_expression: cron(25 5,17 * * ? *) + command: [manage, cron, hide_disabled_files] + description: Hide disabled files + + cleanup-image-files: + schedule_expression: cron(25 6,18 * * ? *) + command: [manage, cron, cleanup_image_files] + description: Clean up orphaned image files + + # Daily tasks + update-user-ratings: + schedule_expression: cron(0 1 * * ? *) + command: [manage, cron, update_user_ratings] + description: Update user ratings aggregations + + category-totals: + schedule_expression: cron(30 14 * * ? *) + command: [manage, cron, category_totals] + description: Update category totals + + gc: + schedule_expression: cron(0 22 * * ? *) + command: [manage, cron, gc] + description: Garbage collection + + dump-apps: + schedule_expression: cron(30 1 * * ? *) + command: [manage, dump_apps] + description: Dump application data + + update-product-details: + schedule_expression: cron(45 1 * * ? *) + command: [manage, update_product_details] + description: Update product details from remote + + add-latest-appversion: + schedule_expression: cron(0 2 * * ? *) + command: [manage, cron, add_latest_appversion] + description: Add latest application versions + + update-global-totals: + schedule_expression: cron(40 0 * * ? *) + command: [manage, cron, update_global_totals] + description: Update global statistics totals + + update-addon-daily-users: + schedule_expression: cron(20 0 * * ? 
*) + command: [manage, cron, update_addon_average_daily_users] + description: Update addon average daily users + +# ============================================================================= +# Secrets Manager - Required Secrets +# ============================================================================= +# These secrets must be created manually or via separate Pulumi stack +# before deploying the ECS services +# +# Required secrets (path: atn/stage/): +# - mysql (JSON with host, port, username, password) +# - celery_broker (connection string) +# - django_secret_key +# - fxa (JSON with client_id, client_secret) +# - cache_host +# - email_url +# - recaptcha (JSON with public, private) +# - inbound_email (JSON with secret_key, validation_key) +# +# See docs/environment-variables.md for full reference + +# ============================================================================= +# Notes for implementation: +# ============================================================================= +# 1. RDS MySQL - Not yet in tb_pulumi, may need custom component +# 2. OpenSearch - Not yet in tb_pulumi, may need custom component +# 3. EFS - For shared add-on file storage, needs custom component +# 4. 
Amazon MQ (RabbitMQ) - Decision pending: keep EC2 or migrate to Amazon MQ diff --git a/infra/pulumi/requirements.txt b/infra/pulumi/requirements.txt new file mode 100644 index 000000000000..2a394d8f9de6 --- /dev/null +++ b/infra/pulumi/requirements.txt @@ -0,0 +1,13 @@ +# Thunderbird Pulumi library +# Requires Python 3.13+ (tb_pulumi v0.0.15+) +tb_pulumi @ git+https://github.com/thunderbird/pulumi.git@v0.0.16 + +# Core Pulumi dependencies (tb_pulumi will pull these, but explicit is better) +pulumi>=3.0.0,<4.0.0 +pulumi-aws>=6.0.0,<7.0.0 + +# AWS SDK for boto3 operations in tb_pulumi +boto3>=1.28.0 + +# YAML parsing for config files +pyyaml>=6.0 diff --git a/infra/scripts/guardduty-cleanup.sh b/infra/scripts/guardduty-cleanup.sh new file mode 100755 index 000000000000..eda6ac639404 --- /dev/null +++ b/infra/scripts/guardduty-cleanup.sh @@ -0,0 +1,270 @@ +#!/usr/bin/env bash +# guardduty-cleanup.sh +# +# Cleans up GuardDuty-provisioned artefacts that block VPC deletion after +# `pulumi destroy`. When GuardDuty is enabled and a VPC exists, AWS +# automatically creates: +# - A VPC endpoint (com.amazonaws.guardduty-data) +# - ENIs attached to that endpoint +# - A security group for the endpoint +# +# These are NOT managed by Pulumi and thus not removed by `pulumi destroy`, +# which causes the VPC deletion to fail. +# +# Basic usage: +# ./guardduty-cleanup.sh [--dry-run] [--force] +# +# Examples: +# ./guardduty-cleanup.sh vpc-02ed42011af62798d --dry-run # preview only +# ./guardduty-cleanup.sh vpc-02ed42011af62798d # delete (with tag check) +# ./guardduty-cleanup.sh vpc-02ed42011af62798d --force # skip tag check +# +# Safe to run in a SECOND TERMINAL while `pulumi destroy` is retrying the +# VPC deletion. The GuardDuty resources are not in Pulumi state, so removing +# them out-of-band is safe -- Pulumi's next retry will find the VPC clean +# and delete it successfully. 
+
+# Safety:
+#   - Only targets GuardDuty VPC endpoints (matched by service name)
+#   - Only deletes ENIs and SGs that belong to those specific endpoints
+#   - Refuses to operate unless VPC has pulumi_project=thunderbird-addons tag
+#     (override with --force)
+#   - Idempotent: safe to run multiple times
+#   - Use --dry-run first to preview what would be deleted
+#
+# Prerequisites:
+#   - AWS CLI v2 configured with appropriate credentials
+#   - jq installed
+
+set -euo pipefail
+
+# ---------------------------------------------------------------------------
+# Arguments
+# ---------------------------------------------------------------------------
+VPC_ID="${1:-}"
+DRY_RUN=false
+FORCE=false
+
+if [[ -z "$VPC_ID" ]]; then
+  echo "Usage: $0 <vpc-id> [--dry-run] [--force]"
+  echo ""
+  echo "Options:"
+  echo "  --dry-run   Preview what would be deleted without making changes"
+  echo "  --force     Skip VPC tag safety check"
+  exit 1
+fi
+
+shift
+for arg in "$@"; do
+  case "$arg" in
+    --dry-run) DRY_RUN=true ;;
+    --force) FORCE=true ;;
+    *) echo "Unknown option: $arg"; exit 1 ;;
+  esac
+done
+
+REGION="${AWS_DEFAULT_REGION:-$(aws configure get region 2>/dev/null || echo us-west-2)}"
+echo "Region: $REGION"
+echo "VPC: $VPC_ID"
+echo "Dry run: $DRY_RUN"
+echo ""
+
+# ---------------------------------------------------------------------------
+# Safety check: verify VPC has expected tags
+# ---------------------------------------------------------------------------
+if [[ "$FORCE" == false ]]; then
+  echo "=== Safety Check: VPC Tags ==="
+  VPC_PROJECT_TAG=$(aws ec2 describe-vpcs \
+    --region "$REGION" \
+    --vpc-ids "$VPC_ID" \
+    --query 'Vpcs[0].Tags[?Key==`pulumi_project`].Value | [0]' \
+    --output text 2>/dev/null || echo "None")
+
+  if [[ "$VPC_PROJECT_TAG" != "thunderbird-addons" ]]; then
+    echo "  ERROR: VPC $VPC_ID does not have tag pulumi_project=thunderbird-addons"
+    echo "         Found: pulumi_project=$VPC_PROJECT_TAG"
+    echo ""
+    echo "  This safety check should prevent accidental cleanup of 
the wrong VPC" + echo " Use --force to override if you are certain this is correct" + exit 1 + fi + echo " VPC tag check passed (pulumi_project=thunderbird-addons)" + echo "" +fi + +# --------------------------------------------------------------------------- +# Step 1: Find GuardDuty VPC endpoints and collect their ENI/SG metadata +# --------------------------------------------------------------------------- +echo "=== Step 1: Discover GuardDuty VPC Endpoints ===" + +GUARDDUTY_SERVICE="com.amazonaws.${REGION}.guardduty-data" + +ENDPOINT_JSON=$(aws ec2 describe-vpc-endpoints \ + --region "$REGION" \ + --filters "Name=vpc-id,Values=$VPC_ID" "Name=service-name,Values=$GUARDDUTY_SERVICE" \ + --output json \ + --query 'VpcEndpoints' 2>/dev/null || echo "[]") + +ENDPOINT_COUNT=$(echo "$ENDPOINT_JSON" | jq 'length') + +if [[ "$ENDPOINT_COUNT" -eq 0 ]]; then + echo " No GuardDuty VPC endpoints found. Nothing to clean up." + exit 0 +fi + +# Extract the endpoint IDs, their ENI IDs, and their SG IDs +ENDPOINT_IDS=$(echo "$ENDPOINT_JSON" | jq -r '.[].VpcEndpointId') +ENI_IDS=$(echo "$ENDPOINT_JSON" | jq -r '.[].NetworkInterfaceIds[]' 2>/dev/null | sort -u || true) +SG_IDS=$(echo "$ENDPOINT_JSON" | jq -r '.[].Groups[].GroupId' 2>/dev/null | sort -u || true) + +echo " Found $ENDPOINT_COUNT GuardDuty endpoint(s):" +for eid in $ENDPOINT_IDS; do echo " - $eid"; done +echo "" +if [[ -n "$ENI_IDS" ]]; then + echo " Associated ENIs:" + for eni in $ENI_IDS; do echo " - $eni"; done + echo "" +fi +if [[ -n "$SG_IDS" ]]; then + echo " Associated SGs:" + for sg in $SG_IDS; do echo " - $sg"; done + echo "" +fi + +# --------------------------------------------------------------------------- +# Step 2: Delete the GuardDuty VPC endpoints +# --------------------------------------------------------------------------- +echo "=== Step 2: Delete GuardDuty VPC Endpoints ===" + +for ENDPOINT_ID in $ENDPOINT_IDS; do + if [[ "$DRY_RUN" == false ]]; then + echo " Deleting endpoint $ENDPOINT_ID..." 
+ aws ec2 delete-vpc-endpoints \ + --region "$REGION" \ + --vpc-endpoint-ids "$ENDPOINT_ID" + echo " Deleted." + else + echo " [DRY RUN] Would delete endpoint $ENDPOINT_ID" + fi +done + +# Wait for ENIs to release with retry backoff (can take 15-60s) +if [[ "$DRY_RUN" == false && -n "$ENI_IDS" ]]; then + MAX_RETRIES=4 + WAIT_SECS=15 + for ATTEMPT in $(seq 1 $MAX_RETRIES); do + echo " Waiting ${WAIT_SECS}s for ENI release (attempt ${ATTEMPT}/${MAX_RETRIES})..." + sleep "$WAIT_SECS" + + ALL_CLEAR=true + for ENI_ID in $ENI_IDS; do + ENI_STATUS=$(aws ec2 describe-network-interfaces \ + --region "$REGION" \ + --network-interface-ids "$ENI_ID" \ + --query 'NetworkInterfaces[0].Status' \ + --output text 2>/dev/null || echo "not-found") + if [[ "$ENI_STATUS" == "in-use" ]]; then + ALL_CLEAR=false + break + fi + done + + if [[ "$ALL_CLEAR" == true ]]; then + echo " All ENIs released." + break + fi + + if [[ "$ATTEMPT" -eq "$MAX_RETRIES" ]]; then + echo " Some ENIs still in-use after ${MAX_RETRIES} attempts. Proceeding with best effort" + fi + WAIT_SECS=$((WAIT_SECS + 10)) + done +fi +echo "" + +# --------------------------------------------------------------------------- +# Step 3: Delete endpoint-linked ENIs (if still present after endpoint delete) +# --------------------------------------------------------------------------- +echo "=== Step 3: Clean Up Endpoint ENIs ===" + +if [[ -z "$ENI_IDS" ]]; then + echo " No endpoint ENIs to clean up." +else + for ENI_ID in $ENI_IDS; do + ENI_STATUS=$(aws ec2 describe-network-interfaces \ + --region "$REGION" \ + --network-interface-ids "$ENI_ID" \ + --query 'NetworkInterfaces[0].Status' \ + --output text 2>/dev/null || echo "not-found") + + if [[ "$ENI_STATUS" == "not-found" ]]; then + echo " ENI $ENI_ID already gone (released with endpoint)." + continue + fi + + if [[ "$ENI_STATUS" == "available" ]]; then + if [[ "$DRY_RUN" == false ]]; then + echo " Deleting ENI $ENI_ID..." 
+ aws ec2 delete-network-interface \ + --region "$REGION" \ + --network-interface-id "$ENI_ID" + echo " Deleted." + else + echo " [DRY RUN] Would delete ENI $ENI_ID (status: $ENI_STATUS)" + fi + else + echo " ENI $ENI_ID still in '$ENI_STATUS' state -- skipping" + fi + done +fi +echo "" + +# --------------------------------------------------------------------------- +# Step 4: Delete endpoint-linked SGs (if not the VPC default SG) +# --------------------------------------------------------------------------- +echo "=== Step 4: Clean Up Endpoint Security Groups ===" + +if [[ -z "$SG_IDS" ]]; then + echo " No endpoint SGs to clean up." +else + # Get the default SG for this VPC (cannot be deleted) + DEFAULT_SG=$(aws ec2 describe-security-groups \ + --region "$REGION" \ + --filters "Name=vpc-id,Values=$VPC_ID" "Name=group-name,Values=default" \ + --query 'SecurityGroups[0].GroupId' \ + --output text 2>/dev/null || echo "") + + for SG_ID in $SG_IDS; do + if [[ "$SG_ID" == "$DEFAULT_SG" ]]; then + echo " SG $SG_ID is the VPC default SG -- skipping." + continue + fi + + if [[ "$DRY_RUN" == false ]]; then + echo " Deleting SG $SG_ID..." + aws ec2 delete-security-group \ + --region "$REGION" \ + --group-id "$SG_ID" 2>/dev/null \ + && echo " Deleted." \ + || echo " Could not delete (may still be referenced by an ENI; retry after ENI cleanup)." + else + echo " [DRY RUN] Would delete SG $SG_ID" + fi + done +fi +echo "" + +# --------------------------------------------------------------------------- +# Summary +# --------------------------------------------------------------------------- +echo "=== Done ===" +if [[ "$DRY_RUN" == true ]]; then + echo "Dry run complete. Re-run without --dry-run to apply changes." +else + echo "Cleanup complete. Pulumi's next VPC deletion retry should succeed." 
+ echo "" + echo "If VPC deletion still fails, check for:" + echo " - ENIs still in 'in-use' state (wait a few seconds and re-run)" + echo " - Other non-Pulumi resources in the VPC:" + echo " aws ec2 describe-network-interfaces --filters Name=vpc-id,Values=$VPC_ID" +fi diff --git a/infra/tests/.env.example b/infra/tests/.env.example new file mode 100644 index 000000000000..5450ca40637a --- /dev/null +++ b/infra/tests/.env.example @@ -0,0 +1,27 @@ +# Example environment for infra/tests/smoke_test.py +# +# Copy to .env (do not commit, though) and fill in real values. + +AWS_REGION=us-west-2 + +# TCP connectivity targets +RDS_HOST= +RDS_PORT=3306 + +REDIS_NEW_HOST= +REDIS_EXISTING_HOST= +REDIS_PORT=6379 + +RABBITMQ_HOST= +RABBITMQ_PORT=5672 + +ES_HOST= +ES_PORT=443 + +# Comma-separated lists +DNS_HOSTNAMES= +REQUIRED_SECRETS= + +# NAT egress check +NAT_EGRESS_URL=https://httpbin.org/status/200 + diff --git a/infra/tests/Dockerfile b/infra/tests/Dockerfile new file mode 100644 index 000000000000..d30da4bee9bc --- /dev/null +++ b/infra/tests/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.11-slim + +RUN pip install --no-cache-dir boto3 + +COPY smoke_test.py /smoke_test.py + +ENTRYPOINT ["python", "/smoke_test.py"] diff --git a/infra/tests/smoke_test.py b/infra/tests/smoke_test.py new file mode 100644 index 000000000000..c0eee3928455 --- /dev/null +++ b/infra/tests/smoke_test.py @@ -0,0 +1,313 @@ +#!/usr/bin/env python3 +""" +ATN Stage Infrastructure Smoke Test (Read-Only) + +This just validates connectivity from ECS tasks to all backend services +without writing any data. 
Designed to run as a one-off ECS task before +enabling application containers + +Basic usage: + python smoke_test.py # Run all checks + python smoke_test.py --check secrets # Run a specific check + python smoke_test.py --json # Output as JSON + +Configuration: + This script is intended to be committed to a public repo so it does not + embed any environment-specific endpoints or private IP addresses + + Provide targets via environment variables; checks are SKIPped when their + required variables are not set. + + Required (per check) + - RDS_HOST + - REDIS_NEW_HOST + - REDIS_EXISTING_HOST + - RABBITMQ_HOST + - ES_HOST + - DNS_HOSTNAMES (comma-separated list) + - REQUIRED_SECRETS (comma-separated list of Secrets Manager names) + + Optional + - AWS_REGION (default: us-west-2) + - RDS_PORT (default: 3306) + - REDIS_PORT (default: 6379) + - RABBITMQ_PORT (default: 5672) + - ES_PORT (default: 443) + - NAT_EGRESS_URL (default: https://httpbin.org/status/200) + +Exit codes: + 0 - All checks passed + 1 - One or more checks failed +""" + +import argparse +import json +import os +import socket +import sys +import time +from urllib.request import urlopen +from urllib.error import URLError + + +# --------------------------------------------------------------------------- +# Configuration -- sourced from environment or defaults +# --------------------------------------------------------------------------- +AWS_REGION = os.environ.get("AWS_REGION", "us-west-2") + +def _csv_env(var_name: str): + value = os.environ.get(var_name, "").strip() + if not value: + return [] + return [item.strip() for item in value.split(",") if item.strip()] + + +# Backend endpoints (provided via environment; no embedded targets) +CHECKS = { + "rds_stage": { + "description": "RDS MySQL (stage)", + "type": "tcp", + "host_env": "RDS_HOST", + "port": int(os.environ.get("RDS_PORT", "3306")), + }, + "redis_new": { + "description": "ElastiCache Redis (new, this stack)", + "type": "tcp", + "host_env": 
"REDIS_NEW_HOST", + "port": int(os.environ.get("REDIS_PORT", "6379")), + }, + "redis_existing": { + "description": "ElastiCache Redis (existing, default VPC)", + "type": "tcp", + "host_env": "REDIS_EXISTING_HOST", + "port": int(os.environ.get("REDIS_PORT", "6379")), + }, + "rabbitmq": { + "description": "RabbitMQ (default VPC)", + "type": "tcp", + "host_env": "RABBITMQ_HOST", + "port": int(os.environ.get("RABBITMQ_PORT", "5672")), + }, + "elasticsearch": { + "description": "Elasticsearch/OpenSearch (managed endpoint, HTTPS)", + "type": "tcp", + "host_env": "ES_HOST", + "port": int(os.environ.get("ES_PORT", "443")), + }, + "secrets_manager": { + "description": "Secrets Manager (read access)", + "type": "secrets", + "required_secrets": _csv_env("REQUIRED_SECRETS"), + }, + "dns_resolution": { + "description": "DNS resolution (cross-VPC)", + "type": "dns", + "hostnames": _csv_env("DNS_HOSTNAMES"), + }, + "nat_egress": { + "description": "NAT Gateway egress (internet connectivity)", + "type": "http", + "url": os.environ.get("NAT_EGRESS_URL", "https://httpbin.org/status/200"), + }, +} + + +# --------------------------------------------------------------------------- +# Check implementations +# --------------------------------------------------------------------------- +def check_tcp(host, port, timeout=5): + """Test TCP connectivity to a host:port.""" + try: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.settimeout(timeout) + start = time.time() + sock.connect((host, port)) + latency_ms = (time.time() - start) * 1000 + sock.close() + return { + "status": "PASS", + "message": f"Connected to {host}:{port} ({latency_ms:.0f}ms)", + } + except socket.timeout: + return {"status": "FAIL", "message": f"Timeout connecting to {host}:{port}"} + except socket.error as e: + return {"status": "FAIL", "message": f"Connection failed: {host}:{port} - {e}"} + + +def check_dns(hostnames): + """Test DNS resolution for a list of hostnames.""" + results = [] + all_pass = 
True + for hostname in hostnames: + try: + ip = socket.gethostbyname(hostname) + results.append(f"{hostname} -> {ip}") + except socket.gaierror as e: + results.append(f"{hostname} -> FAILED ({e})") + all_pass = False + return { + "status": "PASS" if all_pass else "FAIL", + "message": "; ".join(results), + } + + +def check_http(url, timeout=10): + """Test HTTP(S) connectivity.""" + try: + start = time.time() + response = urlopen(url, timeout=timeout) + latency_ms = (time.time() - start) * 1000 + return { + "status": "PASS", + "message": f"HTTP {response.status} from {url} ({latency_ms:.0f}ms)", + } + except (URLError, OSError) as e: + return {"status": "FAIL", "message": f"HTTP request failed: {url} - {e}"} + + +def check_secrets(required_secrets=None): + """Test Secrets Manager read access without listing or printing values. + + Attempts GetSecretValue on each required secret and reports + accessible vs denied vs not-found, without printing any values. + """ + if not required_secrets: + return { + "status": "SKIP", + "message": "Set REQUIRED_SECRETS (comma-separated) to enable this check", + } + + try: + import boto3 + + client = boto3.client("secretsmanager", region_name=AWS_REGION) + accessible = [] + denied = [] + not_found = [] + + for secret_name in required_secrets: + try: + client.get_secret_value(SecretId=secret_name) + accessible.append(secret_name) + except client.exceptions.AccessDeniedException: + denied.append(secret_name) + except client.exceptions.ResourceNotFoundException: + not_found.append(secret_name) + + parts = [f"{len(accessible)}/{len(required_secrets)} accessible"] + if denied: + parts.append(f"{len(denied)} denied: {', '.join(denied)}") + if not_found: + parts.append(f"{len(not_found)} not found: {', '.join(not_found)}") + + has_failures = len(denied) > 0 or len(not_found) > 0 + return { + "status": "FAIL" if has_failures else "PASS", + "message": "; ".join(parts), + } + except ImportError: + return {"status": "SKIP", "message": "boto3 not 
available"} + except Exception as e: + return {"status": "FAIL", "message": f"Secrets Manager error: {e}"} + + +# --------------------------------------------------------------------------- +# Runner +# --------------------------------------------------------------------------- +def run_checks(filter_check=None): + """Run all or a specific check and return results.""" + results = {} + for name, config in CHECKS.items(): + if filter_check and name != filter_check: + continue + + check_type = config["type"] + description = config["description"] + + if check_type == "tcp": + host_env = config.get("host_env") + host = os.environ.get(host_env) if host_env else None + if not host: + result = { + "status": "SKIP", + "message": f"Set {host_env} to enable this check", + } + else: + result = check_tcp(host, config["port"]) + elif check_type == "dns": + hostnames = config.get("hostnames") or [] + if not hostnames: + result = { + "status": "SKIP", + "message": "Set DNS_HOSTNAMES (comma-separated) to enable this check", + } + else: + result = check_dns(hostnames) + elif check_type == "http": + result = check_http(config["url"]) + elif check_type == "secrets": + result = check_secrets(config.get("required_secrets")) + else: + result = {"status": "SKIP", "message": f"Unknown check type: {check_type}"} + + results[name] = { + "description": description, + **result, + } + + return results + + +def print_results(results, as_json=False): + """Print results in human-readable or JSON format.""" + if as_json: + print(json.dumps(results, indent=2)) + return + + print("\n" + "=" * 70) + print("ATN Stage Infrastructure Smoke Test") + print("=" * 70) + + passed = 0 + failed = 0 + skipped = 0 + + for name, result in results.items(): + status = result["status"] + icon = {"PASS": "[OK]", "FAIL": "[FAIL]", "SKIP": "[SKIP]"}.get( + status, "[??]" + ) + + if status == "PASS": + passed += 1 + elif status == "FAIL": + failed += 1 + else: + skipped += 1 + + print(f"\n {icon} 
{result['description']}") + print(f" {result['message']}") + + print("\n" + "-" * 70) + print(f" Results: {passed} passed, {failed} failed, {skipped} skipped") + print("=" * 70 + "\n") + + +def main(): + parser = argparse.ArgumentParser(description="ATN Stage Smoke Test (Read-Only)") + parser.add_argument("--check", help="Run a specific check only") + parser.add_argument( + "--json", action="store_true", help="Output results as JSON" + ) + args = parser.parse_args() + + results = run_checks(filter_check=args.check) + print_results(results, as_json=args.json) + + # Exit 1 if any check failed + has_failures = any(r["status"] == "FAIL" for r in results.values()) + sys.exit(1 if has_failures else 0) + + +if __name__ == "__main__": + main() diff --git a/settings_local_stage.py b/settings_local_stage.py new file mode 100644 index 000000000000..80ea678a7bad --- /dev/null +++ b/settings_local_stage.py @@ -0,0 +1,407 @@ +# -*- coding: utf-8 -*- + +import logging +import os +import datetime +import json + +import boto3 +from botocore.exceptions import ClientError + +from olympia.lib.settings_base import * # noqa + + +# AWS Secrets Manager helper +_secrets_cache = {} + +def get_secret(secret_name, region_name="us-west-2"): + """Retrieve a secret from AWS Secrets Manager with caching.""" + if secret_name in _secrets_cache: + return _secrets_cache[secret_name] + + client = boto3.client(service_name='secretsmanager', region_name=region_name) + try: + response = client.get_secret_value(SecretId=secret_name) + secret = response['SecretString'] + # Try to parse as JSON, otherwise return raw string + try: + secret = json.loads(secret) + except json.JSONDecodeError: + pass + _secrets_cache[secret_name] = secret + return secret + except ClientError as e: + raise Exception(f"Failed to retrieve secret {secret_name}: {e}") + + +# Retrieve secrets from AWS Secrets Manager +_email_url_secret = get_secret('atn/stage/email_url') +_mysql_secret = get_secret('atn/stage/mysql') 
+_inbound_email_secret = get_secret('atn/stage/inbound_email')
+_django_secret = get_secret('atn/stage/django_secret_key')
+_celery_broker_secret = get_secret('atn/stage/celery_broker')
+_recaptcha_secret = get_secret('atn/stage/recaptcha')
+_fxa_secret = get_secret('atn/stage/fxa')
+_cache_host_secret = get_secret('atn/stage/cache_host')
+_celery_result_backend_secret = get_secret('atn/stage/celery_result_backend')
+_es_host_secret = get_secret('atn/stage/elasticsearch_host')
+
+# Email config is sourced from Secrets Manager but can be overridden with
+# the EMAIL_URL environment variable (env.email_url default fallback).
+EMAIL_URL = env.email_url('EMAIL_URL', default=_email_url_secret)
+EMAIL_HOST = EMAIL_URL['EMAIL_HOST']
+EMAIL_PORT = EMAIL_URL['EMAIL_PORT']
+EMAIL_BACKEND = EMAIL_URL['EMAIL_BACKEND']
+EMAIL_HOST_USER = EMAIL_URL['EMAIL_HOST_USER']
+EMAIL_HOST_PASSWORD = EMAIL_URL['EMAIL_HOST_PASSWORD']
+EMAIL_USE_TLS = True
+EMAIL_QA_ALLOW_LIST = ''
+EMAIL_DENY_LIST = ''
+# NOTE(review): DEBUG and DEBUG_PROPAGATE_EXCEPTIONS are enabled below — confirm exposing tracebacks is intended for stage.
+SEND_REAL_EMAIL = False
+ENV = 'tbstage'
+DEBUG = True
+DEBUG_PROPAGATE_EXCEPTIONS = True
+SESSION_COOKIE_SECURE = True
+ENABLE_ADDON_SIGNING = False
+
+API_THROTTLE = False
+
+CDN_HOST = 'https://addons-stage.thunderbird.net'
+DOMAIN = 'addons-stage.thunderbird.net'
+
+SERVER_EMAIL = 'thunderbird-seamonkey-ops@mozilla.com'
+SITE_URL = 'https://' + DOMAIN
+SERVICES_URL = 'https://services.addons-stage.thunderbird.net'
+STATIC_URL = '%s/static/' % CDN_HOST
+MEDIA_URL = '%s/user-media/' % CDN_HOST
+
+SESSION_COOKIE_DOMAIN = ".%s" % DOMAIN
+
+# Filter IP addresses of allowed clients that can post email through the API.
+# This is normally blank on production.
+ALLOWED_CLIENTS_EMAIL_API = []
+# Auth token required to authorize inbound email.
+INBOUND_EMAIL_SECRET_KEY = _inbound_email_secret['secret_key']
+# Validation key we need to send in POST response.
+INBOUND_EMAIL_VALIDATION_KEY = _inbound_email_secret['validation_key']
+# Domain emails should be sent to.
+INBOUND_EMAIL_DOMAIN = 'addons-stage.thunderbird.net'
+
+NETAPP_STORAGE_ROOT = env('NETAPP_STORAGE_ROOT')
+NETAPP_STORAGE = NETAPP_STORAGE_ROOT + '/shared_storage'
+GUARDED_ADDONS_PATH = NETAPP_STORAGE_ROOT + '/guarded-addons'
+MEDIA_ROOT = NETAPP_STORAGE + '/uploads'
+
+TMP_PATH = os.path.join(NETAPP_STORAGE, 'tmp')
+PACKAGER_PATH = os.path.join(TMP_PATH, 'packager')
+
+ADDONS_PATH = NETAPP_STORAGE_ROOT + '/files'
+
+# Must be forced in settings because name => path can't be dynamically
+# computed: reviewer_attachmentS VS reviewer_attachment.
+# TODO: rename folder on file system.
+# (One can also just rename the setting, but this will not be consistent
+# with the naming scheme.)
+REVIEWER_ATTACHMENTS_PATH = MEDIA_ROOT + '/reviewer_attachment'
+
+FILESYSTEM_CACHE_ROOT = NETAPP_STORAGE_ROOT + '/cache'
+
+DATABASES = {}
+DATABASES['default'] = {
+    'NAME': 'addons_mozilla_org',
+    'USER': _mysql_secret['username'],
+    'PASSWORD': _mysql_secret['password'],
+    'HOST': _mysql_secret['host'],
+    'PORT': str(_mysql_secret['port']),
+}
+DATABASES['default']['ENGINE'] = 'django.db.backends.mysql'
+# Run all views in a transaction (on master) unless they are decorated not to.
+DATABASES['default']['ATOMIC_REQUESTS'] = True
+# Pool our database connections up for 300 seconds
+DATABASES['default']['CONN_MAX_AGE'] = 300
+DATABASES['default']['OPTIONS'] = {'sql_mode': 'STRICT_ALL_TABLES'}
+DATABASES['default']['TEST'] = {
+    'CHARSET': 'utf8',
+    'COLLATION': 'utf8_general_ci'
+}
+
+DATABASES['slave'] = {
+    'NAME': 'addons_mozilla_org',
+    'USER': _mysql_secret['username'],
+    'PASSWORD': _mysql_secret['password'],
+    # Use the same host as default; the old value 'database.services.atn-stage'
+    # is a private DNS name in a PHZ not resolvable from the ECS VPC
+    # For stage, reading from the primary is acceptable
+    'HOST': _mysql_secret['host'],
+    'PORT': str(_mysql_secret['port']),
+}
+# Do not open a transaction for every view on the slave DB.
+DATABASES['slave']['ATOMIC_REQUESTS'] = False
+DATABASES['slave']['ENGINE'] = 'django.db.backends.mysql'
+# Pool our database connections up for 300 seconds
+DATABASES['slave']['CONN_MAX_AGE'] = 300
+DATABASES['slave']['OPTIONS'] = {'sql_mode': 'STRICT_ALL_TABLES'}
+
+SERVICES_DATABASE = {
+    'NAME': 'addons_mozilla_org',
+    'USER': _mysql_secret['username'],
+    'PASSWORD': _mysql_secret['password'],
+    # Same fix as slave above
+    'HOST': _mysql_secret['host'],
+    'PORT': str(_mysql_secret['port']),
+}
+
+SLAVE_DATABASES = ['slave']
+
+CACHE_MIDDLEWARE_KEY_PREFIX = CACHE_KEY_PREFIX
+
+CACHES = {
+    'filesystem': {
+        'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',
+        'LOCATION': FILESYSTEM_CACHE_ROOT,
+    }
+}
+# NOTE(review): the smoke test labels the cache host "ElastiCache Redis", but the backend below is Memcached — confirm the engine matches the endpoint.
+CACHES['default'] = {
+    'LOCATION': [
+        _cache_host_secret,
+    ]
+}
+CACHES['default']['TIMEOUT'] = 60
+CACHES['default']['BACKEND'] = 'django.core.cache.backends.memcached.MemcachedCache'
+CACHES['default']['KEY_PREFIX'] = CACHE_KEY_PREFIX
+# NOTE(review): assumes the secret is stored as a plain string; get_secret() returns a dict when the stored value parses as JSON — confirm.
+SECRET_KEY = _django_secret
+
+
+# Celery
+AWS_STATS_S3_BUCKET = 'versioncheck-athena-results-stage'
+CELERY_RESULT_BACKEND = _celery_result_backend_secret
+CELERY_BROKER_URL = _celery_broker_secret
+CELERY_TASK_IGNORE_RESULT = True
+CELERY_WORKER_DISABLE_RATE_LIMITS = True
+CELERY_BROKER_CONNECTION_TIMEOUT = 0.5
+
+# Always eager to function with no brokers, remove when adding brokers.
+CELERY_TASK_ALWAYS_EAGER = False +CELERY_ALWAYS_EAGER = False + +LOG_LEVEL = logging.DEBUG + +LOGGING['loggers'].update({ + 'adi.updatecountsfromfile': {'level': logging.INFO}, + 'amqp': {'level': logging.WARNING}, + 'raven': {'level': logging.WARNING}, + 'requests': {'level': logging.WARNING}, + 'z.addons': {'level': logging.DEBUG}, + 'z.task': {'level': logging.DEBUG}, + 'z.redis': {'level': logging.DEBUG}, + 'z.pool': {'level': logging.ERROR}, +}) + +# New Recaptcha V2 +NOBOT_RECAPTCHA_PUBLIC_KEY = _recaptcha_secret['public'] +NOBOT_RECAPTCHA_PRIVATE_KEY = _recaptcha_secret['private'] + +ES_TIMEOUT = 60 +# Note: there is no separate stage ES domain; amo-tb is shared. +ES_HOSTS = [_es_host_secret] +ES_URLS = ['http://%s' % h for h in ES_HOSTS] +ES_INDEXES = dict((k, '%s_%s' % (v, ENV)) for k, v in ES_INDEXES.items()) + +# TODO: STATSD +# STATSD_HOST = env('STATSD_HOST') +# STATSD_PREFIX = env('STATSD_PREFIX') + +# CEF_PRODUCT = STATSD_PREFIX + +NEW_FEATURES = True + +CLEANCSS_BIN = 'node_modules/.bin/cleancss' +UGLIFY_BIN = 'node_modules/.bin/uglifyjs' +ADDONS_LINTER_BIN = 'node_modules/.bin/addons-linter' + +LESS_PREPROCESS = True + +XSENDFILE_HEADER = 'X-Accel-Redirect' + +GOOGLE_ANALYTICS_CREDENTIALS = {} # TODO GOOGLE ANALYTICS +GOOGLE_ANALYTICS_CREDENTIALS['user_agent'] = None +GOOGLE_ANALYTICS_CREDENTIALS['token_expiry'] = datetime.datetime(2013, 1, 3, 1, 20, 16, 45465) # noqa + +GEOIP_URL = 'https://geo.services.mozilla.com' + +# 256-byte AES key file for encrypting developer api keys. Mind the dictionary format. +AES_KEYS = {'api_key:secret': '/data/aeskeys/api_key_secret.key'} + +# Signing +SIGNING_SERVER = '' # TODO SIGNING SERVER? 
+
+SENTRY_DSN = '' # TODO SENTRY
+
+GOOGLE_ANALYTICS_DOMAIN = 'addons-stage.thunderbird.net'
+
+NEWRELIC_ENABLE = False
+# NOTE(review): 'default' and 'amo' share one FxA client; 'internal' has empty credentials and still redirects to a mozaws.net admin host — confirm the internal flow is unused on this stage.
+FXA_CONFIG = {
+    'default': {
+        'client_id': _fxa_secret['client_id'],
+        'client_secret': _fxa_secret['client_secret'],
+        'content_host': 'https://accounts.firefox.com',
+        'oauth_host': 'https://oauth.accounts.firefox.com/v1',
+        'profile_host': 'https://profile.accounts.firefox.com/v1',
+        'redirect_url':
+            'https://%s/api/v3/accounts/authenticate/' % DOMAIN,
+        'scope': 'profile',
+    },
+    'internal': {
+        'client_id': '',
+        'client_secret': '',
+        'content_host': 'https://accounts.firefox.com',
+        'oauth_host': 'https://oauth.accounts.firefox.com/v1',
+        'profile_host': 'https://profile.accounts.firefox.com/v1',
+        'redirect_url':
+            'https://addons-admin.stage.mozaws.net/fxa-authenticate',
+        'scope': 'profile',
+    },
+    'amo': {
+        'client_id': _fxa_secret['client_id'],
+        'client_secret': _fxa_secret['client_secret'],
+        'content_host': 'https://accounts.firefox.com',
+        'oauth_host': 'https://oauth.accounts.firefox.com/v1',
+        'profile_host': 'https://profile.accounts.firefox.com/v1',
+        'redirect_url':
+            'https://addons-stage.thunderbird.net/api/v3/accounts/authenticate/',
+        'scope': 'profile',
+        'skip_register_redirect': True,
+    },
+}
+DEFAULT_FXA_CONFIG_NAME = 'default'
+INTERNAL_FXA_CONFIG_NAME = 'internal'
+ALLOWED_FXA_CONFIGS = ['default', 'amo']
+
+# cors_endpoint_overrides was removed in the Thunderbird fork;
+# CORS is handled via django-cors-headers middleware configuration
+# CORS_ENDPOINT_OVERRIDES = cors_endpoint_overrides([...])
+
+VALIDATOR_TIMEOUT = 360
+
+ES_DEFAULT_NUM_SHARDS = 10
+
+READ_ONLY = env.bool('READ_ONLY', default=False)
+
+# TODO: Github user ?
+GITHUB_API_USER = '' +GITHUB_API_TOKEN = '' + +RECOMMENDATION_ENGINE_URL = env( + 'RECOMMENDATION_ENGINE_URL', +default='https://taar.stage.mozaws.net/api/recommendations/') + +# OVERRIDE +# new base settings below + +ALLOWED_HOSTS = [ + '.thunderbird.net', + '.allizom.org', + '.mozilla.org', + '.mozilla.com', + '.mozilla.net', + '.mozaws.net', +] + +FLIGTAR = 'addons+fligtar-rip@thunderbird.net' +THEMES_EMAIL = 'addons+theme-reviews@thunderbird.net' +ABUSE_EMAIL = 'addons+abuse@thunderbird.net' +NOBODY_EMAIL = 'nobody@thunderbird.net' + +DEFAULT_APP = 'thunderbird' + +# URL paths +# paths for images, e.g. mozcdn.com/amo or '/static' +VAMO_URL = 'https://versioncheck.addons-stage.thunderbird.net' +NEW_PERSONAS_UPDATE_URL = VAMO_URL + '/%(locale)s/themes/update-check/%(id)d' + +# TODO Outgoing URL bouncer +REDIRECT_URL = '' +REDIRECT_SECRET_KEY = '' + +# Allow URLs from these servers. Use full domain names. +REDIRECT_URL_ALLOW_LIST = ['addons-stage.thunderbird.net'] + +# Email settings +ADDONS_EMAIL = "Thunderbird Add-ons " +DEFAULT_FROM_EMAIL = ADDONS_EMAIL + +# Please use all lowercase for the deny_list. +EMAIL_DENY_LIST = ( + 'nobody@thunderbird.net', +) + +# URL for Add-on Validation FAQ. +VALIDATION_FAQ_URL = ('https://wiki.mozilla.org/Add-ons/Reviewers/Guide/' + 'AddonReviews#Step_2:_Automatic_validation') + +# CSP Settings +PROD_CDN_HOST = 'https://addons-stage.thunderbird.net/' +ANALYTICS_HOST = 'https://ssl.google-analytics.com' + +CSP_BASE_URI = ( + "'self'", + # Required for the legacy discovery pane. + 'https://addons-stage.thunderbird.net', +) +CSP_CONNECT_SRC = ( + "'self'", + 'https://sentry.prod.mozaws.net', +) +CSP_FORM_ACTION = ( + "'self'", + 'https://developer.mozilla.org', +) +CSP_FONT_SRC = ( + "'self'", + PROD_CDN_HOST, +) +CSP_CHILD_SRC = ( + "'self'", + 'https://www.google.com/recaptcha/', +) +CSP_FRAME_SRC = CSP_CHILD_SRC +CSP_IMG_SRC = ( + "'self'", + 'data:', # Used in inlined mobile css. + 'blob:', # Needed for image uploads. 
+ ANALYTICS_HOST, + PROD_CDN_HOST, + 'https://static.addons.mozilla.net', # CDN origin server. + 'https://sentry.prod.mozaws.net', +) +CSP_MEDIA_SRC = ( + 'https://videos.cdn.mozilla.net', +) +CSP_OBJECT_SRC = ("'none'",) + +CSP_SCRIPT_SRC = ( + 'https://ssl.google-analytics.com/ga.js', + 'https://www.google.com/recaptcha/', + 'https://www.gstatic.com/recaptcha/', + PROD_CDN_HOST, +) +CSP_STYLE_SRC = ( + "'self'", + "'unsafe-inline'", + PROD_CDN_HOST, +) + +# An approved list of domains that the authentication script will redirect to +# upon successfully logging in or out. +VALID_LOGIN_REDIRECTS = { + 'builder': 'https://builder.addons.mozilla.org', + 'builderstage': 'https://builder-addons.allizom.org', + 'buildertrunk': 'https://builder-addons-dev.allizom.org', +} + +# Blog URL +DEVELOPER_BLOG_URL = 'http://blog.mozilla.com/addons/feed/' + diff --git a/src/olympia/amo/management/commands/ro_healthcheck.py b/src/olympia/amo/management/commands/ro_healthcheck.py new file mode 100644 index 000000000000..d4b07ddc2bd3 --- /dev/null +++ b/src/olympia/amo/management/commands/ro_healthcheck.py @@ -0,0 +1,266 @@ +# -*- coding: utf-8 -*- +""" +Read-only health check for ECS deployment validation + +Validates that the Django application can boot, connect to all backend +services, and execute read-only operations in the ECS Fargate environment. 
+This sits between the infrastructure smoke test (TCP connectivity) and +running the full application (read-only and write operations) + +What this checks + - Container has correct Python path, deps, and module loading + - Django settings import works end-to-end (including Secrets Manager) + - ORM can connect to MySQL and execute SELECT queries + - Cache backend (Redis/Memcached) initialises and responds + - Celery broker (RabbitMQ) is reachable + - Elasticsearch/OpenSearch client can connect + +What this does NOT do: + - Write to any database, cache, queue, or search index + - Modify any state anywhere + - Run migrations + +Sample Usage: + python manage.py ro_healthcheck # Run all checks + python manage.py ro_healthcheck --json # Output as JSON +""" + +import json as json_module +import sys +import time + +from django.core.management.base import BaseCommand +from django.conf import settings + +import olympia.core.logger + +log = olympia.core.logger.getLogger("z.ro_healthcheck") + + +class Command(BaseCommand): + help = "Read-only health check for ECS deployment validation" + + def add_arguments(self, parser): + parser.add_argument( + "--json", + action="store_true", + help="Output results as JSON", + ) + + def handle(self, *args, **options): + results = {} + output_json = options.get("json", False) + + # ----------------------------------------------------------------- + # 1. Django settings loaded (if we got here, this already passed) + # ----------------------------------------------------------------- + results["settings"] = { + "description": "Django settings import", + "status": "PASS", + "message": f"DJANGO_SETTINGS_MODULE={settings.SETTINGS_MODULE}", + } + + # ----------------------------------------------------------------- + # 2. Database connectivity (read-only) + # ----------------------------------------------------------------- + results["database"] = self._check_database() + + # ----------------------------------------------------------------- + # 3. 
Cache backend + # ----------------------------------------------------------------- + results["cache"] = self._check_cache() + + # ----------------------------------------------------------------- + # 4. Celery broker (RabbitMQ) + # ----------------------------------------------------------------- + results["celery_broker"] = self._check_celery_broker() + + # ----------------------------------------------------------------- + # 5. Elasticsearch / OpenSearch + # ----------------------------------------------------------------- + results["elasticsearch"] = self._check_elasticsearch() + + # ----------------------------------------------------------------- + # Output + # ----------------------------------------------------------------- + if output_json: + self.stdout.write(json_module.dumps(results, indent=2)) + else: + self._print_results(results) + + has_failures = any(r["status"] == "FAIL" for r in results.values()) + if has_failures: + sys.exit(1) + + def _check_database(self): + """Connect to MySQL and run a read-only query via ORM. 
+ + Sets session to transaction_read_only BEFORE any ORM work, + and fails fast if read-only mode cannot just be enforced + """ + try: + from django.db import connections + + start = time.time() + conn = connections["default"] + conn.ensure_connection() + cursor = conn.cursor() + + # Force read-only session BEFORE any ORM work + cursor.execute("SET SESSION transaction_read_only = 1;") + + # Verify read-only mode is actually active + cursor.execute("SELECT @@session.transaction_read_only;") + ro_flag = cursor.fetchone()[0] + if ro_flag != 1: + cursor.close() + return { + "description": "MySQL database (read-only enforcement)", + "status": "FAIL", + "message": "Could not enforce read-only session", + } + + # Now safe to run ORM queries -- writes would be rejected by MySQL + from olympia.addons.models import Addon + + count = Addon.objects.count() + latency_ms = (time.time() - start) * 1000 + + # Clean up + cursor.execute("SET SESSION transaction_read_only = 0;") + cursor.close() + + return { + "description": "MySQL database (read-only ORM query)", + "status": "PASS", + "message": f"Connected, {count} addons ({latency_ms:.0f}ms)", + } + except Exception as e: + # Include configured host for diagnostics + try: + db_host = settings.DATABASES.get("default", {}).get("HOST", "not set") + db_engine = settings.DATABASES.get("default", {}).get("ENGINE", "not set") + diag = f" [configured: engine={db_engine}, host={db_host}]" + except Exception: + diag = "" + return { + "description": "MySQL database (read-only ORM query)", + "status": "FAIL", + "message": f"{e}{diag}", + } + + def _check_cache(self): + """Verify Django cache backend can connect and respond""" + try: + from django.core.cache import cache + + start = time.time() + + # Use a harmless get (returns None if key doesn't exist) + # This exercises the full cache client initialisation path + cache.get("ro_healthcheck_probe") + latency_ms = (time.time() - start) * 1000 + + backend = settings.CACHES.get("default", 
{}).get( + "BACKEND", "unknown" + ) + + return { + "description": "Cache backend", + "status": "PASS", + "message": f"Backend: {backend} ({latency_ms:.0f}ms)", + } + except Exception as e: + return { + "description": "Cache backend", + "status": "FAIL", + "message": str(e), + } + + def _check_celery_broker(self): + """Verify Celery can connect to the broker (RabbitMQ). + + Uses ensure_connection with a short timeout + """ + try: + from olympia.amo.celery import app as celery_app + + start = time.time() + conn = celery_app.connection() + conn.ensure_connection(max_retries=1, timeout=5) + conn.close() + latency_ms = (time.time() - start) * 1000 + + return { + "description": "Celery broker (RabbitMQ)", + "status": "PASS", + "message": f"Connected ({latency_ms:.0f}ms)", + } + except Exception as e: + # Strip any connection details from the error + error_msg = str(e).split("@")[-1] if "@" in str(e) else str(e) + return { + "description": "Celery broker (RabbitMQ)", + "status": "FAIL", + "message": error_msg, + } + + def _check_elasticsearch(self): + """Verify Elasticsearch/OpenSearch client can connect. 
+ + Calls es.info() which is a read-only cluster metadata endpoint + """ + try: + # olympia.lib.es.utils provides helper functions for reindexing, + # but the canonical ES client factory lives in olympia.amo.search + # We'd import from there to avoid ImportError and ensure consistency + from olympia.amo.search import get_es + + start = time.time() + es = get_es() + info = es.info(request_timeout=5) + latency_ms = (time.time() - start) * 1000 + + version = info.get("version", {}).get("number", "unknown") + + return { + "description": "Elasticsearch / OpenSearch", + "status": "PASS", + "message": f"Reachable, version: {version} ({latency_ms:.0f}ms)", + } + except Exception as e: + # ES may require SigV4 auth or be unavailable; degrade gracefully + error_type = type(e).__name__ + return { + "description": "Elasticsearch / OpenSearch", + "status": "FAIL", + "message": f"{error_type}: {e}", + } + + def _print_results(self, results): + """Print results in human-readable format""" + self.stdout.write("") + self.stdout.write("=" * 70) + self.stdout.write("ATN Read-Only Health Check (ECS Deployment Validation)") + self.stdout.write("=" * 70) + + passed = 0 + failed = 0 + + for name, result in results.items(): + status = result["status"] + icon = "[OK]" if status == "PASS" else "[FAIL]" + + if status == "PASS": + passed += 1 + else: + failed += 1 + + self.stdout.write(f"\n {icon} {result['description']}") + self.stdout.write(f" {result['message']}") + + self.stdout.write("") + self.stdout.write("-" * 70) + self.stdout.write(f" Results: {passed} passed, {failed} failed") + self.stdout.write("=" * 70) + self.stdout.write("")