diff --git a/.github/prompts/aks-check-nodes.prompt.md b/.github/prompts/aks-check-nodes.prompt.md new file mode 100644 index 0000000..3d94395 --- /dev/null +++ b/.github/prompts/aks-check-nodes.prompt.md @@ -0,0 +1,37 @@ +--- +model: Claude Sonnet 4 +description: 'This prompt is used to check the health status of nodes in an Azure Kubernetes Service (AKS) cluster.' +--- + +# Check for AKS Nodes Health Issues + +Check the health status of all nodes in an Azure Kubernetes Service (AKS) cluster and identify any nodes that are not in a 'Ready' state. Provide a summary of the issues found and suggest possible remediation steps. + +### Run these Commands + +```bash +kubectl get nodes +kubectl describe node +kubectl top nodes +kubectl cluster-info +``` + + +### Output +Output a report in a readable format (e.g., plain text, JSON) that includes: +- Cluster Name +- Node Name +- Node Status +- Issues Found (if any) +- Suggested Remediation Steps + +### Remediation Suggestions +For nodes that are not in the 'Ready' state, suggest possible remediation steps such as: +- Checking for resource constraints (CPU, memory) +- Reviewing node logs for errors +- Scaling the cluster if resource limits are being hit +- Contacting Azure support if the issue persists + +### Note +Ensure that you have the necessary permissions to access the AKS clusters and perform the required operations. +Do not generate any scripts. \ No newline at end of file diff --git a/.github/prompts/aks-check-pods.prompt.md b/.github/prompts/aks-check-pods.prompt.md new file mode 100644 index 0000000..aefdcc0 --- /dev/null +++ b/.github/prompts/aks-check-pods.prompt.md @@ -0,0 +1,35 @@ +--- +model: Claude Sonnet 4.5 +description: 'This prompt is used to check the health status of pods in an Azure Kubernetes Service (AKS) cluster.' +--- + +# Check for Pod Health Issues + +Check the health status of all pods in an Azure Kubernetes Service (AKS) cluster and identify any pods that are not in a 'Running' state. 
Provide a summary of the issues found and suggest possible remediation steps. + +### Run these Commands + +```bash +kubectl get pods -n <namespace> +kubectl describe pod <pod-name> -n <namespace> +kubectl logs <pod-name> -n <namespace> +``` + +### Output +Output a report in a readable format (e.g., plain text, JSON) that includes: +- Cluster Name +- Pod Name +- Pod Status +- Issues Found (if any) +- Suggested Remediation Steps + +### Remediation Suggestions +For pods that are not in the 'Running' state, suggest possible remediation steps such as: +- Checking for resource constraints (CPU, memory) +- Reviewing pod logs for errors +- Scaling the cluster if resource limits are being hit +- Redeploying the pod if it is in a crash loop + +### Note +Do not generate any scripts. +Do not directly fix the issues; only provide analysis and suggestions. \ No newline at end of file diff --git a/.github/prompts/aks-remediation.prompt.md b/.github/prompts/aks-remediation.prompt.md new file mode 100644 index 0000000..e2ddb20 --- /dev/null +++ b/.github/prompts/aks-remediation.prompt.md @@ -0,0 +1,15 @@ +--- +model: Claude Sonnet 4.5 +description: 'This prompt is used to provide remediation suggestions for pods in an Azure Kubernetes Service (AKS) cluster.' +--- + +# AKS Remediation for cluster issues + +Provide remediation based on analysis and suggestions from the previous steps. + +### Proposed Remediation Steps +Be specific in your remediation suggestions, including commands to run, configuration changes to make, or resources to consult. Tailor the suggestions based on the identified issues. + +# Notes +- Do not generate any scripts. +- Always ask for confirmation before applying any remediation steps. 
diff --git a/.github/workflows/argocd-deployment-failure.yml b/.github/workflows/argocd-deployment-failure.yml index 596891b..bc095d9 100644 --- a/.github/workflows/argocd-deployment-failure.yml +++ b/.github/workflows/argocd-deployment-failure.yml @@ -13,102 +13,87 @@ jobs: runs-on: ubuntu-latest steps: + - name: Verify webhook signature + id: verify + env: + PAYLOAD: ${{ toJson(github.event.client_payload) }} + WEBHOOK_SECRET: ${{ secrets.ARGOCD_WEBHOOK_SECRET }} + run: | + # This is a placeholder - GitHub repository_dispatch doesn't include signatures + # The security comes from the GitHub token scope limitation + echo "Webhook received from ArgoCD" + echo "App: ${{ github.event.client_payload.app_name }}" + echo "Status: ${{ github.event.client_payload.operation_phase }}" + + - name: Extract deployment info + id: deployment_info + run: | + APP_NAME="${{ github.event.client_payload.app_name }}" + HEALTH_STATUS="${{ github.event.client_payload.health_status }}" + SYNC_STATUS="${{ github.event.client_payload.sync_status }}" + REVISION="${{ github.event.client_payload.revision }}" + MESSAGE="${{ github.event.client_payload.message }}" + REPO_URL="${{ github.event.client_payload.repo_url }}" + TIMESTAMP="${{ github.event.client_payload.timestamp }}" + + echo "app_name=${APP_NAME}" >> $GITHUB_OUTPUT + echo "health_status=${HEALTH_STATUS}" >> $GITHUB_OUTPUT + echo "sync_status=${SYNC_STATUS}" >> $GITHUB_OUTPUT + echo "revision=${REVISION}" >> $GITHUB_OUTPUT + - name: Create GitHub Issue uses: actions/github-script@v7 with: script: | - const payload = context.payload.client_payload || {}; - const appName = payload.app_name || 'unknown'; - const clusterName = payload.cluster || 'in-cluster'; - const namespace = payload.namespace || 'default'; - const healthStatus = payload.health_status || 'unknown'; - const syncStatus = payload.sync_status || 'unknown'; - const message = payload.message || 'No error message available'; - const revision = payload.revision || 'unknown'; 
- const repoUrl = payload.repo_url || ''; - const timestamp = payload.timestamp || new Date().toISOString(); - const resources = payload.resources || []; - - // Build degraded resources section - let degradedDetails = ''; - const degradedResources = resources.filter(r => - r.health && (r.health.status === 'Degraded' || r.health.status === 'Missing' || r.health.status === 'Unknown') - ); - - if (degradedResources.length > 0) { - degradedDetails = '\n### πŸ”΄ Degraded Resources\n\n'; - - for (const resource of degradedResources) { - const kind = resource.kind || 'Unknown'; - const name = resource.name || 'unknown'; - const resourceNamespace = resource.namespace || namespace; - const healthStatus = resource.health?.status || 'Unknown'; - const healthMessage = resource.health?.message || 'No message'; - const syncStatus = resource.status || 'Unknown'; - - degradedDetails += `#### ${kind}: \`${name}\`\n\n`; - degradedDetails += `- **Namespace:** ${resourceNamespace}\n`; - degradedDetails += `- **Health Status:** ${healthStatus}\n`; - degradedDetails += `- **Sync Status:** ${syncStatus}\n`; - degradedDetails += `- **Message:** ${healthMessage}\n\n`; - - // Add kubectl command for this specific resource - degradedDetails += `**Troubleshoot:**\n\`\`\`bash\n`; - degradedDetails += `kubectl describe ${kind.toLowerCase()} ${name} -n ${resourceNamespace}\n`; - if (kind === 'Pod' || kind === 'Deployment' || kind === 'StatefulSet' || kind === 'DaemonSet') { - degradedDetails += `kubectl logs ${kind.toLowerCase()}/${name} -n ${resourceNamespace}\n`; - } - degradedDetails += `\`\`\`\n\n`; - } - } + const appName = '${{ github.event.client_payload.app_name }}'; + const healthStatus = '${{ github.event.client_payload.health_status }}'; + const syncStatus = '${{ github.event.client_payload.sync_status }}'; + const operationPhase = '${{ github.event.client_payload.operation_phase }}'; + const message = '${{ github.event.client_payload.message }}'; + const revision = '${{ 
github.event.client_payload.revision }}'; + const repoUrl = '${{ github.event.client_payload.repo_url }}'; + const timestamp = '${{ github.event.client_payload.timestamp }}'; + const clusterName = '${{ github.event.client_payload.cluster_name }}'; + const clusterServer = '${{ github.event.client_payload.cluster_server }}'; + const destNamespace = '${{ github.event.client_payload.destination_namespace }}'; const issueTitle = `🚨 ArgoCD Deployment Failed: ${appName}`; const issueBody = `## ArgoCD Deployment Failure **Application:** \`${appName}\` + **Status:** ${operationPhase} **Timestamp:** ${timestamp} - ### Cluster Information - - | Field | Value | - |-------|-------| - | Cluster Name | \`${clusterName}\` | - | Namespace | \`${namespace}\` | - - ### Application Status + ### Details | Field | Value | |-------|-------| + | Cluster | \`${clusterName || clusterServer}\` | + | Namespace | \`${destNamespace}\` | | Health Status | \`${healthStatus}\` | | Sync Status | \`${syncStatus}\` | | Revision | \`${revision}\` | | Repository | ${repoUrl} | + ### Raw payload + \`\`\`json + ${JSON.stringify(github.event.client_payload, null, 2)} + \`\`\` + ### Error Message \`\`\` - ${message} + ${message || 'No error message available'} \`\`\` - ${degradedDetails} - ### Troubleshooting Commands - - \`\`\`bash - # Check application status in ArgoCD - argocd app get ${appName} - # Check pods in namespace - kubectl get pods -n ${namespace} + ### Recommended Actions - # Describe failed pods - kubectl describe pods -n ${namespace} - - # Get pod logs - kubectl logs -n ${namespace} - - # Check events - kubectl get events -n ${namespace} --sort-by='.lastTimestamp' - \`\`\` + 1. Check the ArgoCD UI for detailed error logs + 2. Review the application manifest for syntax errors + 3. Verify resource quotas and limits + 4. Check for image pull errors or missing secrets + 5. 
Review recent commits to the source repository ### Quick Links diff --git a/Act-3/README.md b/Act-3/README.md new file mode 100644 index 0000000..c9c80a6 --- /dev/null +++ b/Act-3/README.md @@ -0,0 +1,35 @@ +# Act-3: Kubernetes Operations Don’t Scale Linearly + +Problem: +Kubernetes becomes the operational choke point and your team is having a hard time dealing with misconfigurations, failed deployments and runtime issues. +Your team, platform engineering, is busy firefighting instead of improving the platform. The deep Kubernetes expertise on your team doesn't scale across teams. + +Answer: +Let agents give your team a hand, turning siloed operational knowledge into a shared capability. + +## Crawl + +A senior member of the team (Steve) has created a reusable prompt that can be run on demand when someone needs to troubleshoot a container workload on an AKS cluster. Steve made this available in the repo and this can be used in GitHub Copilot in VSCode via "Slash Commands" if you follow the folder/naming convention set out by GitHub/VScode (i.e. `/.github/prompts/<name>.prompt.md`). + +Execute this prompt locally: + +![write-prompt](images/write-prompt.png) + +## Walk/Run + +Create a GitHub Action Workflow that will run on each push to the repo. For this example it will be just for the main branch, but you can set up the triggers/rules for when the workflow gets run. See the docs about [Events That Trigger Workflows](https://docs.github.com/en/actions/reference/workflows-and-actions/events-that-trigger-workflows). + +> [!NOTE] +> We will use the GitHub Copilot CLI to automate the execution of our custom prompt in a scripted CI Runner - GitHub Actions. + +We have an example of this in [Act-2 .github/workflows](../.github/workflows/copilot.generate-docs.yml). + +### What does this do? + +- The GitHub Action Workflow triggers on each push to the main branch - this ensures that documentation is created, if and when needed, regardless of whether you remembered or not. 
This ensures that all team members have docs created for them, even if they did not run the `/write-docs` prompt manually before committing their changes. It also can be run manually in GitHub Actions since it also has the `workflow_dispatch` trigger enabled...this is optional of course but we have it here as an example anyways. +- It installs the GitHub Copilot CLI +- It ensures that we provide it credentials to call GitHub Copilot +> [!NOTE] +> Currently calling GitHub Copilot is a User only ability - meaning that GitHub Copilot is licensed to and therefore only callable by a human user account. In this example we have stored a Fine-Grained GitHub Personal Access Token (PAT -> a user bound API Key) that has been scoped with the `Copilot-Requests: Read-only` Permission. As such this will consume GitHub Copilot PRUs (Premium Request Units) from the tied user account. Today this is the only billing model to consume GitHub Copilot. +- Store the required prompt file contents as an environment variable +- Pass in the prompt and call GitHub Copilot CLI to generate docs \ No newline at end of file diff --git a/Act-3/aks-store-all-in-one.yaml b/Act-3/aks-store-all-in-one.yaml new file mode 100644 index 0000000..e2c10c1 --- /dev/null +++ b/Act-3/aks-store-all-in-one.yaml @@ -0,0 +1,614 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: mongodb +spec: + serviceName: mongodb + replicas: 1 + selector: + matchLabels: + app: mongodb + template: + metadata: + labels: + app: mongodb + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: mongodb + image: mcr.microsoft.com/mirror/docker/library/mongo:4.2 + ports: + - containerPort: 27017 + name: mongodb + resources: + requests: + cpu: 5m + memory: 75Mi + limits: + cpu: 25m + memory: 1024Mi + livenessProbe: + exec: + command: + - mongo + - "--eval" + - db.runCommand('ping').ok + initialDelaySeconds: 30 + periodSeconds: 10 + timeoutSeconds: 5 + failureThreshold: 3 + readinessProbe: + exec: + command: + 
- mongo + - "--eval" + - db.runCommand('ping').ok + initialDelaySeconds: 5 + periodSeconds: 5 + timeoutSeconds: 5 + failureThreshold: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: mongodb +spec: + ports: + - port: 27017 + selector: + app: mongodb + type: ClusterIP +--- +apiVersion: v1 +data: + rabbitmq_enabled_plugins: | + [rabbitmq_management,rabbitmq_prometheus,rabbitmq_amqp1_0]. +kind: ConfigMap +metadata: + name: rabbitmq-enabled-plugins +--- +apiVersion: v1 +kind: Secret +metadata: + name: rabbitmq-secrets +data: + RABBITMQ_DEFAULT_USER: dXNlcm5hbWU= + RABBITMQ_DEFAULT_PASS: cGFzc3dvcmQ= +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: rabbitmq +spec: + serviceName: rabbitmq + replicas: 1 + selector: + matchLabels: + app: rabbitmq + template: + metadata: + labels: + app: rabbitmq + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: rabbitmq + image: mcr.microsoft.com/azurelinux/base/rabbitmq-server:3.13 + ports: + - containerPort: 5672 + name: rabbitmq-amqp + - containerPort: 15672 + name: rabbitmq-http + envFrom: + - secretRef: + name: rabbitmq-secrets + resources: + requests: + cpu: 10m + memory: 128Mi + limits: + cpu: 250m + memory: 256Mi + startupProbe: + tcpSocket: + port: 5672 + failureThreshold: 30 + initialDelaySeconds: 10 + periodSeconds: 10 + readinessProbe: + tcpSocket: + port: 5672 + failureThreshold: 3 + initialDelaySeconds: 5 + periodSeconds: 5 + livenessProbe: + tcpSocket: + port: 5672 + failureThreshold: 3 + initialDelaySeconds: 30 + periodSeconds: 10 + volumeMounts: + - name: rabbitmq-enabled-plugins + mountPath: /etc/rabbitmq/enabled_plugins + subPath: enabled_plugins + volumes: + - name: rabbitmq-enabled-plugins + configMap: + name: rabbitmq-enabled-plugins + items: + - key: rabbitmq_enabled_plugins + path: enabled_plugins +--- +apiVersion: v1 +kind: Service +metadata: + name: rabbitmq +spec: + selector: + app: rabbitmq + ports: + - name: rabbitmq-amqp + port: 5672 + targetPort: 5672 + - name: 
rabbitmq-http + port: 15672 + targetPort: 15672 + type: ClusterIP +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: order-service-configs +data: + ORDER_QUEUE_PORT: "5672" + ORDER_QUEUE_HOSTNAME: "rabbitmq" + ORDER_QUEUE_NAME: "orders" + FASTIFY_ADDRESS: "0.0.0.0" +--- +apiVersion: v1 +kind: Secret +metadata: + name: order-service-secrets +data: + ORDER_QUEUE_USERNAME: dXNlcm5hbWU= + ORDER_QUEUE_PASSWORD: cGFzc3dvcmQ= +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: order-service +spec: + replicas: 1 + selector: + matchLabels: + app: order-service + template: + metadata: + labels: + app: order-service + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: order-service + image: ghcr.io/azure-samples/aks-store-demo/ordr-service:2.1.0 + ports: + - containerPort: 3000 + envFrom: + - configMapRef: + name: order-service-configs + - secretRef: + name: order-service-secrets + resources: + requests: + cpu: 1m + memory: 50Mi + limits: + cpu: 100m + memory: 256Mi + startupProbe: + httpGet: + path: /health + port: 3000 + failureThreshold: 5 + initialDelaySeconds: 20 + periodSeconds: 10 + readinessProbe: + httpGet: + path: /health + port: 3000 + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 5 + livenessProbe: + httpGet: + path: /health + port: 3000 + failureThreshold: 5 + initialDelaySeconds: 3 + periodSeconds: 3 + initContainers: + - name: wait-for-rabbitmq + image: busybox:1.37.0 + command: + ["sh", "-c", "until nc -zv rabbitmq 5672; do echo waiting for rabbitmq; sleep 2; done;"] + resources: + requests: + cpu: 1m + memory: 50Mi + limits: + cpu: 100m + memory: 256Mi +--- +apiVersion: v1 +kind: Service +metadata: + name: order-service +spec: + type: ClusterIP + ports: + - name: http + port: 3000 + targetPort: 3000 + selector: + app: order-service +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: makeline-service +spec: + replicas: 1 + selector: + matchLabels: + app: makeline-service + template: + metadata: 
+ labels: + app: makeline-service + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: makeline-service + image: ghcr.io/azure-samples/aks-store-demo/makeline-service:2.1.0 + ports: + - containerPort: 3001 + env: + - name: ORDER_QUEUE_URI + value: "amqp://rabbitmq:5672" + - name: ORDER_QUEUE_USERNAME + value: "username" + - name: ORDER_QUEUE_PASSWORD + value: "password" + - name: ORDER_QUEUE_NAME + value: "orders" + - name: ORDER_DB_URI + value: "mongodb://mongodb:27017" + - name: ORDER_DB_NAME + value: "orderdb" + - name: ORDER_DB_COLLECTION_NAME + value: "orders" + resources: + requests: + cpu: 1m + memory: 6Mi + limits: + cpu: 5m + memory: 20Mi + startupProbe: + httpGet: + path: /health + port: 3001 + failureThreshold: 10 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 3001 + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 5 + livenessProbe: + httpGet: + path: /health + port: 3001 + failureThreshold: 5 + initialDelaySeconds: 3 + periodSeconds: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: makeline-service +spec: + type: ClusterIP + ports: + - name: http + port: 3001 + targetPort: 3001 + selector: + app: makeline-service +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: product-service +spec: + replicas: 1 + selector: + matchLabels: + app: product-service + template: + metadata: + labels: + app: product-service + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: product-service + image: ghcr.io/azure-samples/aks-store-demo/product-service:2.1.0 + ports: + - containerPort: 3002 + env: + - name: AI_SERVICE_URL + value: "http://ai-service:5001/" + resources: + requests: + cpu: 1m + memory: 1Mi + limits: + cpu: 2m + memory: 20Mi + readinessProbe: + httpGet: + path: /health + port: 3002 + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 5 + livenessProbe: + httpGet: + path: /health + port: 3002 + failureThreshold: 5 + initialDelaySeconds: 3 + 
periodSeconds: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: product-service +spec: + type: ClusterIP + ports: + - name: http + port: 3002 + targetPort: 3002 + selector: + app: product-service +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: store-front +spec: + replicas: 1 + selector: + matchLabels: + app: store-front + template: + metadata: + labels: + app: store-front + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: store-front + image: ghcr.io/azure-samples/aks-store-demo/store-front:2.1.0 + ports: + - containerPort: 8080 + name: store-front + resources: + requests: + cpu: 1m + memory: 200Mi + limits: + cpu: 1000m + memory: 512Mi + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 3 + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + initialDelaySeconds: 3 + periodSeconds: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: store-front +spec: + ports: + - port: 80 + targetPort: 8080 + selector: + app: store-front + type: LoadBalancer +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: store-admin +spec: + replicas: 1 + selector: + matchLabels: + app: store-admin + template: + metadata: + labels: + app: store-admin + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: store-admin + image: ghcr.io/azure-samples/aks-store-demo/store-admin:2.1.0 + ports: + - containerPort: 8081 + name: store-admin + resources: + requests: + cpu: 1m + memory: 200Mi + limits: + cpu: 1000m + memory: 512Mi + startupProbe: + httpGet: + path: /health + port: 8081 + failureThreshold: 3 + initialDelaySeconds: 5 + periodSeconds: 5 + readinessProbe: + httpGet: + path: /health + port: 8081 + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 5 + livenessProbe: + httpGet: + 
path: /health + port: 8081 + failureThreshold: 5 + initialDelaySeconds: 3 + periodSeconds: 3 +--- +apiVersion: v1 +kind: Service +metadata: + name: store-admin +spec: + ports: + - port: 80 + targetPort: 8081 + selector: + app: store-admin + type: LoadBalancer +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: virtual-customer +spec: + replicas: 1 + selector: + matchLabels: + app: virtual-customer + template: + metadata: + labels: + app: virtual-customer + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: virtual-customer + image: ghcr.io/azure-samples/aks-store-demo/virtual-customer:2.1.0 + env: + - name: ORDER_SERVICE_URL + value: http://order-service:3000/ + - name: ORDERS_PER_HOUR + value: "100" + resources: + requests: + cpu: 1m + memory: 1Mi + limits: + cpu: 2m + memory: 20Mi + readinessProbe: + exec: + command: + - cat + - /proc/1/status + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 5 + livenessProbe: + exec: + command: + - cat + - /proc/1/status + failureThreshold: 5 + initialDelaySeconds: 10 + periodSeconds: 10 +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: virtual-worker +spec: + replicas: 1 + selector: + matchLabels: + app: virtual-worker + template: + metadata: + labels: + app: virtual-worker + spec: + nodeSelector: + "kubernetes.io/os": linux + containers: + - name: virtual-worker + image: ghcr.io/azure-samples/aks-store-demo/virtual-worker:2.1.0 + env: + - name: MAKELINE_SERVICE_URL + value: http://makeline-service:3001 + - name: ORDERS_PER_HOUR + value: "100" + resources: + requests: + cpu: 1m + memory: 1Mi + limits: + cpu: 2m + memory: 20Mi + readinessProbe: + exec: + command: + - cat + - /proc/1/status + failureThreshold: 3 + initialDelaySeconds: 3 + periodSeconds: 5 + livenessProbe: + exec: + command: + - cat + - /proc/1/status + failureThreshold: 5 + initialDelaySeconds: 10 + periodSeconds: 10 diff --git a/Act-3/argocd/SETUP.md b/Act-3/argocd/SETUP.md new file mode 100644 index 
0000000..1601307 --- /dev/null +++ b/Act-3/argocd/SETUP.md @@ -0,0 +1,201 @@ +# ArgoCD GitHub Issue Creation Setup Guide + +This guide sets up automatic GitHub issue creation when ArgoCD deployments fail. + +## Architecture + +``` +ArgoCD Deployment Failure β†’ ArgoCD Notifications β†’ GitHub Repository Dispatch β†’ GitHub Actions β†’ Create Issue +``` + +## Files Created + +1. `.github/argocd/argocd-notifications-config.yaml` - ArgoCD notification configuration +2. `.github/workflows/argocd-deployment-failure.yml` - GitHub Actions workflow + +## Setup Steps + +### 1. Create a GitHub Personal Access Token + +1. Go to https://github.com/settings/tokens?type=beta +2. Click "Generate new token" β†’ "Fine-grained personal access token" +3. Configure: + - **Name:** `ArgoCD Notifications` + - **Repository access:** Select your target repository + - **Permissions:** + - Repository permissions β†’ Contents: Read-only + - Repository permissions β†’ Metadata: Read-only (automatically selected) + - Repository permissions β†’ Actions: Read and write (for repository_dispatch) + - Repository permissions β†’ Issues: Read and write +4. Click "Generate token" and copy it (starts with `github_pat_...`) + +### 2. Add Secrets to Kubernetes + +```bash +# Add your GitHub token to ArgoCD notifications +kubectl patch secret argocd-notifications-secret -n argocd -p='{"stringData":{"github-token":"YOUR_GITHUB_TOKEN_HERE"}}' +``` + +### 3. 
Set Environment Variables in Config + +Edit the ArgoCD notifications ConfigMap to set your GitHub owner and repo: + +```bash +# Replace with your values +export GITHUB_OWNER="your-github-username-or-org" +export GITHUB_REPO="your-repo-name" + +# Update the webhook URL +kubectl patch configmap argocd-notifications-cm -n argocd --type=merge -p="{\"data\":{\"service.webhook.github-webhook\":\"url: https://api.github.com/repos/${GITHUB_OWNER}/${GITHUB_REPO}/dispatches\nheaders:\n- name: Accept\n value: application/vnd.github+json\n- name: Authorization\n value: Bearer \$github-token\n- name: X-GitHub-Api-Version\n value: '2022-11-28'\n- name: Content-Type\n value: application/json\"}}" +``` + +Or manually edit and apply the config file: + +```bash +# Edit the file +vi .github/argocd/argocd-notifications-config.yaml + +# Replace $GITHUB_OWNER and $GITHUB_REPO with your actual values + +# Apply it +kubectl apply -f .github/argocd/argocd-notifications-config.yaml +``` + +### 4. Add Secret to GitHub Repository + +1. Go to your GitHub repository → Settings → Secrets and variables → Actions +2. Click "New repository secret" +3. Name: `ARGOCD_WEBHOOK_SECRET` +4. Value: your generated webhook secret (the same value you stored in the `argocd-notifications-secret` Kubernetes secret; never commit it to the repository) +5. Click "Add secret" + +### 5. Commit and Push the Workflow + +```bash +cd /path/to/agentic-platform-engineering + +# Add the files +git add .github/workflows/argocd-deployment-failure.yml +git add .github/argocd/argocd-notifications-config.yaml + +# Commit +git commit -m "Add ArgoCD deployment failure notification workflow" + +# Push +git push +``` + +### 6. Enable Notifications on Your ArgoCD Applications + +Add annotations to your ArgoCD Application manifests: + +```yaml +apiVersion: argoproj.io/v1alpha1 +kind: Application +metadata: + name: my-app + namespace: argocd + annotations: + notifications.argoproj.io/subscribe.on-sync-failed.github-webhook: "" + notifications.argoproj.io/subscribe.on-health-degraded.github-webhook: "" +spec: + # ... 
rest of your application spec +``` + +Or use the ArgoCD CLI: + +```bash +# Subscribe to sync failed notifications +argocd app patch my-app --patch='{"metadata":{"annotations":{"notifications.argoproj.io/subscribe.on-sync-failed.github-webhook":""}}}' + +# Subscribe to health degraded notifications +argocd app patch my-app --patch='{"metadata":{"annotations":{"notifications.argoproj.io/subscribe.on-health-degraded.github-webhook":""}}}' +``` + +## Testing + +### Test the notification system: + +1. Deploy a broken application to trigger a failure +2. Check ArgoCD notifications controller logs: + ```bash + kubectl logs -n argocd -l app.kubernetes.io/name=argocd-notifications-controller -f + ``` +3. Verify the webhook was sent to GitHub +4. Check GitHub Actions workflow run +5. Verify issue was created in your repository + +### Manual test without breaking a deployment: + +```bash +# Send a test notification +kubectl exec -n argocd deployment/argocd-notifications-controller -- \ + argocd-notifications trigger on-sync-failed \ + --app my-app +``` + +## What Happens on Deployment Failure + +1. ArgoCD detects sync failure or degraded health +2. ArgoCD Notifications sends webhook to GitHub repository_dispatch +3. GitHub Actions workflow is triggered +4. Workflow checks for existing open issues for the same app + - If exists: Adds comment with new failure details + - If not: Creates new issue with full details +5. 
Issue includes: + - Error message + - Revision/commit that failed + - Health and sync status + - Recommended remediation steps + - Links to ArgoCD UI and source repository + +## Security Features + +- ✅ Fine-grained GitHub token with minimal permissions +- ✅ Token stored in Kubernetes secret (not in code) +- ✅ Webhook secret for signature verification +- ✅ Automatic duplicate issue detection +- ✅ Labels for easy filtering: `argocd-deployment-failure`, `automated`, `bug` + +## Troubleshooting + +### Notifications not being sent: + +```bash +# Check notifications controller logs +kubectl logs -n argocd -l app.kubernetes.io/name=argocd-notifications-controller + +# Verify the secret has the GitHub token +kubectl get secret argocd-notifications-secret -n argocd -o yaml + +# Verify the ConfigMap is applied +kubectl get configmap argocd-notifications-cm -n argocd -o yaml +``` + +### Issues not being created: + +1. Check GitHub Actions workflow runs in your repository +2. Verify the repository_dispatch event type matches: `argocd-sync-failed` +3. Check workflow logs for errors +4. Verify token permissions include "Actions: Read and write" and "Issues: Read and write" + +## Webhook Secret + +**Important:** Do not commit the webhook secret to source control. Generate your own value, for example: +``` +openssl rand -base64 32 +``` + +This must be stored in: +- ✅ Kubernetes: `argocd-notifications-secret` (already done) +- ⚠️ GitHub: Repository secrets as `ARGOCD_WEBHOOK_SECRET` (you need to do this) + +## Next Steps + +After completing the setup: + +1. Test with a known-good application first +2. Gradually enable on critical applications +3. Monitor the issue tracker for patterns +4. Customize the issue template as needed +5. 
Consider adding auto-close logic when apps recover diff --git a/Act-3/argocd/argocd-notifications-config.yaml b/Act-3/argocd/argocd-notifications-config.yaml new file mode 100644 index 0000000..44f9c84 --- /dev/null +++ b/Act-3/argocd/argocd-notifications-config.yaml @@ -0,0 +1,52 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: argocd-notifications-cm + namespace: argocd +data: + # GitHub webhook service configuration + service.webhook.github-webhook: | + url: https://api.github.com/repos/DevExpGbb/agentic-platform-engineering/dispatches + headers: + - name: Accept + value: application/vnd.github+json + - name: Authorization + value: Bearer $github-token + - name: X-GitHub-Api-Version + value: "2022-11-28" + - name: Content-Type + value: application/json + + # Trigger on sync failures + trigger.on-sync-failed: | + - when: app.status.operationState.phase in ['Error', 'Failed'] + send: [sync-failed-webhook] + - when: app.status.conditions != nil && app.status.conditions[0].type == 'ComparisonError' + send: [sync-failed-webhook] + + # Trigger on degraded health + trigger.on-health-degraded: | + - when: app.status.health.status == 'Degraded' + send: [sync-failed-webhook] + + # Template for the webhook payload + template.sync-failed-webhook: | + webhook: + github-webhook: + method: POST + body: | + { + "event_type": "argocd-sync-failed", + "client_payload": { + "app_name": "{{.app.metadata.name}}", + "app_namespace": "{{.app.metadata.namespace}}", + "sync_status": "{{.app.status.sync.status}}", + "health_status": "{{.app.status.health.status}}", + "operation_phase": "{{.app.status.operationState.phase}}", + "message": "{{.app.status.operationState.message}}", + "revision": "{{.app.status.sync.revision}}", + "repo_url": "{{.app.spec.source.repoURL}}", + "target_revision": "{{.app.spec.source.targetRevision}}", + "timestamp": "{{.app.status.operationState.finishedAt}}" + } + }