From ed1312cec36bac1c5d4e18f05cf50a37e0ee7c87 Mon Sep 17 00:00:00 2001 From: Martin Jackson Date: Fri, 13 Feb 2026 09:49:19 -0600 Subject: [PATCH] Test name change Revert "Test name change" This reverts commit b451a92ed76bdfcf7e98541f5fa0c0852f6e8a26. Parameterize names Test name parameterization Unify naming strategy Factor out cluster names Upgrade to ACM 2.14 Standardize on RDR structure Preserve the whole structure to prevent render failures Parameterize primary/secondary cluster for edge-gitops-vms Fixups for gitops-vms deployments Update Changes.md Commit cursor swing at simplification Parameterize preferred cluster Parameterize other DRPC aspects Consolidate DRPC parameters Fix template render error Fix another template render failure Sanitize install_config Fixup template again Update method to retrieve CAs Update method to retrieve CAs Update method for editing CAs Update logging and be more careful in python Try this approach - watch S3ProfileNames though Use cluster names Refactor validation to ensure there are just two profiles as long as the CA material is present Rebase with main in mind Try a different approach to validation Change grep technique Fix validation Ensure cluster labels are set Change label to clusterset for ACM membership Ensure clusterGroup is set Enhance merge logic for install config Enhance merge logic further Piece out overrides further to ensure machine types and regions are properly handled Restore default cluster naming Continue refining override logic Correct spelling for secondary Remove RegionalDR from values-hub and move sharedValueFiles to extraValueFiles for apps that matter Add additional validation scripts Create script and doc for updating the JSON files Fix lint error in markdown Update cluster validation Revert some bad logic introduced for bad reasons (ODF was not deploying and I thought the script was wrong) Simplify certificate extraction to only use yq Avoid overly large shell command Reformat README to silence 
super-linter Specify argocd server to avoid error Do not exit successfully if both clusters are not deploying Force sync via gitops-vms namespace Rationalize cron and non-cron argo monitors, force sync a namespace Correct namespace Update changelog --- Changes.md | 7 + README.md | 2 +- .../opp/scripts/argocd-health-monitor-cron.sh | 91 ++-- .../hub/opp/scripts/argocd-health-monitor.sh | 286 ++---------- .../scripts/odf-ssl-certificate-extraction.sh | 439 +++++++----------- charts/hub/opp/scripts/odf-ssl-precheck.sh | 28 +- charts/hub/opp/templates/_helpers.tpl | 13 + .../cronjob-argocd-health-monitor.yaml | 13 + .../templates/job-argocd-health-monitor.yaml | 13 + .../job-odf-ssl-certificate-extraction.yaml | 4 + .../job-odf-ssl-certificate-precheck.yaml | 5 + charts/hub/opp/values.yaml | 21 +- charts/hub/rdr/README.md | 61 +++ .../files/default-primary-install-config.json | 55 +++ .../default-secondary-install-config.json | 55 +++ .../hub/rdr/scripts/edge-gitops-vms-deploy.sh | 95 ++-- .../rdr/scripts/odf-dr-prerequisites-check.sh | 32 +- .../scripts/submariner-prerequisites-check.sh | 6 +- charts/hub/rdr/templates/_helpers.tpl | 150 ++++++ .../rdr/templates/cluster_deployments.yaml | 24 +- charts/hub/rdr/templates/dr_policy.yaml | 4 +- charts/hub/rdr/templates/drpc.yaml | 29 +- .../templates/job-drcluster-validation.yaml | 20 +- .../templates/job-edge-gitops-vms-deploy.yaml | 6 + .../templates/job-odf-dr-prerequisites.yaml | 5 + .../job-submariner-prerequisites.yaml | 5 + .../hub/rdr/templates/mirrorpeer_create.yaml | 4 +- .../templates/submariner_addon_install.yaml | 16 +- charts/hub/rdr/values.yaml | 47 ++ overrides/values-cluster-names.yaml | 29 ++ overrides/values-minimal-regional-dr.yaml | 10 + scripts/cleanup-gitops-vms-non-primary.sh | 2 +- scripts/download-kubeconfigs.sh | 4 +- scripts/extract-cluster-cas.sh | 13 +- scripts/test-rdr-install-config.sh | 239 ++++++++++ scripts/update-ca-bundle.sh | 2 +- .../update-rdr-default-install-config-json.sh | 136 
++++++ values-hub.yaml | 7 +- 38 files changed, 1282 insertions(+), 696 deletions(-) create mode 100644 charts/hub/opp/templates/_helpers.tpl create mode 100644 charts/hub/rdr/README.md create mode 100644 charts/hub/rdr/files/default-primary-install-config.json create mode 100644 charts/hub/rdr/files/default-secondary-install-config.json create mode 100644 charts/hub/rdr/templates/_helpers.tpl create mode 100644 overrides/values-cluster-names.yaml create mode 100644 overrides/values-minimal-regional-dr.yaml create mode 100755 scripts/test-rdr-install-config.sh create mode 100755 scripts/update-rdr-default-install-config-json.sh diff --git a/Changes.md b/Changes.md index cc629ec..8c2f5f5 100644 --- a/Changes.md +++ b/Changes.md @@ -4,3 +4,10 @@ v1.0 - November 2025 * Arrange to default baseDomain settings appropriately so that forking the pattern is not a hard requirement * Initial release + +v1.0 - February 2026 + +* The names ocp-primary and ocp-secondary were hardcoded in various places, which caused issues when trying +to install two copies of this pattern into the same DNS domain. +* Also parameterize the version of edge-gitops-vms chart in case it needs to get updated. It too was hardcoded. +* Update to ACM 2.14 in prep for OCP 4.20+ testing. 
diff --git a/README.md b/README.md index 0242fc0..758eec0 100644 --- a/README.md +++ b/README.md @@ -6,5 +6,5 @@ ## Start Here If you've followed a link to this repository, but are not really sure what it contains -or how to use it, head over to [Ansible Edge GitOps](https://validatedpatterns.io/patterns/ansible-edge-gitops/) +or how to use it, head over to [RamenDR Starter Kit](https://validatedpatterns.io/patterns/ramendr-starter-kit/) for additional context and installation instructions diff --git a/charts/hub/opp/scripts/argocd-health-monitor-cron.sh b/charts/hub/opp/scripts/argocd-health-monitor-cron.sh index 7ba0ef8..a069570 100755 --- a/charts/hub/opp/scripts/argocd-health-monitor-cron.sh +++ b/charts/hub/opp/scripts/argocd-health-monitor-cron.sh @@ -1,12 +1,26 @@ #!/bin/bash +# ArgoCD health monitor - CronJob (runs every 15 min). +# Why two scripts? The Job (argocd-health-monitor.sh) runs once at deploy, retries until both clusters are +# healthy then exits. This CronJob runs periodically to detect and remediate wedged clusters after deploy. +# Both use the same remediation: force-sync Namespace ramendr-starter-kit-resilient in Application ramendr-starter-kit-resilient. set -euo pipefail echo "Starting ArgoCD health monitoring and remediation..." 
+# Primary and secondary managed cluster names (from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Configuration MAX_ATTEMPTS=270 # Check 270 times (90 minutes with 20s intervals) before failing SLEEP_INTERVAL=20 ARGOCD_NAMESPACE="openshift-gitops" +# Namespace where the Application to force-sync lives (parameterized; default openshift-gitops) +FORCE_SYNC_APP_NAMESPACE="${FORCE_SYNC_APP_NAMESPACE:-openshift-gitops}" +# Application and specific resource to force-sync when remediating (Namespace ramendr-starter-kit-resilient in Application ramendr-starter-kit-resilient) +FORCE_SYNC_APP_NAME="${FORCE_SYNC_APP_NAME:-ramendr-starter-kit-resilient}" +FORCE_SYNC_RESOURCE_KIND="${FORCE_SYNC_RESOURCE_KIND:-Namespace}" +FORCE_SYNC_RESOURCE_NAME="${FORCE_SYNC_RESOURCE_NAME:-ramendr-starter-kit-resilient}" HEALTH_CHECK_TIMEOUT=30 # Function to check if a cluster is wedged @@ -26,11 +40,11 @@ check_cluster_wedged() { local cluster_argocd_namespace="" local cluster_argocd_instance="" case "$cluster" in - "ocp-primary") + "$PRIMARY_CLUSTER") cluster_argocd_namespace="ramendr-starter-kit-resilient" cluster_argocd_instance="resilient-gitops-server" ;; - "ocp-secondary") + "$SECONDARY_CLUSTER") cluster_argocd_namespace="ramendr-starter-kit-resilient" cluster_argocd_instance="resilient-gitops-server" ;; @@ -96,81 +110,40 @@ check_cluster_wedged() { fi } -# Function to remediate a wedged cluster +# Function to remediate a wedged cluster (force sync a known resource instead of restarting Argo CD) remediate_wedged_cluster() { local cluster="$1" local kubeconfig="$2" - echo "πŸ”§ Remediating wedged cluster: $cluster" - - # Stop all ArgoCD instances by scaling down deployments - echo " Stopping all ArgoCD instances on $cluster..." 
- oc --kubeconfig="$kubeconfig" scale deployment --all -n "$ARGOCD_NAMESPACE" --replicas=0 &>/dev/null || true - oc --kubeconfig="$kubeconfig" scale statefulset --all -n "$ARGOCD_NAMESPACE" --replicas=0 &>/dev/null || true - - # If scaling doesn't work, try more aggressive cleanup - echo " Attempting aggressive cleanup for stuck deployments..." - oc --kubeconfig="$kubeconfig" delete deployment --all -n "$ARGOCD_NAMESPACE" --grace-period=0 --force &>/dev/null || true - oc --kubeconfig="$kubeconfig" delete statefulset --all -n "$ARGOCD_NAMESPACE" --grace-period=0 --force &>/dev/null || true - oc --kubeconfig="$kubeconfig" delete pods --all -n "$ARGOCD_NAMESPACE" --grace-period=0 --force &>/dev/null || true - - # Wait for all instances to stop - echo " Waiting for ArgoCD instances to stop..." - local attempt=1 - while [[ $attempt -le 30 ]]; do - local running_pods=$(oc --kubeconfig="$kubeconfig" get pods -n "$ARGOCD_NAMESPACE" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) - if [[ $running_pods -eq 0 ]]; then - echo " βœ… All ArgoCD instances stopped on $cluster" - break - fi - echo " Waiting for instances to stop... (attempt $attempt/30)" - sleep 5 - ((attempt++)) - done + echo "πŸ”§ Remediating wedged cluster: $cluster (forcibly resyncing resource $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME in Application $FORCE_SYNC_APP_NAME)" - # Restart all ArgoCD instances by scaling up deployments - echo " Restarting all ArgoCD instances on $cluster..." - oc --kubeconfig="$kubeconfig" scale deployment --all -n "$ARGOCD_NAMESPACE" --replicas=1 &>/dev/null || true - oc --kubeconfig="$kubeconfig" scale statefulset --all -n "$ARGOCD_NAMESPACE" --replicas=1 &>/dev/null || true - - # Wait for pods to restart - echo " Waiting for ArgoCD pods to restart on $cluster..." 
- local attempt=1 - while [[ $attempt -le 20 ]]; do - local running_pods=$(oc --kubeconfig="$kubeconfig" get pods -n "$ARGOCD_NAMESPACE" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) - local total_pods=$(oc --kubeconfig="$kubeconfig" get pods -n "$ARGOCD_NAMESPACE" --no-headers 2>/dev/null | wc -l) - - if [[ $running_pods -gt 0 && $running_pods -eq $total_pods ]]; then - echo " βœ… ArgoCD pods restarted successfully on $cluster" - break - fi - - echo " Waiting for pods to restart... (attempt $attempt/20)" - sleep 10 - ((attempt++)) - done - - if [[ $attempt -gt 20 ]]; then - echo " ⚠️ ArgoCD pods may not have fully restarted on $cluster" + # Forcibly resync the specific resource (e.g. Namespace ramendr-starter-kit-resilient) in the Application (no Argo CD restart) + if oc --kubeconfig="$kubeconfig" get application "$FORCE_SYNC_APP_NAME" -n "$FORCE_SYNC_APP_NAMESPACE" &>/dev/null; then + echo " Force syncing resource $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME in Application $FORCE_SYNC_APP_NAME (namespace $FORCE_SYNC_APP_NAMESPACE) on $cluster..." + oc --kubeconfig="$kubeconfig" patch application "$FORCE_SYNC_APP_NAME" -n "$FORCE_SYNC_APP_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"$FORCE_SYNC_RESOURCE_KIND\",\"name\":\"$FORCE_SYNC_RESOURCE_NAME\"}],\"syncOptions\":[\"Force=true\"]}}}" &>/dev/null || true + echo " βœ… Triggered force sync for $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME" + else + echo " ⚠️ Application $FORCE_SYNC_APP_NAME not found in $FORCE_SYNC_APP_NAMESPACE on $cluster - cannot force sync" fi - # Trigger ArgoCD refresh/sync + # Trigger ArgoCD refresh/sync (argocd CLI needs --server when run inside the pod) echo " Triggering ArgoCD refresh on $cluster..." 
local server_pod=$(oc --kubeconfig="$kubeconfig" get pods -n "$ARGOCD_NAMESPACE" -l app.kubernetes.io/name=openshift-gitops-server --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + local argocd_server="localhost:8080" if [[ -n "$server_pod" ]]; then # Trigger refresh of all applications - oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app list -o name | while read app; do + oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app list --server "$argocd_server" -o name 2>/dev/null | while read app; do if [[ -n "$app" ]]; then echo " Refreshing $app..." - oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app get "$app" --refresh &>/dev/null || true + oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app get "$app" --server "$argocd_server" --refresh &>/dev/null || true fi done - + # Trigger hard refresh - oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app list -o name | while read app; do + oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app list --server "$argocd_server" -o name 2>/dev/null | while read app; do if [[ -n "$app" ]]; then echo " Hard refreshing $app..." - oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app get "$app" --hard-refresh &>/dev/null || true + oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app get "$app" --server "$argocd_server" --hard-refresh &>/dev/null || true fi done fi diff --git a/charts/hub/opp/scripts/argocd-health-monitor.sh b/charts/hub/opp/scripts/argocd-health-monitor.sh index 9c6b9bf..4f3ea48 100755 --- a/charts/hub/opp/scripts/argocd-health-monitor.sh +++ b/charts/hub/opp/scripts/argocd-health-monitor.sh @@ -1,12 +1,26 @@ #!/bin/bash +# ArgoCD health monitor - Job (one-shot, long-running). +# Why two scripts? 
This Job runs once at deploy time (sync-wave 0), retries for up to ~90 min until both +# primary and secondary Argo CD instances are healthy, then exits. The CronJob (argocd-health-monitor-cron.sh) +# runs every 15 min to catch wedged clusters after deploy. Both use the same remediation: force-sync the +# specific resource (Namespace ramendr-starter-kit-resilient) in Application ramendr-starter-kit-resilient. set -euo pipefail echo "Starting ArgoCD health monitoring and remediation..." +# Primary and secondary managed cluster names (from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Configuration MAX_ATTEMPTS=180 # Check 180 times (90 minutes with 30s intervals) before failing SLEEP_INTERVAL=30 ARGOCD_NAMESPACE="openshift-gitops" +# Same as cron: force-sync this specific resource in this Application when remediating (parameterized) +FORCE_SYNC_APP_NAMESPACE="${FORCE_SYNC_APP_NAMESPACE:-openshift-gitops}" +FORCE_SYNC_APP_NAME="${FORCE_SYNC_APP_NAME:-ramendr-starter-kit-resilient}" +FORCE_SYNC_RESOURCE_KIND="${FORCE_SYNC_RESOURCE_KIND:-Namespace}" +FORCE_SYNC_RESOURCE_NAME="${FORCE_SYNC_RESOURCE_NAME:-ramendr-starter-kit-resilient}" HEALTH_CHECK_TIMEOUT=60 # Function to check if a cluster is wedged @@ -26,11 +40,11 @@ check_cluster_wedged() { local cluster_argocd_namespace="" local cluster_argocd_instance="" case "$cluster" in - "ocp-primary") + "$PRIMARY_CLUSTER") cluster_argocd_namespace="ramendr-starter-kit-resilient" cluster_argocd_instance="resilient-gitops-server" ;; - "ocp-secondary") + "$SECONDARY_CLUSTER") cluster_argocd_namespace="ramendr-starter-kit-resilient" cluster_argocd_instance="resilient-gitops-server" ;; @@ -61,7 +75,12 @@ check_cluster_wedged() { return 0 fi fi - + + # For primary/secondary we require the cluster-specific Argo CD instance; missing = not healthy (job must not succeed) + if [[ "$cluster" == "$PRIMARY_CLUSTER" || "$cluster" == "$SECONDARY_CLUSTER" 
]]; then + echo "❌ Required Argo CD instance ($cluster_argocd_instance in $cluster_argocd_namespace) not found on $cluster - job will retry or fail" + return 0 + fi echo "βœ… $cluster appears healthy (no ArgoCD instances installed yet)" return 1 fi @@ -135,7 +154,12 @@ check_cluster_wedged() { fi fi fi - + + # For primary/secondary we require the Argo CD instance to be running; missing = not healthy (job must not succeed) + if [[ "$cluster" == "$PRIMARY_CLUSTER" || "$cluster" == "$SECONDARY_CLUSTER" ]]; then + echo "❌ Required Argo CD instance ($cluster_argocd_instance) not running in $cluster_argocd_namespace on $cluster - job will retry or fail" + return 0 + fi echo "βœ… $cluster appears healthy (no ArgoCD instances running yet)" return 1 elif [[ $cluster_argocd_pods -eq 1 ]]; then @@ -170,240 +194,20 @@ check_cluster_wedged() { fi } -# Function to remediate a wedged cluster using ArgoCD sync mechanisms +# Function to remediate a wedged cluster (force sync the specific resource in the specific Application, same as cron) remediate_wedged_cluster() { local cluster="$1" local kubeconfig="$2" - echo "πŸ”§ Remediating wedged cluster: $cluster using ArgoCD sync mechanisms" - echo " 🎯 Focus: Force sync namespace policies and specific resources in stuck applications" + echo "πŸ”§ Remediating wedged cluster: $cluster (forcibly resyncing resource $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME in Application $FORCE_SYNC_APP_NAME)" - # STEP 1: Find and sync namespace policies - echo " πŸ” STEP 1: Finding and syncing namespace policies..." 
- - # Get all ArgoCD applications - local applications=$(oc --kubeconfig="$kubeconfig" get applications -n "$ARGOCD_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || echo "") - - if [[ -n "$applications" ]]; then - echo " Found ArgoCD applications: $applications" - - for app in $applications; do - echo " πŸ”„ Processing application: $app" - - # Get application status - local app_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local app_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Application $app status: sync=$app_status, health=$app_health" - - # If application is out of sync or unhealthy, force sync it - if [[ "$app_status" != "Synced" || "$app_health" != "Healthy" ]]; then - echo " πŸ”„ Application $app is not in sync - forcing sync..." - - # Force sync the application - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"sync":{"syncOptions":["CreateNamespace=true","PrunePropagationPolicy=foreground","PruneLast=true"]}}}' &>/dev/null || true - - # Wait a moment for sync to start - sleep 5 - - # Check if there are specific resources that need to be synced - echo " πŸ” Checking for specific resources that need sync in $app..." 
- - # Get resources that are out of sync - local out_of_sync_resources=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.resources[?(@.status=="OutOfSync")].name}' 2>/dev/null || echo "") - - if [[ -n "$out_of_sync_resources" ]]; then - echo " πŸ“‹ Found out-of-sync resources: $out_of_sync_resources" - - # Force sync specific resources - for resource in $out_of_sync_resources; do - echo " πŸ”„ Force syncing resource: $resource" - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"$(echo $resource | cut -d'/' -f1)\",\"name\":\"$(echo $resource | cut -d'/' -f2)\"}]}}}" &>/dev/null || true - done - fi - - # Check for namespace policies specifically - echo " πŸ” Looking for namespace policies in $app..." - local namespace_policies=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.resources[?(@.kind=="Policy")].name}' 2>/dev/null || echo "") - - if [[ -n "$namespace_policies" ]]; then - echo " πŸ“‹ Found namespace policies: $namespace_policies" - - # Force sync namespace policies - for policy in $namespace_policies; do - echo " πŸ”„ Force syncing namespace policy: $policy" - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"Policy\",\"name\":\"$policy\"}]}}}" &>/dev/null || true - done - fi - else - echo " βœ… Application $app is already in sync and healthy" - fi - done + if oc --kubeconfig="$kubeconfig" get application "$FORCE_SYNC_APP_NAME" -n "$FORCE_SYNC_APP_NAMESPACE" &>/dev/null; then + echo " Force syncing resource $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME in Application $FORCE_SYNC_APP_NAME (namespace $FORCE_SYNC_APP_NAMESPACE) on $cluster..." 
+ oc --kubeconfig="$kubeconfig" patch application "$FORCE_SYNC_APP_NAME" -n "$FORCE_SYNC_APP_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"$FORCE_SYNC_RESOURCE_KIND\",\"name\":\"$FORCE_SYNC_RESOURCE_NAME\"}],\"syncOptions\":[\"Force=true\"]}}}" &>/dev/null || true + echo " βœ… Triggered force sync for $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME" else - echo " ⚠️ No ArgoCD applications found in $ARGOCD_NAMESPACE namespace" + echo " ⚠️ Application $FORCE_SYNC_APP_NAME not found in $FORCE_SYNC_APP_NAMESPACE on $cluster - cannot force sync" fi - - # STEP 2: Force refresh and hard refresh of applications - echo " πŸ”„ STEP 2: Force refreshing applications..." - - for app in $applications; do - echo " πŸ”„ Force refreshing application: $app" - - # Force refresh the application - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"initiatedBy":{"username":"argocd-health-monitor"},"info":[{"name":"refresh","value":"hard"}]}}' &>/dev/null || true - - # Wait for refresh to complete - sleep 10 - - # Check application status after refresh - local app_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local app_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Application $app status after refresh: sync=$app_status, health=$app_health" - done - - # STEP 3: Check for stuck applications and force sync them - echo " πŸ” STEP 3: Checking for stuck applications..." 
- - for app in $applications; do - local app_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local app_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - if [[ "$app_status" != "Synced" || "$app_health" != "Healthy" ]]; then - echo " πŸ”„ Application $app is still stuck - attempting final sync..." - - # Final attempt to sync the application - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"sync":{"syncOptions":["CreateNamespace=true","PrunePropagationPolicy=foreground","PruneLast=true","Force=true"]}}}' &>/dev/null || true - - # Wait for sync to complete - sleep 15 - - # Check final status - local final_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local final_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Final status for $app: sync=$final_status, health=$final_health" - fi - done - - echo " βœ… ArgoCD sync-based remediation completed for $cluster" - echo " 🎯 Remediated: Used ArgoCD sync mechanisms to force sync stuck applications and namespace policies" - echo " ⚠️ Note: This approach preserves existing resources while forcing proper synchronization" -} - -# Function to apply aggressive ArgoCD sync for wedged openshift-gitops namespace -apply_aggressive_argocd_sync() { - local cluster="$1" - local kubeconfig="$2" - - echo "πŸ”„πŸ”„πŸ”„ APPLYING AGGRESSIVE ARGOCD SYNC TO OPENSHIFT-GITOPS NAMESPACE πŸ”„πŸ”„πŸ”„" - echo " 🎯 Target: $ARGOCD_NAMESPACE namespace on $cluster" - echo " ⚠️ This will force sync all applications and namespace policies using ArgoCD mechanisms" - - # 
Check if openshift-gitops namespace exists - if ! oc --kubeconfig="$kubeconfig" get namespace "$ARGOCD_NAMESPACE" &>/dev/null; then - echo " βœ… $ARGOCD_NAMESPACE namespace does not exist - nothing to sync" - return 0 - fi - - echo " πŸ” $ARGOCD_NAMESPACE namespace exists - proceeding with aggressive ArgoCD sync" - - # STEP 1: Get all ArgoCD applications - echo " πŸ” STEP 1: Finding all ArgoCD applications..." - local applications=$(oc --kubeconfig="$kubeconfig" get applications -n "$ARGOCD_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || echo "") - - if [[ -z "$applications" ]]; then - echo " ⚠️ No ArgoCD applications found in $ARGOCD_NAMESPACE namespace" - return 0 - fi - - echo " Found ArgoCD applications: $applications" - - # STEP 2: Force sync all applications with aggressive options - echo " πŸ”„ STEP 2: Force syncing all applications with aggressive options..." - - for app in $applications; do - echo " πŸ”„ Aggressively syncing application: $app" - - # Get current application status - local app_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local app_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Application $app current status: sync=$app_status, health=$app_health" - - # Force sync with aggressive options - echo " πŸ”„ Force syncing $app with aggressive options..." - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"sync":{"syncOptions":["CreateNamespace=true","PrunePropagationPolicy=foreground","PruneLast=true","Force=true","Replace=true"]}}}' &>/dev/null || true - - # Wait for sync to start - sleep 5 - - # Check for specific resources that need aggressive sync - echo " πŸ” Checking for specific resources that need aggressive sync in $app..." 
- - # Get all resources in the application - local all_resources=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.resources[*].name}' 2>/dev/null || echo "") - - if [[ -n "$all_resources" ]]; then - echo " πŸ“‹ Found resources in $app: $all_resources" - - # Force sync each resource individually - for resource in $all_resources; do - echo " πŸ”„ Force syncing resource: $resource" - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"$(echo $resource | cut -d'/' -f1)\",\"name\":\"$(echo $resource | cut -d'/' -f2)\"}],\"syncOptions\":[\"Force=true\",\"Replace=true\"]}}}" &>/dev/null || true - done - fi - - # Check for namespace policies specifically - echo " πŸ” Looking for namespace policies in $app..." - local namespace_policies=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.resources[?(@.kind=="Policy")].name}' 2>/dev/null || echo "") - - if [[ -n "$namespace_policies" ]]; then - echo " πŸ“‹ Found namespace policies: $namespace_policies" - - # Force sync namespace policies with aggressive options - for policy in $namespace_policies; do - echo " πŸ”„ Aggressively syncing namespace policy: $policy" - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"Policy\",\"name\":\"$policy\"}],\"syncOptions\":[\"Force=true\",\"Replace=true\",\"PrunePropagationPolicy=foreground\"]}}}" &>/dev/null || true - done - fi - done - - # STEP 3: Force refresh all applications - echo " πŸ”„ STEP 3: Force refreshing all applications..." 
- - for app in $applications; do - echo " πŸ”„ Force refreshing application: $app" - - # Force hard refresh - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"initiatedBy":{"username":"argocd-health-monitor"},"info":[{"name":"refresh","value":"hard"}]}}' &>/dev/null || true - - # Wait for refresh to complete - sleep 10 - done - - # STEP 4: Final verification and sync - echo " πŸ” STEP 4: Final verification and sync..." - - for app in $applications; do - echo " πŸ” Final check for application: $app" - - # Get final status - local final_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local final_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Final status for $app: sync=$final_status, health=$final_health" - - # If still not healthy, try one more aggressive sync - if [[ "$final_status" != "Synced" || "$final_health" != "Healthy" ]]; then - echo " πŸ”„ Application $app still not healthy - attempting final aggressive sync..." 
- oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"sync":{"syncOptions":["CreateNamespace=true","PrunePropagationPolicy=foreground","PruneLast=true","Force=true","Replace=true","Prune=true"]}}}' &>/dev/null || true - fi - done - - echo " πŸ”„πŸ”„πŸ”„ AGGRESSIVE ARGOCD SYNC COMPLETED FOR $ARGOCD_NAMESPACE NAMESPACE πŸ”„πŸ”„πŸ”„" - echo " 🎯 Result: Used ArgoCD sync mechanisms to force sync all applications and namespace policies" - echo " ⚠️ Note: This approach preserves existing resources while forcing proper synchronization" } # Function to download kubeconfig for a cluster (using same logic as download-kubeconfigs.sh) @@ -618,22 +422,14 @@ while [[ $attempt -le $MAX_ATTEMPTS ]]; do fi fi - # Remediate wedged clusters + # Remediate wedged clusters (same targeted force-sync for all: Namespace in Application ramendr-starter-kit-resilient) if [[ ${#wedged_clusters[@]} -gt 0 ]]; then echo "Found wedged clusters: ${wedged_clusters[*]}" for cluster in "${wedged_clusters[@]}"; do kubeconfig_path="/tmp/${cluster}-kubeconfig.yaml" - - # Apply aggressive ArgoCD sync specifically for ocp-secondary if it's wedged - if [[ "$cluster" == "ocp-secondary" ]]; then - echo "πŸ”„πŸ”„πŸ”„ APPLYING AGGRESSIVE ARGOCD SYNC TO WEDGED OCP-SECONDARY πŸ”„πŸ”„πŸ”„" - echo " 🎯 Target: Force sync all applications and namespace policies on ocp-secondary" - apply_aggressive_argocd_sync "$cluster" "$kubeconfig_path" - else - echo "πŸ”§ Applying standard ArgoCD sync remediation to wedged cluster: $cluster" - remediate_wedged_cluster "$cluster" "$kubeconfig_path" - fi + echo "πŸ”§ Applying remediation to wedged cluster: $cluster" + remediate_wedged_cluster "$cluster" "$kubeconfig_path" done echo "βœ… Remediation completed for wedged clusters" @@ -647,5 +443,7 @@ while [[ $attempt -le $MAX_ATTEMPTS ]]; do fi done -echo "πŸŽ‰ ArgoCD health monitoring completed" - +# Exited loop by exhausting attempts (did not exit 0 from "all healthy") +echo 
"❌ ArgoCD health monitoring did not complete successfully within $MAX_ATTEMPTS attempts" +echo " One or both required Argo CD instances (on $PRIMARY_CLUSTER and $SECONDARY_CLUSTER) were not running correctly." +exit 1 diff --git a/charts/hub/opp/scripts/odf-ssl-certificate-extraction.sh b/charts/hub/opp/scripts/odf-ssl-certificate-extraction.sh index 7225d0e..d43ea93 100755 --- a/charts/hub/opp/scripts/odf-ssl-certificate-extraction.sh +++ b/charts/hub/opp/scripts/odf-ssl-certificate-extraction.sh @@ -212,9 +212,13 @@ else echo " Found managed clusters: $MANAGED_CLUSTERS" fi +# Primary and secondary managed cluster names (from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Extract CA from each managed cluster CA_FILES=() -REQUIRED_CLUSTERS=("hub" "ocp-primary" "ocp-secondary") +REQUIRED_CLUSTERS=("hub" "$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") EXTRACTED_CLUSTERS=() # Track hub cluster CA extraction @@ -291,8 +295,8 @@ if [[ ${#MISSING_CLUSTERS[@]} -gt 0 ]]; then echo "" echo "The ODF SSL certificate extractor job requires CA material from ALL three clusters:" echo " - hub (hub cluster)" - echo " - ocp-primary (primary managed cluster)" - echo " - ocp-secondary (secondary managed cluster)" + echo " - $PRIMARY_CLUSTER (primary managed cluster)" + echo " - $SECONDARY_CLUSTER (secondary managed cluster)" echo "" echo "Without CA material from all clusters, the DR setup will fail." echo "Please ensure all clusters are accessible and have proper kubeconfigs." 
@@ -450,14 +454,17 @@ if oc get configmap ramen-hub-operator-config -n openshift-operators &>/dev/null # Get existing ramen_manager_config.yaml content EXISTING_YAML=$(oc get configmap ramen-hub-operator-config -n openshift-operators -o jsonpath='{.data.ramen_manager_config\.yaml}' 2>/dev/null || echo "") - # CRITICAL: Verify at least 2 S3profiles exist before attempting update + # Patch existing s3StoreProfiles only: add/update caCertificates on each existing profile. + # We do NOT create new profiles or delete/overwrite profile names. At least 2 existing profiles required. MIN_REQUIRED_PROFILES=2 if [[ -n "$EXISTING_YAML" ]]; then - # Use yq to properly parse YAML and count profiles if command -v yq &>/dev/null; then - EXISTING_PROFILE_COUNT=$(echo "$EXISTING_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") + COUNT_KOP=$(echo "$EXISTING_YAML" | yq eval '.kubeObjectProtection.s3StoreProfiles | length' 2>/dev/null || echo "0") + COUNT_TOP=$(echo "$EXISTING_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") + COUNT_KOP=$((10#${COUNT_KOP:-0})) + COUNT_TOP=$((10#${COUNT_TOP:-0})) + EXISTING_PROFILE_COUNT=$(( COUNT_KOP >= COUNT_TOP ? COUNT_KOP : COUNT_TOP )) else - # Fallback to grep if yq is not available EXISTING_PROFILE_COUNT=$(echo "$EXISTING_YAML" | grep -c "s3ProfileName:" 2>/dev/null || echo "0") if [[ $EXISTING_PROFILE_COUNT -eq 0 ]]; then EXISTING_PROFILE_COUNT=$(echo "$EXISTING_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") @@ -472,252 +479,106 @@ if oc get configmap ramen-hub-operator-config -n openshift-operators &>/dev/null echo " Current YAML content (first 50 lines):" echo "$EXISTING_YAML" | head -n 50 echo "" - echo " The certificate extractor requires at least $MIN_REQUIRED_PROFILES S3profiles to add CA certificates." - echo " Please ensure the ramen-hub-operator-config ConfigMap has at least $MIN_REQUIRED_PROFILES s3StoreProfiles configured." 
+ echo " The certificate extractor only patches existing s3StoreProfiles with caCertificates." + echo " Please ensure ramen-hub-operator-config has at least $MIN_REQUIRED_PROFILES s3StoreProfiles configured." handle_error "Insufficient s3StoreProfiles found: found $EXISTING_PROFILE_COUNT profile(s), but at least $MIN_REQUIRED_PROFILES are required" else - echo " βœ… Found $EXISTING_PROFILE_COUNT s3StoreProfiles (minimum required: $MIN_REQUIRED_PROFILES)" + echo " βœ… Found $EXISTING_PROFILE_COUNT s3StoreProfiles (will patch caCertificates into existing profiles only)" fi fi - - # Create updated YAML with caCertificates in each s3StoreProfiles item + + # Patch existing profiles with caCertificates using yq only (env var avoids embedding base64 in expression) + PATCHED_VIA_YQ=false if [[ -n "$EXISTING_YAML" ]]; then - # Create a temporary YAML file with the update echo "$EXISTING_YAML" > "$WORK_DIR/existing-ramen-config.yaml" - echo " Existing YAML content (first 20 lines):" echo "$EXISTING_YAML" | head -n 20 - - # Try to install PyYAML first, or use alternative methods - echo " Attempting to update s3StoreProfiles with caCertificates..." - - # Method 1: Try Python with PyYAML first (most reliable) - PYTHON_SUCCESS=false - if python3 -c "import yaml" 2>/dev/null || python3 -m pip install --user PyYAML 2>&1 | grep -q "Successfully installed\|Requirement already satisfied"; then - echo " Using Python with PyYAML to update s3StoreProfiles..." - export CA_BUNDLE_BASE64 - if python3 -c " -import yaml -import sys -import os + echo " Patching s3StoreProfiles with caCertificates using yq..." -ca_bundle = os.environ.get('CA_BUNDLE_BASE64', '') + if ! command -v yq &>/dev/null; then + echo " ❌ yq is required but not found in PATH" + handle_error "yq is required to patch ramen_manager_config with caCertificates; please install yq (e.g. 
mikefarah/yq)" + fi -try: - with open('$WORK_DIR/existing-ramen-config.yaml', 'r') as f: - config = yaml.safe_load(f) or {} - - if config is None: - config = {} - - if 's3StoreProfiles' not in config: - config['s3StoreProfiles'] = [] - - updated_count = 0 - for profile in config.get('s3StoreProfiles', []): - if isinstance(profile, dict): - profile['caCertificates'] = ca_bundle - updated_count += 1 - - print(f'Updated {updated_count} s3StoreProfiles with caCertificates', file=sys.stderr) - - with open('$WORK_DIR/existing-ramen-config.yaml', 'w') as f: - yaml.dump(config, f, default_flow_style=False, sort_keys=False, allow_unicode=True) - - print('SUCCESS', file=sys.stderr) - sys.exit(0) -except Exception as e: - print(f'ERROR: {e}', file=sys.stderr) - import traceback - traceback.print_exc(file=sys.stderr) - sys.exit(1) -" 2>&1; then - echo " βœ… Successfully updated s3StoreProfiles with caCertificates using Python" - PYTHON_SUCCESS=true - else - echo " ⚠️ Python update failed, trying yq..." - fi + export CA_BUNDLE_BASE64 + YQ_PATCHED=false + # Use strenv() so the base64 value is passed as a string without embedding in the expression (avoids quoting/special-char issues) + if yq eval -i '.s3StoreProfiles[]? |= . + {"caCertificates": strenv(CA_BUNDLE_BASE64)}' "$WORK_DIR/existing-ramen-config.yaml" 2>/dev/null; then + YQ_PATCHED=true fi - - # Method 2: Try yq if Python failed - if [[ "$PYTHON_SUCCESS" != "true" ]] && command -v yq &>/dev/null; then - echo " Using yq to update s3StoreProfiles..." - # Use yq to update each profile individually - if yq eval '(.s3StoreProfiles[] | select(has("name"))) |= . + {"caCertificates": "'"$CA_BUNDLE_BASE64"'"}' -i "$WORK_DIR/existing-ramen-config.yaml" 2>&1; then - echo " βœ… Successfully updated s3StoreProfiles with caCertificates using yq" - PYTHON_SUCCESS=true - else - echo " ⚠️ yq failed, trying awk-based approach..." - PYTHON_SUCCESS=false - fi + if yq eval -i '.kubeObjectProtection.s3StoreProfiles[]? |= . 
+ {"caCertificates": strenv(CA_BUNDLE_BASE64)}' "$WORK_DIR/existing-ramen-config.yaml" 2>/dev/null; then + YQ_PATCHED=true fi - - # Method 3: Fallback to awk/sed if both Python and yq failed - if [[ "$PYTHON_SUCCESS" != "true" ]]; then - echo " Using awk-based approach as fallback..." - { - # Use awk to update or add caCertificates to each s3StoreProfiles item - awk -v ca_bundle="$CA_BUNDLE_BASE64" ' - BEGIN { in_profile=0; ca_added=0 } - /^s3StoreProfiles:/ { - print - next - } - /^ - name:/ { - in_profile=1 - ca_added=0 - print - next - } - in_profile && /^ caCertificates:/ { - print " caCertificates: \"" ca_bundle "\"" - ca_added=1 - in_profile=0 - next - } - in_profile && /^ [a-zA-Z]/ && !/^ caCertificates:/ { - if (!ca_added) { - print " caCertificates: \"" ca_bundle "\"" - ca_added=1 - } - print - next - } - in_profile && /^ -/ { - if (!ca_added) { - print " caCertificates: \"" ca_bundle "\"" - ca_added=1 - } - in_profile=0 - print - next - } - in_profile && /^$/ { - if (!ca_added) { - print " caCertificates: \"" ca_bundle "\"" - ca_added=1 - } - in_profile=0 - print - next - } - { print } - ' "$WORK_DIR/existing-ramen-config.yaml" > "$WORK_DIR/existing-ramen-config.yaml.tmp" && \ - mv "$WORK_DIR/existing-ramen-config.yaml.tmp" "$WORK_DIR/existing-ramen-config.yaml" && \ - echo " βœ… Updated s3StoreProfiles using awk" || { - echo " ❌ awk-based approach failed" - PYTHON_SUCCESS=false - } - } + if [[ "$YQ_PATCHED" != "true" ]]; then + echo " ❌ yq failed to patch s3StoreProfiles (no top-level or kubeObjectProtection.s3StoreProfiles found?)" + echo " yq version: $(yq --version 2>/dev/null || true)" + handle_error "yq could not update s3StoreProfiles with caCertificates" fi - - # Clean up temporary files + echo " βœ… Patched existing s3StoreProfiles with caCertificates using yq" + rm -f "$WORK_DIR/existing-ramen-config.yaml.bak" "$WORK_DIR/existing-ramen-config.yaml.tmp" - - # Verify the update + + # Verify patch (grep in file; do NOT load full content into shell 
variable - base64 can exceed ARG_MAX and truncate) if [[ -f "$WORK_DIR/existing-ramen-config.yaml" ]]; then - UPDATED_YAML=$(cat "$WORK_DIR/existing-ramen-config.yaml") - echo " Updated YAML content (first 20 lines):" - echo "$UPDATED_YAML" | head -n 20 - - # Verify caCertificates was added - if echo "$UPDATED_YAML" | grep -q "caCertificates"; then - echo " βœ… Verified: caCertificates found in updated YAML" - else - echo " ⚠️ Warning: caCertificates not found in updated YAML" + if ! grep -q "caCertificates" "$WORK_DIR/existing-ramen-config.yaml" 2>/dev/null; then + echo " ❌ No caCertificates in updated YAML (patch failed or no s3StoreProfiles to patch)" + handle_error "Failed to patch ramen_manager_config with caCertificates - update produced no caCertificates" fi + echo " Updated YAML content (first 20 lines):" + head -n 20 "$WORK_DIR/existing-ramen-config.yaml" + echo " βœ… Verified: caCertificates found in updated YAML" + # Copy file directly; do NOT use a shell variable (large base64 would truncate and break the applied ConfigMap) + cp "$WORK_DIR/existing-ramen-config.yaml" "$WORK_DIR/ramen_manager_config.yaml" + PATCHED_VIA_YQ=true else echo " ❌ Error: Updated YAML file not found" - UPDATED_YAML="$EXISTING_YAML" + PATCHED_VIA_YQ=false fi - - rm -f "$WORK_DIR/update_ramen_config.py" else - # No existing YAML, create new one with s3StoreProfiles containing caCertificates - UPDATED_YAML="s3StoreProfiles: - - name: default + # No existing YAML (ConfigMap exists but ramen_manager_config.yaml empty): create minimal config with 2 profiles (parameterized by cluster name) + UPDATED_YAML="kubeObjectProtection: + s3StoreProfiles: + - s3ProfileName: $PRIMARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" + - s3ProfileName: $SECONDARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" +s3StoreProfiles: + - s3ProfileName: $PRIMARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" + - s3ProfileName: $SECONDARY_CLUSTER caCertificates: \"$CA_BUNDLE_BASE64\"" fi - - # Save updated 
YAML to a file for use with oc set data - echo "$UPDATED_YAML" > "$WORK_DIR/ramen_manager_config.yaml" + + # Save updated YAML for apply (only write from variable when we did not already copy the patched file) + if [[ "$PATCHED_VIA_YQ" != "true" ]]; then + echo "$UPDATED_YAML" > "$WORK_DIR/ramen_manager_config.yaml" + fi echo " Preparing to update ConfigMap with YAML content..." echo " YAML file size: $(wc -c < "$WORK_DIR/ramen_manager_config.yaml") bytes" echo " YAML file preview (first 10 lines):" head -n 10 "$WORK_DIR/ramen_manager_config.yaml" - # Update the ConfigMap using oc create with --dry-run=client and oc apply - # This is more reliable than oc set data for multiline content + # Build ConfigMap manifest: use literal-block method first (reliable, no yq/Python dependency) echo " Creating ConfigMap manifest with updated data..." oc get configmap ramen-hub-operator-config -n openshift-operators -o yaml > "$WORK_DIR/ramen-configmap-template.yaml" 2>/dev/null if [[ -f "$WORK_DIR/ramen-configmap-template.yaml" ]]; then - # Update the data section using yq or python - if command -v yq &>/dev/null; then - yq eval ".data.\"ramen_manager_config.yaml\" = load(\"$WORK_DIR/ramen_manager_config.yaml\") | .data.\"ramen_manager_config.yaml\" style=\"literal\"" -i "$WORK_DIR/ramen-configmap-template.yaml" 2>/dev/null || { - # Fallback: use python to update - python3 -c " -import yaml -import sys - -# Read the ConfigMap -with open('$WORK_DIR/ramen-configmap-template.yaml', 'r') as f: - cm = yaml.safe_load(f) - -# Read the updated YAML content -with open('$WORK_DIR/ramen_manager_config.yaml', 'r') as f: - updated_yaml = f.read() - -# Update the data section -if 'data' not in cm: - cm['data'] = {} - -cm['data']['ramen_manager_config.yaml'] = updated_yaml - -# Keep metadata but remove fields that Kubernetes manages (oc apply will update these) -if 'metadata' in cm: - # Remove only the fields that Kubernetes manages and shouldn't be in the apply - 
cm['metadata'].pop('resourceVersion', None) - cm['metadata'].pop('managedFields', None) - -# Write back -with open('$WORK_DIR/ramen-configmap-updated.yaml', 'w') as f: - yaml.dump(cm, f, default_flow_style=False, sort_keys=False, allow_unicode=True) -" 2>/dev/null - } - else - # Use python to update - python3 -c " -import yaml -import sys - -# Read the ConfigMap -with open('$WORK_DIR/ramen-configmap-template.yaml', 'r') as f: - cm = yaml.safe_load(f) - -# Read the updated YAML content -with open('$WORK_DIR/ramen_manager_config.yaml', 'r') as f: - updated_yaml = f.read() + # Always use the canonical name so we update the expected ConfigMap and verification finds it + METADATA_NAMESPACE=openshift-operators + METADATA_NAME=ramen-hub-operator-config + echo " Building ConfigMap manifest (literal block for ramen_manager_config.yaml)..." + { + echo "apiVersion: v1" + echo "kind: ConfigMap" + echo "metadata:" + echo " name: $METADATA_NAME" + echo " namespace: $METADATA_NAMESPACE" + echo "data:" + echo " ramen_manager_config.yaml: |" + sed 's/^/ /' "$WORK_DIR/ramen_manager_config.yaml" + } > "$WORK_DIR/ramen-configmap-updated.yaml" -# Update the data section -if 'data' not in cm: - cm['data'] = {} - -cm['data']['ramen_manager_config.yaml'] = updated_yaml - -# Keep metadata but remove fields that Kubernetes manages (oc apply will update these) -if 'metadata' in cm: - # Remove only the fields that Kubernetes manages and shouldn't be in the apply - cm['metadata'].pop('resourceVersion', None) - cm['metadata'].pop('managedFields', None) - -# Write back -with open('$WORK_DIR/ramen-configmap-updated.yaml', 'w') as f: - yaml.dump(cm, f, default_flow_style=False, sort_keys=False, allow_unicode=True) -" 2>/dev/null - fi - if [[ -f "$WORK_DIR/ramen-configmap-updated.yaml" ]]; then echo " Applying updated ConfigMap..." 
UPDATE_OUTPUT=$(oc apply -f "$WORK_DIR/ramen-configmap-updated.yaml" 2>&1) @@ -758,26 +619,35 @@ with open('$WORK_DIR/ramen-configmap-updated.yaml', 'w') as f: VERIFICATION_ERRORS+=("caCertificates not found in ConfigMap") fi - if ! echo "$VERIFIED_YAML" | grep -q "$CA_BUNDLE_BASE64"; then - VERIFICATION_PASSED=false - VERIFICATION_ERRORS+=("CA bundle base64 data not found in ConfigMap") + # Optional: exact base64 match can fail due to encoding/line wrap in stored ConfigMap + # Prefer verifying profile/caCertificates counts below; only warn if base64 substring missing + if [[ -n "$CA_BUNDLE_BASE64" ]] && [[ ${#CA_BUNDLE_BASE64} -gt 20 ]]; then + CA_PREFIX="${CA_BUNDLE_BASE64:0:80}" + if ! echo "$VERIFIED_YAML" | grep -qF "$CA_PREFIX"; then + echo " ⚠️ Note: CA bundle prefix not found in retrieved ConfigMap (encoding may differ); relying on profile/caCertificates count" + fi fi - - # Additional check: verify that each s3StoreProfiles item has caCertificates - # CRITICAL: Must find at least 2 S3profiles + + # Verify structure: s3StoreProfiles under kubeObjectProtection or at top level (match script output) MIN_REQUIRED_PROFILES=2 if echo "$VERIFIED_YAML" | grep -q "s3StoreProfiles"; then - # Use yq to properly parse YAML and count profiles if command -v yq &>/dev/null; then - PROFILE_COUNT=$(echo "$VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") - # Count profiles that have caCertificates field - CA_CERT_COUNT=$(echo "$VERIFIED_YAML" | yq eval '[.s3StoreProfiles[] | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + PK=$(echo "$VERIFIED_YAML" | yq eval '.kubeObjectProtection.s3StoreProfiles | length' 2>/dev/null || echo "0") + PT=$(echo "$VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") + CK=$(echo "$VERIFIED_YAML" | yq eval '[.kubeObjectProtection.s3StoreProfiles[]? 
| select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + CT=$(echo "$VERIFIED_YAML" | yq eval '[.s3StoreProfiles[]? | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + # Normalize: yq can return "null" or empty; treat as 0 + PK=$((10#${PK:-0})); PT=$((10#${PT:-0})); CK=$((10#${CK:-0})); CT=$((10#${CT:-0})) + PROFILE_COUNT=$(( PK >= PT ? PK : PT )) + CA_CERT_COUNT=$(( CK >= CT ? CK : CT )) else - # Fallback to grep if yq is not available + PROFILE_COUNT=0 + CA_CERT_COUNT=0 + fi + # If yq returned 0/0 but YAML clearly has content, use grep-based counts (works regardless of yq version/parsing) + if [[ $PROFILE_COUNT -lt $MIN_REQUIRED_PROFILES || $CA_CERT_COUNT -lt $MIN_REQUIRED_PROFILES ]]; then PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3ProfileName:" 2>/dev/null || echo "0") - if [[ $PROFILE_COUNT -eq 0 ]]; then - PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") - fi + [[ "${PROFILE_COUNT:-0}" -eq 0 ]] && PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") CA_CERT_COUNT=$(echo "$VERIFIED_YAML" | grep -c "caCertificates:" 2>/dev/null || echo "0") fi PROFILE_COUNT=$(echo "$PROFILE_COUNT" | tr -d ' \n\r' | grep -E '^[0-9]+$' || echo "0") @@ -904,26 +774,24 @@ with open('$WORK_DIR/ramen-patch.json', 'w') as f: VERIFICATION_ERRORS+=("caCertificates not found") fi - if ! 
echo "$VERIFIED_YAML" | grep -q "$CA_BUNDLE_BASE64"; then - VERIFICATION_PASSED=false - VERIFICATION_ERRORS+=("CA bundle base64 data not found") - fi - - # Verify each profile has caCertificates - # CRITICAL: Must find at least 2 S3profiles + # Verify structure: s3StoreProfiles under kubeObjectProtection or at top level (match script output) MIN_REQUIRED_PROFILES=2 if echo "$VERIFIED_YAML" | grep -q "s3StoreProfiles"; then - # Use yq to properly parse YAML and count profiles if command -v yq &>/dev/null; then - PROFILE_COUNT=$(echo "$VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") - # Count profiles that have caCertificates field - CA_CERT_COUNT=$(echo "$VERIFIED_YAML" | yq eval '[.s3StoreProfiles[] | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + PK=$(echo "$VERIFIED_YAML" | yq eval '.kubeObjectProtection.s3StoreProfiles | length' 2>/dev/null || echo "0") + PT=$(echo "$VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") + CK=$(echo "$VERIFIED_YAML" | yq eval '[.kubeObjectProtection.s3StoreProfiles[]? | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + CT=$(echo "$VERIFIED_YAML" | yq eval '[.s3StoreProfiles[]? | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + PK=$((10#${PK:-0})); PT=$((10#${PT:-0})); CK=$((10#${CK:-0})); CT=$((10#${CT:-0})) + PROFILE_COUNT=$(( PK >= PT ? PK : PT )) + CA_CERT_COUNT=$(( CK >= CT ? 
CK : CT )) else - # Fallback to grep if yq is not available + PROFILE_COUNT=0 + CA_CERT_COUNT=0 + fi + if [[ $PROFILE_COUNT -lt $MIN_REQUIRED_PROFILES || $CA_CERT_COUNT -lt $MIN_REQUIRED_PROFILES ]]; then PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3ProfileName:" 2>/dev/null || echo "0") - if [[ $PROFILE_COUNT -eq 0 ]]; then - PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") - fi + [[ "${PROFILE_COUNT:-0}" -eq 0 ]] && PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") CA_CERT_COUNT=$(echo "$VERIFIED_YAML" | grep -c "caCertificates:" 2>/dev/null || echo "0") fi PROFILE_COUNT=$(echo "$PROFILE_COUNT" | tr -d ' \n\r' | grep -E '^[0-9]+$' || echo "0") @@ -998,10 +866,18 @@ with open('$WORK_DIR/ramen-patch.json', 'w') as f: rm -f "$WORK_DIR/existing-ramen-config.yaml" "$WORK_DIR/ramen_manager_config.yaml" else - echo " ConfigMap does not exist, creating with ramen_manager_config.yaml containing s3StoreProfiles with caCertificates..." + echo " ConfigMap does not exist, creating with ramen_manager_config.yaml containing 2 s3StoreProfiles (${PRIMARY_CLUSTER}, ${SECONDARY_CLUSTER}) with caCertificates..." oc create configmap ramen-hub-operator-config -n openshift-operators \ - --from-literal=ramen_manager_config.yaml="s3StoreProfiles: - - name: default + --from-literal=ramen_manager_config.yaml="kubeObjectProtection: + s3StoreProfiles: + - s3ProfileName: $PRIMARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" + - s3ProfileName: $SECONDARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" +s3StoreProfiles: + - s3ProfileName: $PRIMARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" + - s3ProfileName: $SECONDARY_CLUSTER caCertificates: \"$CA_BUNDLE_BASE64\"" || { echo " Warning: Could not create ramen-hub-operator-config" } @@ -1160,7 +1036,7 @@ done # Verify distribution to managed clusters echo "9. Verifying certificate distribution to managed clusters..." 
verification_failed=false -REQUIRED_VERIFICATION_CLUSTERS=("ocp-primary" "ocp-secondary") +REQUIRED_VERIFICATION_CLUSTERS=("$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") VERIFIED_CLUSTERS=() for cluster in $MANAGED_CLUSTERS; do @@ -1213,7 +1089,7 @@ if [[ ${#MISSING_VERIFICATION_CLUSTERS[@]} -gt 0 ]]; then done echo "" echo "The ODF SSL certificate extractor job requires successful certificate distribution" - echo "to ALL managed clusters (ocp-primary and ocp-secondary)." + echo "to ALL managed clusters ($PRIMARY_CLUSTER and $SECONDARY_CLUSTER)." echo "" echo "Without proper certificate distribution, the DR setup will fail." echo "Please check cluster connectivity and kubeconfig availability." @@ -1245,47 +1121,47 @@ if [[ -z "$FINAL_VERIFIED_YAML" ]]; then handle_error "ramen-hub-operator-config ConfigMap is missing or empty - CA material not configured" fi +# Write to file to avoid ARG_MAX when content is large (big base64 certs); grep/yq on file are reliable +FINAL_VERIFIED_FILE="${WORK_DIR:-/tmp/odf-ssl-certs}/final_verified_ramen.yaml" +mkdir -p "$(dirname "$FINAL_VERIFIED_FILE")" +printf '%s' "$FINAL_VERIFIED_YAML" > "$FINAL_VERIFIED_FILE" + FINAL_VERIFICATION_PASSED=true FINAL_VERIFICATION_ERRORS=() -if ! echo "$FINAL_VERIFIED_YAML" | grep -q "s3StoreProfiles"; then +if ! grep -q "s3StoreProfiles" "$FINAL_VERIFIED_FILE" 2>/dev/null; then FINAL_VERIFICATION_PASSED=false FINAL_VERIFICATION_ERRORS+=("s3StoreProfiles not found in final verification") fi -if ! echo "$FINAL_VERIFIED_YAML" | grep -q "caCertificates"; then +if ! grep -q "caCertificates" "$FINAL_VERIFIED_FILE" 2>/dev/null; then FINAL_VERIFICATION_PASSED=false FINAL_VERIFICATION_ERRORS+=("caCertificates not found in final verification") fi -if ! 
echo "$FINAL_VERIFIED_YAML" | grep -q "$CA_BUNDLE_BASE64"; then - FINAL_VERIFICATION_PASSED=false - FINAL_VERIFICATION_ERRORS+=("CA bundle base64 data not found in final verification") -fi - -# Verify each profile has caCertificates -# CRITICAL: Must find at least 2 S3profiles +# Verify structure: s3StoreProfiles under kubeObjectProtection or at top level (match script output) MIN_REQUIRED_PROFILES=2 -if echo "$FINAL_VERIFIED_YAML" | grep -q "s3StoreProfiles"; then - # Use yq to properly parse YAML and count profiles +if grep -q "s3StoreProfiles" "$FINAL_VERIFIED_FILE" 2>/dev/null; then if command -v yq &>/dev/null; then - FINAL_PROFILE_COUNT=$(echo "$FINAL_VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") - # Count profiles that have caCertificates field - FINAL_CA_CERT_COUNT=$(echo "$FINAL_VERIFIED_YAML" | yq eval '[.s3StoreProfiles[] | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + PK=$(yq eval '.kubeObjectProtection.s3StoreProfiles | length' "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + PT=$(yq eval '.s3StoreProfiles | length' "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + CK=$(yq eval '[.kubeObjectProtection.s3StoreProfiles[]? | select(has("caCertificates"))] | length' "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + CT=$(yq eval '[.s3StoreProfiles[]? | select(has("caCertificates"))] | length' "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + PK=$((10#${PK:-0})); PT=$((10#${PT:-0})); CK=$((10#${CK:-0})); CT=$((10#${CT:-0})) + FINAL_PROFILE_COUNT=$(( PK >= PT ? PK : PT )) + FINAL_CA_CERT_COUNT=$(( CK >= CT ? 
CK : CT )) else - # Fallback to grep if yq is not available - FINAL_PROFILE_COUNT=$(echo "$FINAL_VERIFIED_YAML" | grep -c "s3ProfileName:" 2>/dev/null || echo "0") - if [[ $FINAL_PROFILE_COUNT -eq 0 ]]; then - FINAL_PROFILE_COUNT=$(echo "$FINAL_VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") - fi - FINAL_CA_CERT_COUNT=$(echo "$FINAL_VERIFIED_YAML" | grep -c "caCertificates:" 2>/dev/null || echo "0") + FINAL_PROFILE_COUNT=0 + FINAL_CA_CERT_COUNT=0 + fi + if [[ $FINAL_PROFILE_COUNT -lt $MIN_REQUIRED_PROFILES || $FINAL_CA_CERT_COUNT -lt $MIN_REQUIRED_PROFILES ]]; then + FINAL_PROFILE_COUNT=$(grep -c "s3ProfileName:" "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + [[ "${FINAL_PROFILE_COUNT:-0}" -eq 0 ]] && FINAL_PROFILE_COUNT=$(grep -c "s3Bucket:" "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + FINAL_CA_CERT_COUNT=$(grep -c "caCertificates:" "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") fi - # Remove any whitespace/newlines and ensure numeric FINAL_PROFILE_COUNT=$(echo "$FINAL_PROFILE_COUNT" | tr -d ' \n\r' | grep -E '^[0-9]+$' || echo "0") FINAL_CA_CERT_COUNT=$(echo "$FINAL_CA_CERT_COUNT" | tr -d ' \n\r' | grep -E '^[0-9]+$' || echo "0") - - # Force to integer (remove leading zeros) FINAL_PROFILE_COUNT=$((10#$FINAL_PROFILE_COUNT)) FINAL_CA_CERT_COUNT=$((10#$FINAL_CA_CERT_COUNT)) @@ -1355,9 +1231,14 @@ if [[ "$FINAL_VERIFICATION_PASSED" != "true" ]]; then echo " - $error" done echo " Current ConfigMap YAML content:" - echo "$FINAL_VERIFIED_YAML" + cat "$FINAL_VERIFIED_FILE" echo "" - echo " The ConfigMap edit is not complete and correct until the CA material has been added to the S3profiles." + if [[ $FINAL_PROFILE_COUNT -eq 0 ]]; then + echo " s3StoreProfiles is empty ([]). Configure at least 2 S3 store profiles in ramen-hub-operator-config" + echo " (via Ramen hub operator or ODF) before this job can add CA certificates. This job cannot create profiles." 
+ else + echo " The ConfigMap edit is not complete until CA material has been added to all S3 profiles." + fi echo " This is a CRITICAL error - the job cannot complete successfully." handle_error "Final verification failed - ramen-hub-operator-config is not complete and correct - CA material not in s3StoreProfiles" # After handle_error, return failure to trigger retry in main loop diff --git a/charts/hub/opp/scripts/odf-ssl-precheck.sh b/charts/hub/opp/scripts/odf-ssl-precheck.sh index 3201937..b31c270 100755 --- a/charts/hub/opp/scripts/odf-ssl-precheck.sh +++ b/charts/hub/opp/scripts/odf-ssl-precheck.sh @@ -54,12 +54,16 @@ cleanup_placeholder_configmaps() { return 0 } +# Primary and secondary managed cluster names (from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Function to wait for required clusters to be available and joined wait_for_cluster_readiness() { - echo "πŸ” Waiting for required clusters (ocp-primary and ocp-secondary) to be available and joined..." + echo "πŸ” Waiting for required clusters ($PRIMARY_CLUSTER and $SECONDARY_CLUSTER) to be available and joined..." 
echo " This may take several minutes during initial cluster deployment" - REQUIRED_CLUSTERS=("ocp-primary" "ocp-secondary") + REQUIRED_CLUSTERS=("$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") attempt=1 while [[ $attempt -le $CLUSTER_READINESS_MAX_ATTEMPTS ]]; do @@ -154,12 +158,12 @@ check_certificate_distribution() { fi hub_certs=$(echo "$bundle_content" | grep -c "hub" || echo "0") - ocp_primary_certs=$(echo "$bundle_content" | grep -c "ocp-primary" || echo "0") - ocp_secondary_certs=$(echo "$bundle_content" | grep -c "ocp-secondary" || echo "0") + ocp_primary_certs=$(echo "$bundle_content" | grep -c "$PRIMARY_CLUSTER" || echo "0") + ocp_secondary_certs=$(echo "$bundle_content" | grep -c "$SECONDARY_CLUSTER" || echo "0") echo " Hub cluster certificates: $hub_certs" - echo " ocp-primary certificates: $ocp_primary_certs" - echo " ocp-secondary certificates: $ocp_secondary_certs" + echo " $PRIMARY_CLUSTER certificates: $ocp_primary_certs" + echo " $SECONDARY_CLUSTER certificates: $ocp_secondary_certs" if [[ $hub_certs -lt 2 || $ocp_primary_certs -lt 2 || $ocp_secondary_certs -lt 2 ]]; then echo "❌ Missing certificates from one or more clusters" @@ -361,7 +365,7 @@ spec: echo " Added hub ingress CA to bundle" # Track required clusters - REQUIRED_CLUSTERS=("hub" "ocp-primary" "ocp-secondary") + REQUIRED_CLUSTERS=("hub" "$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") EXTRACTED_CLUSTERS=() if [[ "$hub_ca_extracted" == "true" ]]; then EXTRACTED_CLUSTERS+=("hub") @@ -369,7 +373,7 @@ spec: cluster_count=0 for cluster in $managed_clusters; do - if [[ "$cluster" == "ocp-primary" || "$cluster" == "ocp-secondary" ]]; then + if [[ "$cluster" == "$PRIMARY_CLUSTER" || "$cluster" == "$SECONDARY_CLUSTER" ]]; then cluster_count=$((cluster_count + 1)) echo "3.$cluster_count Extracting CA from $cluster..." 
@@ -411,8 +415,8 @@ spec: echo "" echo "The ODF SSL certificate extractor job requires CA material from ALL three clusters:" echo " - hub (hub cluster)" - echo " - ocp-primary (primary managed cluster)" - echo " - ocp-secondary (secondary managed cluster)" + echo " - $PRIMARY_CLUSTER (primary managed cluster)" + echo " - $SECONDARY_CLUSTER (secondary managed cluster)" echo "" echo "Without CA material from all clusters, the DR setup will fail." echo "Please ensure all clusters are accessible and have proper kubeconfigs." @@ -841,7 +845,7 @@ with open('existing-ramen-config.yaml', 'w') as f: echo "9. Verifying certificate distribution to managed clusters..." verification_failed=false - REQUIRED_VERIFICATION_CLUSTERS=("ocp-primary" "ocp-secondary") + REQUIRED_VERIFICATION_CLUSTERS=("$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") VERIFIED_CLUSTERS=() for cluster in $MANAGED_CLUSTERS; do @@ -893,7 +897,7 @@ with open('existing-ramen-config.yaml', 'w') as f: done echo "" echo "The ODF SSL certificate extractor job requires successful certificate distribution" - echo "to ALL managed clusters (ocp-primary and ocp-secondary)." + echo "to ALL managed clusters ($PRIMARY_CLUSTER and $SECONDARY_CLUSTER)." echo "" echo "Without proper certificate distribution, the DR setup will fail." echo "Please check cluster connectivity and kubeconfig availability." 
diff --git a/charts/hub/opp/templates/_helpers.tpl b/charts/hub/opp/templates/_helpers.tpl new file mode 100644 index 0000000..6e11da2 --- /dev/null +++ b/charts/hub/opp/templates/_helpers.tpl @@ -0,0 +1,13 @@ +{{/* Primary cluster name: clusterOverrides.primary.name else regionalDR[0].clusters.primary.name else ocp-primary */}} +{{- define "opp.primaryClusterName" -}} +{{- $over := index (.Values.clusterOverrides | default dict) "primary" | default dict -}} +{{- $fromOver := index $over "name" -}} +{{- if $fromOver }}{{ $fromOver }}{{- else if and .Values.regionalDR (index .Values.regionalDR 0) }}{{ (index .Values.regionalDR 0).clusters.primary.name | default "ocp-primary" }}{{- else }}ocp-primary{{ end -}} +{{- end -}} + +{{/* Secondary cluster name */}} +{{- define "opp.secondaryClusterName" -}} +{{- $over := index (.Values.clusterOverrides | default dict) "secondary" | default dict -}} +{{- $fromOver := index $over "name" -}} +{{- if $fromOver }}{{ $fromOver }}{{- else if and .Values.regionalDR (index .Values.regionalDR 0) }}{{ (index .Values.regionalDR 0).clusters.secondary.name | default "ocp-secondary" }}{{- else }}ocp-secondary{{ end -}} +{{- end -}} diff --git a/charts/hub/opp/templates/cronjob-argocd-health-monitor.yaml b/charts/hub/opp/templates/cronjob-argocd-health-monitor.yaml index 3e63f87..d3da73d 100644 --- a/charts/hub/opp/templates/cronjob-argocd-health-monitor.yaml +++ b/charts/hub/opp/templates/cronjob-argocd-health-monitor.yaml @@ -25,6 +25,19 @@ spec: containers: - name: argocd-health-monitor image: registry.redhat.io/openshift4/ose-cli:latest + env: + - name: PRIMARY_CLUSTER + value: {{ include "opp.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "opp.secondaryClusterName" . 
| quote }} + - name: FORCE_SYNC_APP_NAMESPACE + value: {{ .Values.argocdHealthMonitor.forceSyncAppNamespace | default "openshift-gitops" | quote }} + - name: FORCE_SYNC_APP_NAME + value: {{ .Values.argocdHealthMonitor.forceSyncAppName | default "ramendr-starter-kit-resilient" | quote }} + - name: FORCE_SYNC_RESOURCE_KIND + value: {{ .Values.argocdHealthMonitor.forceSyncResourceKind | default "Namespace" | quote }} + - name: FORCE_SYNC_RESOURCE_NAME + value: {{ .Values.argocdHealthMonitor.forceSyncResourceName | default "ramendr-starter-kit-resilient" | quote }} command: - /bin/bash - -c diff --git a/charts/hub/opp/templates/job-argocd-health-monitor.yaml b/charts/hub/opp/templates/job-argocd-health-monitor.yaml index 42de71e..0469958 100644 --- a/charts/hub/opp/templates/job-argocd-health-monitor.yaml +++ b/charts/hub/opp/templates/job-argocd-health-monitor.yaml @@ -18,6 +18,19 @@ spec: containers: - name: argocd-health-monitor image: registry.redhat.io/openshift4/ose-cli:latest + env: + - name: PRIMARY_CLUSTER + value: {{ include "opp.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "opp.secondaryClusterName" . 
| quote }} + - name: FORCE_SYNC_APP_NAMESPACE + value: {{ .Values.argocdHealthMonitor.forceSyncAppNamespace | default "openshift-gitops" | quote }} + - name: FORCE_SYNC_APP_NAME + value: {{ .Values.argocdHealthMonitor.forceSyncAppName | default "ramendr-starter-kit-resilient" | quote }} + - name: FORCE_SYNC_RESOURCE_KIND + value: {{ .Values.argocdHealthMonitor.forceSyncResourceKind | default "Namespace" | quote }} + - name: FORCE_SYNC_RESOURCE_NAME + value: {{ .Values.argocdHealthMonitor.forceSyncResourceName | default "ramendr-starter-kit-resilient" | quote }} command: - /bin/bash - -c diff --git a/charts/hub/opp/templates/job-odf-ssl-certificate-extraction.yaml b/charts/hub/opp/templates/job-odf-ssl-certificate-extraction.yaml index fe50de5..70ead86 100644 --- a/charts/hub/opp/templates/job-odf-ssl-certificate-extraction.yaml +++ b/charts/hub/opp/templates/job-odf-ssl-certificate-extraction.yaml @@ -31,6 +31,10 @@ spec: env: - name: KUBECONFIG value: "" + - name: PRIMARY_CLUSTER + value: {{ include "opp.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "opp.secondaryClusterName" . | quote }} restartPolicy: Never serviceAccountName: odf-ssl-extractor-sa backoffLimit: 10 diff --git a/charts/hub/opp/templates/job-odf-ssl-certificate-precheck.yaml b/charts/hub/opp/templates/job-odf-ssl-certificate-precheck.yaml index d8b65c2..5a9b4d0 100644 --- a/charts/hub/opp/templates/job-odf-ssl-certificate-precheck.yaml +++ b/charts/hub/opp/templates/job-odf-ssl-certificate-precheck.yaml @@ -30,6 +30,11 @@ spec: limits: memory: "128Mi" cpu: "100m" + env: + - name: PRIMARY_CLUSTER + value: {{ include "opp.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "opp.secondaryClusterName" . 
| quote }} command: - /bin/bash - -c diff --git a/charts/hub/opp/values.yaml b/charts/hub/opp/values.yaml index 09a27d0..050c3e4 100644 --- a/charts/hub/opp/values.yaml +++ b/charts/hub/opp/values.yaml @@ -1,8 +1,19 @@ -# No Values -# Need to use fromSecret for these keys. problems ith json output. -## accessKey: '{{ `{{ fromSecret "openshift-storage" "noobaa-admin" "AWS_ACCESS_KEY_ID" }}` }}' -## secretKey: '{{ `{{ fromSecret "openshift-storage" "noobaa-admin" "AWS_SECRET_ACCESS_KEY" }}` }}' +--- +# DR pair cluster names - same structure as rdr chart (regionalDR). +# Override via values-hub or overrides so opp-policy and rdr use the same names. +regionalDR: + - name: resilient + clusters: + primary: + name: ocp-primary + secondary: + name: ocp-secondary argocdHealthMonitor: enabled: true - + # When remediating a wedged cluster, force-sync this specific resource in this Application (instead of restarting Argo CD) + forceSyncAppNamespace: openshift-gitops + forceSyncAppName: ramendr-starter-kit-resilient + # Specific resource to sync: the Namespace (kind/name) in the application above + forceSyncResourceKind: Namespace + forceSyncResourceName: ramendr-starter-kit-resilient diff --git a/charts/hub/rdr/README.md b/charts/hub/rdr/README.md new file mode 100644 index 0000000..465b9e1 --- /dev/null +++ b/charts/hub/rdr/README.md @@ -0,0 +1,61 @@ +# RDR (Regional DR) chart + +Helm chart for Regional DR configuration (cluster pair, install_config, DRPC, etc.). + +## Updating the default install_config JSON files + +When `values.yaml` is changed (e.g. machine types, networking CIDRs, platform settings) under `regionalDR[0].clusters.primary.install_config` or `secondary.install_config`, the chart’s fallback files must be kept in sync so minimal `regionalDR` values still produce a full install_config. 
+ +From the **repository root**: + +```bash +./scripts/update-rdr-default-install-config-json.sh +``` + +- **What it does:** Reads `charts/hub/rdr/values.yaml`, extracts both `install_config` sections, and overwrites: + - `charts/hub/rdr/files/default-primary-install-config.json` + - `charts/hub/rdr/files/default-secondary-install-config.json` +- **When to run:** After editing `install_config` in this chart’s `values.yaml`. +- **Dry-run:** To print the generated JSON without writing files: + + ```bash + ./scripts/update-rdr-default-install-config-json.sh --dry-run + ``` + +- **Requirements:** Python 3 with PyYAML (`pip install pyyaml`), or `yq` (and optionally `jq`). + +Then run the install_config tests to confirm nothing is broken: + +```bash +./scripts/test-rdr-install-config.sh +``` + +## Troubleshooting: DRCluster validation — "DRClusterConfig is not applied to cluster" + +The DRCluster validation job (sync-wave 8) waits until each DRCluster’s status shows `Validated=True`. If you see: + +```text +DRCluster ocp-p: Not validated yet (status: False) + Message: DRClusterConfig is not applied to cluster (ocp-p) +``` + +then the Ramen/ODF DR controller has not yet applied the DR config to that managed cluster (usually via a ManifestWork). + +**Checks:** + +1. **Hub operator** – ODF Multicluster Orchestrator / Ramen DR is installed on the hub and DRPolicy + DRCluster resources exist and are correct. +2. **Clusters joined** – Both clusters appear as ManagedClusters and are available: + + ```bash + oc get managedcluster ocp-p ocp-s + ``` + +3. **ManifestWorks** – Ramen creates ManifestWorks in each cluster’s namespace to deploy the DR cluster operator. On the hub: + + ```bash + oc get manifestwork -n ocp-p + oc get manifestwork -n ocp-s + ``` + + If these are missing or not applied, check Ramen controller logs on the hub. +4. 
**Cluster readiness** – Clusters must be reachable from the hub so the hub can apply and reconcile the ManifestWork; ensure neither cluster is degraded or not ready. diff --git a/charts/hub/rdr/files/default-primary-install-config.json b/charts/hub/rdr/files/default-primary-install-config.json new file mode 100644 index 0000000..98a3db2 --- /dev/null +++ b/charts/hub/rdr/files/default-primary-install-config.json @@ -0,0 +1,55 @@ +{ + "apiVersion": "v1", + "baseDomain": "cluster.example.com", + "metadata": { + "name": "ocp-primary" + }, + "controlPlane": { + "name": "master", + "replicas": 3, + "platform": { + "aws": { + "type": "m5.4xlarge" + } + } + }, + "compute": [ + { + "name": "worker", + "replicas": 3, + "platform": { + "aws": { + "type": "m5.metal" + } + } + } + ], + "networking": { + "clusterNetwork": [ + { + "cidr": "10.132.0.0/14", + "hostPrefix": 23 + } + ], + "machineNetwork": [ + { + "cidr": "10.1.0.0/16" + } + ], + "networkType": "OVNKubernetes", + "serviceNetwork": [ + "172.20.0.0/16" + ] + }, + "platform": { + "aws": { + "region": "us-west-1", + "userTags": { + "project": "ValidatedPatterns" + } + } + }, + "publish": "External", + "sshKey": "", + "pullSecret": "" +} diff --git a/charts/hub/rdr/files/default-secondary-install-config.json b/charts/hub/rdr/files/default-secondary-install-config.json new file mode 100644 index 0000000..8fca7d1 --- /dev/null +++ b/charts/hub/rdr/files/default-secondary-install-config.json @@ -0,0 +1,55 @@ +{ + "apiVersion": "v1", + "baseDomain": "cluster.example.com", + "metadata": { + "name": "ocp-secondary" + }, + "controlPlane": { + "name": "master", + "replicas": 3, + "platform": { + "aws": { + "type": "m5.4xlarge" + } + } + }, + "compute": [ + { + "name": "worker", + "replicas": 3, + "platform": { + "aws": { + "type": "m5.metal" + } + } + } + ], + "networking": { + "clusterNetwork": [ + { + "cidr": "10.136.0.0/14", + "hostPrefix": 23 + } + ], + "machineNetwork": [ + { + "cidr": "10.2.0.0/16" + } + ], + "networkType": 
"OVNKubernetes", + "serviceNetwork": [ + "172.21.0.0/16" + ] + }, + "platform": { + "aws": { + "region": "us-east-1", + "userTags": { + "project": "ValidatedPatterns" + } + } + }, + "publish": "External", + "sshKey": "", + "pullSecret": "" +} diff --git a/charts/hub/rdr/scripts/edge-gitops-vms-deploy.sh b/charts/hub/rdr/scripts/edge-gitops-vms-deploy.sh index 9acd452..6467942 100755 --- a/charts/hub/rdr/scripts/edge-gitops-vms-deploy.sh +++ b/charts/hub/rdr/scripts/edge-gitops-vms-deploy.sh @@ -25,8 +25,9 @@ display_apply_error() { echo "Starting Edge GitOps VMs deployment check and deployment..." echo "This job will check for existing VMs, Services, Routes, and ExternalSecrets before applying the helm template" -# Configuration -HELM_CHART_URL="https://github.com/validatedpatterns/helm-charts/releases/download/main/edge-gitops-vms-0.2.10.tgz" +# Configuration (HELM_CHART_VERSION from values/env, default 0.2.10) +HELM_CHART_VERSION="${HELM_CHART_VERSION:-0.2.10}" +HELM_CHART_URL="https://github.com/validatedpatterns/helm-charts/releases/download/main/edge-gitops-vms-${HELM_CHART_VERSION}.tgz" WORK_DIR="/tmp/edge-gitops-vms" VALUES_FILE="$WORK_DIR/values-egv-dr.yaml" VM_NAMESPACE="gitops-vms" @@ -73,8 +74,8 @@ get_target_cluster_from_placement() { if [[ -z "$PLACEMENT_DECISION" ]]; then echo " ⚠️ Warning: Could not find PlacementDecision for $PLACEMENT_NAME" - echo " Will default to primary cluster (ocp-primary)" - TARGET_CLUSTER="ocp-primary" + echo " Will default to primary cluster (${PRIMARY_CLUSTER:-ocp-primary})" + TARGET_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" return 1 fi @@ -84,8 +85,8 @@ get_target_cluster_from_placement() { if [[ -z "$TARGET_CLUSTER" ]]; then echo " ⚠️ Warning: Could not determine target cluster from PlacementDecision" - echo " Will default to primary cluster (ocp-primary)" - TARGET_CLUSTER="ocp-primary" + echo " Will default to primary cluster (${PRIMARY_CLUSTER:-ocp-primary})" + TARGET_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" return 1 
fi @@ -93,51 +94,77 @@ get_target_cluster_from_placement() { return 0 } -# Function to get kubeconfig for target managed cluster +# Function to get kubeconfig for target managed cluster (run from hub; secrets are in hub namespace ) get_target_cluster_kubeconfig() { local cluster="$1" - echo "Getting kubeconfig for target managed cluster: $cluster" + echo "Getting kubeconfig for target managed cluster: $cluster (from hub cluster)" + + # Try known secret names used by ACM for managed cluster kubeconfig + local secret_names=("${cluster}-admin-kubeconfig" "admin-kubeconfig" "import-kubeconfig") + local got_kubeconfig=false + + for secret_name in "${secret_names[@]}"; do + if oc get secret "$secret_name" -n "$cluster" -o jsonpath='{.data.kubeconfig}' 2>/dev/null | \ + base64 -d > "$WORK_DIR/target-kubeconfig.yaml" 2>/dev/null && [[ -s "$WORK_DIR/target-kubeconfig.yaml" ]]; then + got_kubeconfig=true + echo " βœ… Retrieved kubeconfig from secret $secret_name (namespace $cluster)" + break + fi + done + + if [[ "$got_kubeconfig" != "true" ]]; then + # Fallback: any secret in namespace $cluster with kubeconfig data + if oc get secret -n "$cluster" -o name | grep -E "(admin-kubeconfig|kubeconfig)" | head -1 | \ + xargs -I {} oc get {} -n "$cluster" -o jsonpath='{.data.kubeconfig}' | \ + base64 -d > "$WORK_DIR/target-kubeconfig.yaml" 2>/dev/null && [[ -s "$WORK_DIR/target-kubeconfig.yaml" ]]; then + got_kubeconfig=true + echo " βœ… Retrieved kubeconfig for $cluster" + fi + fi - # Try to get kubeconfig from secret - if oc get secret -n "$cluster" -o name | grep -E "(admin-kubeconfig|kubeconfig)" | head -1 | \ - xargs -I {} oc get {} -n "$cluster" -o jsonpath='{.data.kubeconfig}' | \ - base64 -d > "$WORK_DIR/target-kubeconfig.yaml" 2>/dev/null; then - echo " βœ… Retrieved kubeconfig for $cluster" + if [[ "$got_kubeconfig" == "true" ]]; then export KUBECONFIG="$WORK_DIR/target-kubeconfig.yaml" - - # Verify we can connect to the target cluster if oc get nodes &>/dev/null; then 
echo " βœ… Successfully connected to target managed cluster: $cluster" return 0 - else - echo " ⚠️ Warning: Could not verify connection to target cluster" - return 1 fi - else - echo " ⚠️ Could not get kubeconfig for $cluster" - echo " Will use current context (assuming we're already on the target cluster)" + echo " ⚠️ Warning: Kubeconfig retrieved but could not verify connection to $cluster" return 1 fi + + echo " ⚠️ Could not get kubeconfig for $cluster" + return 1 } +# Primary/secondary cluster names (from regionalDR via env when run by the rdr chart Job) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Get target cluster from Placement resource -TARGET_CLUSTER="ocp-primary" # Default to primary +TARGET_CLUSTER="$PRIMARY_CLUSTER" # Default to primary if get_target_cluster_from_placement; then echo " Target cluster: $TARGET_CLUSTER" else echo " Using default target cluster: $TARGET_CLUSTER" fi -# Get kubeconfig for target cluster +# Get kubeconfig for target cluster (must succeed so we do not deploy to hub by mistake) if ! get_target_cluster_kubeconfig "$TARGET_CLUSTER"; then - echo " ⚠️ Warning: Could not get kubeconfig for target cluster" - echo " Continuing with current context..." + echo " ❌ Error: Could not get kubeconfig for target cluster $TARGET_CLUSTER" + echo " Deployment must run against the primary/target cluster, not the hub." + echo " Ensure the hub can read the kubeconfig secret for $TARGET_CLUSTER (e.g. admin-kubeconfig in namespace $TARGET_CLUSTER)." 
+ exit 1 fi -# Check if we're on the right cluster +# Verify we're on the target cluster, not the hub CURRENT_CLUSTER=$(oc config view --minify -o jsonpath='{.contexts[0].context.cluster}' 2>/dev/null || echo "") echo "Current cluster context: $CURRENT_CLUSTER" echo "Target cluster for deployment: $TARGET_CLUSTER" +# Refuse if we're still on hub (in-cluster or local-cluster context) +if [[ "$CURRENT_CLUSTER" == "in-cluster" || "$CURRENT_CLUSTER" == "local-cluster" ]]; then + echo " ❌ Error: Current context is the hub (${CURRENT_CLUSTER}), not target $TARGET_CLUSTER. Refusing to deploy." + exit 1 +fi # Ensure the gitops-vms namespace exists on the target cluster echo "" @@ -495,16 +522,14 @@ else echo "" echo " Applying template to namespace: $VM_NAMESPACE..." - # Verify we're using the correct kubeconfig (target cluster) - if [[ -n "${KUBECONFIG:-}" && -f "$KUBECONFIG" ]]; then - CURRENT_CLUSTER=$(oc config view --minify -o jsonpath='{.contexts[0].context.cluster}' 2>/dev/null || echo "") - echo " Using kubeconfig: $KUBECONFIG" - echo " Current cluster context: $CURRENT_CLUSTER" - echo " Target cluster: $TARGET_CLUSTER" - else - echo " ⚠️ Warning: KUBECONFIG not set or file not found, using default context" - echo " This may apply to the wrong cluster!" + # Require that we are using the target cluster's kubeconfig (never apply to hub) + if [[ "${KUBECONFIG:-}" != "$WORK_DIR/target-kubeconfig.yaml" || ! -f "$WORK_DIR/target-kubeconfig.yaml" ]]; then + echo " ❌ Error: KUBECONFIG must point to target cluster ($TARGET_CLUSTER). Refusing to apply to avoid deploying to hub." 
+ exit 1 fi + CURRENT_CLUSTER=$(oc config view --minify -o jsonpath='{.contexts[0].context.cluster}' 2>/dev/null || echo "") + echo " Using kubeconfig: $KUBECONFIG (target: $TARGET_CLUSTER)" + echo " Current cluster context: $CURRENT_CLUSTER" # Now apply the template and capture both stdout, stderr, and exit code # The oc apply will use the KUBECONFIG set earlier (target cluster's kubeconfig) diff --git a/charts/hub/rdr/scripts/odf-dr-prerequisites-check.sh b/charts/hub/rdr/scripts/odf-dr-prerequisites-check.sh index b34ecc3..40c5cd2 100755 --- a/charts/hub/rdr/scripts/odf-dr-prerequisites-check.sh +++ b/charts/hub/rdr/scripts/odf-dr-prerequisites-check.sh @@ -3,10 +3,10 @@ set -euo pipefail echo "Starting ODF DR prerequisites check..." -# Configuration +# Configuration (PRIMARY_CLUSTER and SECONDARY_CLUSTER from values.yaml via env) HUB_CLUSTER="local-cluster" -PRIMARY_CLUSTER="ocp-primary" -SECONDARY_CLUSTER="ocp-secondary" +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" KUBECONFIG_DIR="/tmp/kubeconfigs" MAX_ATTEMPTS=120 # 2 hours with 1 minute intervals SLEEP_INTERVAL=60 # 1 minute between checks @@ -306,35 +306,35 @@ check_ca_material_completeness() { return 1 fi - # Look for primary cluster certificates - if [[ "$hub_ca_bundle" != *"# CA from ocp-primary-ca"* ]]; then - echo "Hub cluster CA bundle missing ocp-primary-ca certificate" + # Look for primary cluster certificates (marker from odf-ssl-certificate-extraction.sh) + if [[ "$hub_ca_bundle" != *"# CA from ${PRIMARY_CLUSTER}-ca"* ]]; then + echo "Hub cluster CA bundle missing ${PRIMARY_CLUSTER}-ca certificate" return 1 fi - if [[ "$primary_ca_bundle" != *"# CA from ocp-primary-ca"* ]]; then - echo "Primary cluster CA bundle missing ocp-primary-ca certificate" + if [[ "$primary_ca_bundle" != *"# CA from ${PRIMARY_CLUSTER}-ca"* ]]; then + echo "Primary cluster CA bundle missing ${PRIMARY_CLUSTER}-ca certificate" return 1 fi - if [[ 
"$secondary_ca_bundle" != *"# CA from ocp-primary-ca"* ]]; then - echo "Secondary cluster CA bundle missing ocp-primary-ca certificate" + if [[ "$secondary_ca_bundle" != *"# CA from ${PRIMARY_CLUSTER}-ca"* ]]; then + echo "Secondary cluster CA bundle missing ${PRIMARY_CLUSTER}-ca certificate" return 1 fi # Look for secondary cluster certificates - if [[ "$hub_ca_bundle" != *"# CA from ocp-secondary-ca"* ]]; then - echo "Hub cluster CA bundle missing ocp-secondary-ca certificate" + if [[ "$hub_ca_bundle" != *"# CA from ${SECONDARY_CLUSTER}-ca"* ]]; then + echo "Hub cluster CA bundle missing ${SECONDARY_CLUSTER}-ca certificate" return 1 fi - if [[ "$primary_ca_bundle" != *"# CA from ocp-secondary-ca"* ]]; then - echo "Primary cluster CA bundle missing ocp-secondary-ca certificate" + if [[ "$primary_ca_bundle" != *"# CA from ${SECONDARY_CLUSTER}-ca"* ]]; then + echo "Primary cluster CA bundle missing ${SECONDARY_CLUSTER}-ca certificate" return 1 fi - if [[ "$secondary_ca_bundle" != *"# CA from ocp-secondary-ca"* ]]; then - echo "Secondary cluster CA bundle missing ocp-secondary-ca certificate" + if [[ "$secondary_ca_bundle" != *"# CA from ${SECONDARY_CLUSTER}-ca"* ]]; then + echo "Secondary cluster CA bundle missing ${SECONDARY_CLUSTER}-ca certificate" return 1 fi diff --git a/charts/hub/rdr/scripts/submariner-prerequisites-check.sh b/charts/hub/rdr/scripts/submariner-prerequisites-check.sh index cdcb518..389c85e 100755 --- a/charts/hub/rdr/scripts/submariner-prerequisites-check.sh +++ b/charts/hub/rdr/scripts/submariner-prerequisites-check.sh @@ -3,9 +3,9 @@ set -euo pipefail echo "Starting Submariner prerequisites check..." 
-# Configuration -PRIMARY_CLUSTER="ocp-primary" -SECONDARY_CLUSTER="ocp-secondary" +# Configuration (PRIMARY_CLUSTER and SECONDARY_CLUSTER from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" KUBECONFIG_DIR="/tmp/kubeconfigs" MAX_ATTEMPTS=120 # 2 hours with 1 minute intervals SLEEP_INTERVAL=60 # 1 minute between checks diff --git a/charts/hub/rdr/templates/_helpers.tpl b/charts/hub/rdr/templates/_helpers.tpl new file mode 100644 index 0000000..3f80695 --- /dev/null +++ b/charts/hub/rdr/templates/_helpers.tpl @@ -0,0 +1,150 @@ +{{/* + Sanitize install_config for OpenShift installer: ensure apiVersion, pass through all + install-config fields (including full platform.aws: region, subnets, userTags, amiID, + defaultMachinePlatform, serviceEndpoints, etc.) so regionalDR and clusterOverrides + can override platform/region effectively. Only strip keys known invalid for the + installer (e.g. vpc in platform.aws). +*/}} +{{- define "rdr.sanitizeInstallConfig" -}} +{{- $raw := . 
-}} +{{- $withVersion := merge (dict "apiVersion" "v1") $raw -}} +{{- $platform := index $withVersion "platform" | default dict -}} +{{- $aws := index $platform "aws" | default dict -}} +{{- /* Pass through full platform.aws (region, subnets, userTags, amiID, defaultMachinePlatform, serviceEndpoints, etc.); omit only known-invalid keys like vpc */ -}} +{{- $awsSafe := ternary (omit $aws "vpc") $aws (and (kindIs "map" $aws) (hasKey $aws "vpc")) -}} +{{- $platformSafe := merge $platform (dict "aws" $awsSafe) -}} +{{- $allowed := dict "apiVersion" (index $withVersion "apiVersion") "baseDomain" (index $withVersion "baseDomain") "metadata" (index $withVersion "metadata") "controlPlane" (index $withVersion "controlPlane") "compute" (index $withVersion "compute") "networking" (index $withVersion "networking") "platform" $platformSafe "publish" (index $withVersion "publish") "pullSecret" (index $withVersion "pullSecret") "sshKey" (index $withVersion "sshKey") -}} +{{- $allowed | toJson -}} +{{- end -}} + +{{/* + Deep-merge install_config so clusterOverrides can override only platform/region, + metadata, or any subset without replacing the rest of base install_config. + Call with dict "base" "over" . 
+*/}} +{{- define "rdr.mergeInstallConfig" -}} +{{- $base := .base | default dict -}} +{{- $over := .over | default dict -}} +{{- /* Sprig merge: first dict wins; put over first so override wins */ -}} +{{- $merged := merge $over $base -}} +{{- $metadataMerged := merge (index $over "metadata" | default dict) (index $base "metadata" | default dict) -}} +{{- $merged := merge $merged (dict "metadata" $metadataMerged) -}} +{{- $platformBase := index $base "platform" | default dict -}} +{{- $platformOver := index $over "platform" | default dict -}} +{{- $awsBase := index $platformBase "aws" | default dict -}} +{{- $awsOver := index $platformOver "aws" | default dict -}} +{{- $awsMerged := merge $awsOver $awsBase -}} +{{- $platformFinal := merge $platformBase (dict "aws" $awsMerged) -}} +{{- merge $merged (dict "platform" $platformFinal) | toJson -}} +{{- end -}} + +{{/* + Effective primary cluster: merge of regionalDR[0].clusters.primary and clusterOverrides.primary. + Use when clusterOverrides is set to avoid replacing full regionalDR in override file. + Call with a context that has .Values and optionally .primaryOverrideInstallConfig (override install_config); + if primaryOverrideInstallConfig is not provided, falls back to .Values.clusterOverrides.primary.install_config. +*/}} +{{- define "rdr.effectivePrimaryCluster" -}} +{{- $dr := index .Values.regionalDR 0 -}} +{{- $over := index (.Values.clusterOverrides | default dict) "primary" | default dict -}} +{{- $base := $dr.clusters.primary -}} +{{- $baseIC := $base.install_config | default dict -}} +{{- /* When values-hub (or similar) replaces regionalDR with minimal structure, base has no install_config; use chart default */ -}} +{{- if and (index . "Files") (not (hasKey $baseIC "controlPlane")) -}} +{{- $baseIC = fromJson ((index . "Files").Get "files/default-primary-install-config.json") | default dict -}} +{{- end -}} +{{- $overIC := index . 
"primaryOverrideInstallConfig" | default $over.install_config | default dict -}} +{{- /* Shallow merge: over wins. Deep-merge metadata, platform.aws, controlPlane, compute so over wins but base keeps machine types when over is partial. */ -}} +{{- $merged := merge $overIC $baseIC -}} +{{- $metadataMerged := merge (index $overIC "metadata" | default dict) (index $baseIC "metadata" | default dict) -}} +{{- $merged := merge $merged (dict "metadata" $metadataMerged) -}} +{{- $platformBase := index $baseIC "platform" | default dict -}} +{{- $awsBase := index $platformBase "aws" | default dict -}} +{{- $awsOver := index (index $overIC "platform" | default dict) "aws" | default dict -}} +{{- $awsMerged := merge $awsOver $awsBase -}} +{{- $platformFinal := merge $platformBase (dict "aws" $awsMerged) -}} +{{- $merged := merge $merged (dict "platform" $platformFinal) -}} +{{- /* Deep-merge controlPlane so override can set platform.aws.type without dropping base name/replicas */ -}} +{{- $cpBase := index $baseIC "controlPlane" | default dict -}} +{{- $cpOver := index $overIC "controlPlane" | default dict -}} +{{- $cpMerged := merge $cpOver $cpBase -}} +{{- $cpPlatformBase := index $cpBase "platform" | default dict -}} +{{- $cpPlatformOver := index $cpOver "platform" | default dict -}} +{{- $cpAwsBase := index $cpPlatformBase "aws" | default dict -}} +{{- $cpAwsOver := index $cpPlatformOver "aws" | default dict -}} +{{- $cpAwsMerged := merge $cpAwsOver $cpAwsBase -}} +{{- $cpPlatformFinal := merge $cpPlatformBase (dict "aws" $cpAwsMerged) -}} +{{- $controlPlaneFinal := merge $cpMerged (dict "platform" $cpPlatformFinal) -}} +{{- $merged := merge $merged (dict "controlPlane" $controlPlaneFinal) -}} +{{- /* Compute: override list wins if non-empty; else use base so base machine types are kept */ -}} +{{- $computeBase := index $baseIC "compute" | default list -}} +{{- $computeOver := index $overIC "compute" | default list -}} +{{- $computeFinal := ternary $computeOver $computeBase 
(gt (len $computeOver) 0) -}} +{{- $installConfig := merge $merged (dict "compute" $computeFinal) -}} +{{- $installConfigSafe := fromJson (include "rdr.sanitizeInstallConfig" $installConfig) -}} +{{- $defaultBaseDomain := join "." (slice (splitList "." (.Values.global.clusterDomain | default "cluster.example.com")) 1) -}} +{{- $installConfigWithBase := merge $installConfigSafe (dict "baseDomain" ($defaultBaseDomain | default (index $installConfigSafe "baseDomain"))) -}} +{{- $clusterGroup := index $over "clusterGroup" | default $base.clusterGroup | default $dr.name -}} +{{- dict "name" (index $over "name" | default $base.name) "version" (index $over "version" | default $base.version) "clusterGroup" $clusterGroup "install_config" $installConfigWithBase | toJson -}} +{{- end -}} + +{{/* + Effective secondary cluster: merge of regionalDR[0].clusters.secondary and clusterOverrides.secondary. + Call with a context that has .Values and optionally .secondaryOverrideInstallConfig. +*/}} +{{- define "rdr.effectiveSecondaryCluster" -}} +{{- $dr := index .Values.regionalDR 0 -}} +{{- $over := index (.Values.clusterOverrides | default dict) "secondary" | default dict -}} +{{- $base := $dr.clusters.secondary -}} +{{- $baseIC := $base.install_config | default dict -}} +{{- if and (index . "Files") (not (hasKey $baseIC "controlPlane")) -}} +{{- $baseIC = fromJson ((index . "Files").Get "files/default-secondary-install-config.json") | default dict -}} +{{- end -}} +{{- $overIC := index . 
"secondaryOverrideInstallConfig" | default $over.install_config | default dict -}} +{{- $merged := merge $overIC $baseIC -}} +{{- $metadataMerged := merge (index $overIC "metadata" | default dict) (index $baseIC "metadata" | default dict) -}} +{{- $merged := merge $merged (dict "metadata" $metadataMerged) -}} +{{- $platformBase := index $baseIC "platform" | default dict -}} +{{- $awsBase := index $platformBase "aws" | default dict -}} +{{- $awsOver := index (index $overIC "platform" | default dict) "aws" | default dict -}} +{{- $awsMerged := merge $awsOver $awsBase -}} +{{- $platformFinal := merge $platformBase (dict "aws" $awsMerged) -}} +{{- $merged := merge $merged (dict "platform" $platformFinal) -}} +{{- $cpBase := index $baseIC "controlPlane" | default dict -}} +{{- $cpOver := index $overIC "controlPlane" | default dict -}} +{{- $cpMerged := merge $cpOver $cpBase -}} +{{- $cpPlatformBase := index $cpBase "platform" | default dict -}} +{{- $cpPlatformOver := index $cpOver "platform" | default dict -}} +{{- $cpAwsBase := index $cpPlatformBase "aws" | default dict -}} +{{- $cpAwsOver := index $cpPlatformOver "aws" | default dict -}} +{{- $cpAwsMerged := merge $cpAwsOver $cpAwsBase -}} +{{- $cpPlatformFinal := merge $cpPlatformBase (dict "aws" $cpAwsMerged) -}} +{{- $controlPlaneFinal := merge $cpMerged (dict "platform" $cpPlatformFinal) -}} +{{- $merged := merge $merged (dict "controlPlane" $controlPlaneFinal) -}} +{{- $computeBase := index $baseIC "compute" | default list -}} +{{- $computeOver := index $overIC "compute" | default list -}} +{{- $computeFinal := ternary $computeOver $computeBase (gt (len $computeOver) 0) -}} +{{- $installConfig := merge $merged (dict "compute" $computeFinal) -}} +{{- $installConfigSafe := fromJson (include "rdr.sanitizeInstallConfig" $installConfig) -}} +{{- $defaultBaseDomain := join "." (slice (splitList "." 
(.Values.global.clusterDomain | default "cluster.example.com")) 1) -}} +{{- $installConfigWithBase := merge $installConfigSafe (dict "baseDomain" ($defaultBaseDomain | default (index $installConfigSafe "baseDomain"))) -}} +{{- $clusterGroup := index $over "clusterGroup" | default $base.clusterGroup | default $dr.name -}} +{{- dict "name" (index $over "name" | default $base.name) "version" (index $over "version" | default $base.version) "clusterGroup" $clusterGroup "install_config" $installConfigWithBase | toJson -}} +{{- end -}} + +{{/* Primary cluster name for use in drpc, jobs, etc. */}} +{{- define "rdr.primaryClusterName" -}} +{{- $dr := index .Values.regionalDR 0 -}} +{{- index (index (.Values.clusterOverrides | default dict) "primary" | default dict) "name" | default $dr.clusters.primary.name -}} +{{- end -}} + +{{/* Secondary cluster name */}} +{{- define "rdr.secondaryClusterName" -}} +{{- $dr := index .Values.regionalDR 0 -}} +{{- index (index (.Values.clusterOverrides | default dict) "secondary" | default dict) "name" | default $dr.clusters.secondary.name -}} +{{- end -}} + +{{/* Preferred cluster for DRPC (default: primary). Override via values.drpc.preferredCluster. */}} +{{- define "rdr.preferredClusterName" -}} +{{- (index (.Values.drpc | default dict) "preferredCluster") | default (include "rdr.primaryClusterName" .) -}} +{{- end -}} diff --git a/charts/hub/rdr/templates/cluster_deployments.yaml b/charts/hub/rdr/templates/cluster_deployments.yaml index 4ad029b..f1221fb 100644 --- a/charts/hub/rdr/templates/cluster_deployments.yaml +++ b/charts/hub/rdr/templates/cluster_deployments.yaml @@ -1,7 +1,14 @@ -{{- range .Values.regionalDR }} -{{ $clusterSet := .name }} -{{- range list .clusters.primary .clusters.secondary }} -{{ $cluster := . 
}} +{{- $dr := index .Values.regionalDR 0 }} +{{- $clusterSet := $dr.name }} +{{- $co := .Values.clusterOverrides | default dict -}} +{{- $ctx := dict "Values" .Values "Files" .Files "primaryOverrideInstallConfig" (index (index $co "primary" | default dict) "install_config" | default dict) "secondaryOverrideInstallConfig" (index (index $co "secondary" | default dict) "install_config" | default dict) }} +{{- $effectivePrimary := include "rdr.effectivePrimaryCluster" $ctx | fromJson }} +{{- $effectiveSecondary := include "rdr.effectiveSecondaryCluster" $ctx | fromJson }} +{{- $defaultBaseDomain := join "." (slice (splitList "." (.Values.global.clusterDomain | default "cluster.example.com")) 1) }} +{{- range list $effectivePrimary $effectiveSecondary }} +{{- $cluster := . }} +{{- $baseDomainRaw := index $cluster.install_config "baseDomain" }} +{{- $baseDomainStr := default $defaultBaseDomain (and (kindIs "string" $baseDomainRaw) $baseDomainRaw) }} --- apiVersion: v1 kind: Namespace @@ -104,7 +111,7 @@ data: install-config.yaml: {{ tpl $install_config $ | b64enc }} --- -{{- $domainLabels := splitList "." (tpl $cluster.install_config.baseDomain $) }} +{{- $domainLabels := splitList "." (tpl $baseDomainStr $) }} {{- $baseDomain := join "." 
(slice $domainLabels 1) }} apiVersion: hive.openshift.io/v1 kind: ClusterDeployment @@ -115,6 +122,7 @@ metadata: purpose: regionalDR vendor: OpenShift cluster.open-cluster-management.io/clusterset: {{ $clusterSet }} + clusterGroup: {{ $cluster.clusterGroup | default $clusterSet }} annotations: argocd.argoproj.io/sync-wave: "1" spec: @@ -146,7 +154,7 @@ metadata: name: {{ $cluster.name }} vendor: OpenShift cluster.open-cluster-management.io/clusterset: {{ $clusterSet }} - clusterGroup: {{ $cluster.clusterGroup }} + clusterGroup: {{ $cluster.clusterGroup | default $clusterSet }} purpose: regionalDR name: {{ $cluster.name }} annotations: @@ -167,6 +175,9 @@ spec: clusterNamespace: {{ $cluster.name }} clusterLabels: vendor: OpenShift + cluster.open-cluster-management.io/clusterset: {{ $clusterSet }} + purpose: regionalDR + clusterGroup: {{ $cluster.clusterGroup | default $clusterSet }} applicationManager: enabled: true policyController: @@ -178,4 +189,3 @@ spec: iamPolicyController: enabled: true {{- end }} -{{- end }} diff --git a/charts/hub/rdr/templates/dr_policy.yaml b/charts/hub/rdr/templates/dr_policy.yaml index e33cb09..9b4aaaf 100644 --- a/charts/hub/rdr/templates/dr_policy.yaml +++ b/charts/hub/rdr/templates/dr_policy.yaml @@ -16,8 +16,8 @@ metadata: cluster.open-cluster-management.io/backup: ramen spec: drClusters: - - {{ $clusters.primary.name }} - - {{ $clusters.secondary.name }} + - {{ include "rdr.primaryClusterName" $ }} + - {{ include "rdr.secondaryClusterName" $ }} schedulingInterval: {{ .interval }} {{- if .vmSupport }} replicationClassSelector: diff --git a/charts/hub/rdr/templates/drpc.yaml b/charts/hub/rdr/templates/drpc.yaml index 0f59902..2fef27a 100644 --- a/charts/hub/rdr/templates/drpc.yaml +++ b/charts/hub/rdr/templates/drpc.yaml @@ -1,38 +1,51 @@ +{{- $drpc := .Values.drpc | default dict }} --- apiVersion: ramendr.openshift.io/v1alpha1 kind: DRPlacementControl metadata: annotations: - drplacementcontrol.ramendr.openshift.io/app-namespace: 
openshift-dr-ops + drplacementcontrol.ramendr.openshift.io/app-namespace: {{ $drpc.namespace | default "openshift-dr-ops" }} argocd.argoproj.io/sync-wave: "9" labels: cluster.open-cluster-management.io/backup: ramen - name: gitops-vm-protection - namespace: openshift-dr-ops + name: {{ $drpc.name | default "gitops-vm-protection" }} + namespace: {{ $drpc.namespace | default "openshift-dr-ops" }} spec: drPolicyRef: apiVersion: ramendr.openshift.io/v1alpha1 kind: DRPolicy - name: 2m-vm + name: {{ index $drpc "drPolicyRef" "name" | default "2m-vm" }} kubeObjectProtection: - captureInterval: 2m0s + captureInterval: {{ index $drpc "kubeObjectProtection" "captureInterval" | default "2m0s" }} kubeObjectSelector: matchExpressions: + {{- with index $drpc "kubeObjectProtection" "kubeObjectSelector" "matchExpressions" }} + {{- toYaml . | nindent 6 }} + {{- else }} - key: drprotection operator: In values: - "true" + {{- end }} placementRef: apiVersion: cluster.open-cluster-management.io/v1beta1 kind: Placement - name: gitops-vm-protection-placement-1 - namespace: openshift-dr-ops - preferredCluster: ocp-primary + name: {{ index $drpc "placementRef" "name" | default "gitops-vm-protection-placement-1" }} + namespace: {{ index $drpc "placementRef" "namespace" | default "openshift-dr-ops" }} + preferredCluster: {{ include "rdr.preferredClusterName" . }} protectedNamespaces: + {{- with index $drpc "protectedNamespaces" }} + {{- toYaml . | nindent 2 }} + {{- else }} - gitops-vms + {{- end }} pvcSelector: matchExpressions: + {{- with index $drpc "pvcSelector" "matchExpressions" }} + {{- toYaml . 
| nindent 6 }} + {{- else }} - key: app.kubernetes.io/component operator: In values: - storage + {{- end }} \ No newline at end of file diff --git a/charts/hub/rdr/templates/job-drcluster-validation.yaml b/charts/hub/rdr/templates/job-drcluster-validation.yaml index 9da52ba..18260d3 100644 --- a/charts/hub/rdr/templates/job-drcluster-validation.yaml +++ b/charts/hub/rdr/templates/job-drcluster-validation.yaml @@ -34,9 +34,9 @@ spec: echo "Starting DRCluster validation check..." - # Configuration - PRIMARY_CLUSTER="{{ $clusters.primary.name }}" - SECONDARY_CLUSTER="{{ $clusters.secondary.name }}" + # Configuration (clusterOverrides or regionalDR) + PRIMARY_CLUSTER="{{ include "rdr.primaryClusterName" $ }}" + SECONDARY_CLUSTER="{{ include "rdr.secondaryClusterName" $ }}" DRPOLICY_NAME="{{ $name }}" MAX_ATTEMPTS=120 # 2 hours with 1 minute intervals SLEEP_INTERVAL=60 # 1 minute between checks @@ -105,10 +105,20 @@ spec: done echo "❌ DRCluster validation check failed after $MAX_ATTEMPTS attempts" - echo "Please ensure:" + echo "" + echo "If the message was 'DRClusterConfig is not applied to cluster':" + echo " Ramen has not yet applied the DR config to that managed cluster (e.g. via ManifestWork)." + echo " - Ensure the ODF/Ramen DR operator is installed on the hub and DRPolicy/DRCluster exist." + echo " - Ensure both clusters are joined (ManagedCluster) and available (hub can reach them)." + echo " - On the hub, check ManifestWorks in each cluster namespace for Ramen/DRClusterConfig:" + echo " oc get manifestwork -n $PRIMARY_CLUSTER" + echo " oc get manifestwork -n $SECONDARY_CLUSTER" + echo " - Check Ramen controller logs on the hub if ManifestWorks are missing or failing." + echo "" + echo "General checks:" echo "1. DRPolicy $DRPOLICY_NAME is created and properly configured" echo "2. Both clusters ($PRIMARY_CLUSTER and $SECONDARY_CLUSTER) are available and joined" - echo "3. DR operator is installed and running on both clusters" + echo "3. 
DR operator (Ramen/ODF) is installed and running on the hub and on both clusters" echo "4. ODF is properly configured on both clusters" echo "" echo "Current DRCluster status:" diff --git a/charts/hub/rdr/templates/job-edge-gitops-vms-deploy.yaml b/charts/hub/rdr/templates/job-edge-gitops-vms-deploy.yaml index 294ca3e..9ba9910 100644 --- a/charts/hub/rdr/templates/job-edge-gitops-vms-deploy.yaml +++ b/charts/hub/rdr/templates/job-edge-gitops-vms-deploy.yaml @@ -60,6 +60,12 @@ spec: env: - name: KUBECONFIG value: "" + - name: PRIMARY_CLUSTER + value: {{ include "rdr.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "rdr.secondaryClusterName" . | quote }} + - name: HELM_CHART_VERSION + value: {{ index (index .Values "edgeGitopsVms" | default dict) "chartVersion" | default "0.2.10" | quote }} volumes: - name: values configMap: diff --git a/charts/hub/rdr/templates/job-odf-dr-prerequisites.yaml b/charts/hub/rdr/templates/job-odf-dr-prerequisites.yaml index 37f4dfc..dab05e0 100644 --- a/charts/hub/rdr/templates/job-odf-dr-prerequisites.yaml +++ b/charts/hub/rdr/templates/job-odf-dr-prerequisites.yaml @@ -17,6 +17,11 @@ spec: containers: - name: prerequisites-checker image: registry.redhat.io/openshift4/ose-cli:latest + env: + - name: PRIMARY_CLUSTER + value: {{ include "rdr.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "rdr.secondaryClusterName" . | quote }} command: - /bin/bash - -c diff --git a/charts/hub/rdr/templates/job-submariner-prerequisites.yaml b/charts/hub/rdr/templates/job-submariner-prerequisites.yaml index 59159a0..5c3dd7a 100644 --- a/charts/hub/rdr/templates/job-submariner-prerequisites.yaml +++ b/charts/hub/rdr/templates/job-submariner-prerequisites.yaml @@ -17,6 +17,11 @@ spec: containers: - name: prerequisites-checker image: registry.redhat.io/openshift4/ose-cli:latest + env: + - name: PRIMARY_CLUSTER + value: {{ include "rdr.primaryClusterName" . 
| quote }} + - name: SECONDARY_CLUSTER + value: {{ include "rdr.secondaryClusterName" . | quote }} command: - /bin/bash - -c diff --git a/charts/hub/rdr/templates/mirrorpeer_create.yaml b/charts/hub/rdr/templates/mirrorpeer_create.yaml index 59635b7..3d01617 100644 --- a/charts/hub/rdr/templates/mirrorpeer_create.yaml +++ b/charts/hub/rdr/templates/mirrorpeer_create.yaml @@ -12,11 +12,11 @@ metadata: argocd.argoproj.io/sync-wave: "8" spec: items: - - clusterName: {{ $clusters.primary.name }} + - clusterName: {{ include "rdr.primaryClusterName" $ }} storageClusterRef: name: ocs-storagecluster namespace: openshift-storage - - clusterName: {{ $clusters.secondary.name }} + - clusterName: {{ include "rdr.secondaryClusterName" $ }} storageClusterRef: name: ocs-storagecluster namespace: openshift-storage diff --git a/charts/hub/rdr/templates/submariner_addon_install.yaml b/charts/hub/rdr/templates/submariner_addon_install.yaml index 0e25a5a..2b82b4a 100644 --- a/charts/hub/rdr/templates/submariner_addon_install.yaml +++ b/charts/hub/rdr/templates/submariner_addon_install.yaml @@ -1,18 +1,19 @@ -{{- range .Values.regionalDR }} -{{ $globalnetEnabled := .globalnetEnabled }} +{{- $dr := index .Values.regionalDR 0 }} +{{- $clusterSet := $dr.name }} +{{- $globalnetEnabled := $dr.globalnetEnabled }} --- apiVersion: v1 kind: Namespace metadata: annotations: argocd.argoproj.io/sync-wave: "3" - name: {{ .name }}-broker + name: {{ $clusterSet }}-broker --- apiVersion: submariner.io/v1alpha1 kind: Broker metadata: name: submariner-broker - namespace: {{ .name }}-broker + namespace: {{ $clusterSet }}-broker labels: cluster.open-cluster-management.io/backup: submariner annotations: @@ -21,8 +22,10 @@ spec: globalnetEnabled: {{ $globalnetEnabled | default false }} -{{- range list .clusters.primary .clusters.secondary }} -{{ $cluster := . }} +{{- $effectivePrimary := include "rdr.effectivePrimaryCluster" . 
| fromJson }} +{{- $effectiveSecondary := include "rdr.effectiveSecondaryCluster" . | fromJson }} +{{- range list $effectivePrimary $effectiveSecondary }} +{{- $cluster := . }} --- apiVersion: addon.open-cluster-management.io/v1alpha1 kind: ManagedClusterAddOn @@ -54,4 +57,3 @@ spec: name: {{ $cluster.name }}-cluster-aws-creds {{- end }} -{{- end }} diff --git a/charts/hub/rdr/values.yaml b/charts/hub/rdr/values.yaml index 9e1f5e5..67f0f5b 100644 --- a/charts/hub/rdr/values.yaml +++ b/charts/hub/rdr/values.yaml @@ -2,6 +2,53 @@ global: clusterDomain: cluster.example.com +# Edge GitOps VMs deployment (script and helm chart) +edgeGitopsVms: + chartVersion: "0.2.10" + +# Minimal overrides for cluster names, versions, regions (e.g. overrides/values-cluster-names.yaml). +# Merge override file FIRST so full regionalDR stays here and only clusterOverrides are applied. +# clusterOverrides: +# primary: +# name: ocp-primary +# version: 4.18.7 +# install_config: +# metadata: +# name: ocp-primary +# platform: +# aws: +# region: us-west-1 +# secondary: +# name: ocp-secondary +# ... 
+ +# DRPlacementControl (gitops-vm-protection) - override in values-cluster-names or values +drpc: + name: gitops-vm-protection + namespace: openshift-dr-ops + drPolicyRef: + name: 2m-vm + kubeObjectProtection: + captureInterval: 2m0s + kubeObjectSelector: + matchExpressions: + - key: drprotection + operator: In + values: + - "true" + placementRef: + name: gitops-vm-protection-placement-1 + namespace: openshift-dr-ops + protectedNamespaces: + - gitops-vms + pvcSelector: + matchExpressions: + - key: app.kubernetes.io/component + operator: In + values: + - storage + preferredCluster: ocp-primary # default; override in values-cluster-names or values + regionalDR: - name: resilient # Matches with ClusterSet globalnetEnabled: false # Support for overlapping CIDR diff --git a/overrides/values-cluster-names.yaml b/overrides/values-cluster-names.yaml new file mode 100644 index 0000000..31e1a1f --- /dev/null +++ b/overrides/values-cluster-names.yaml @@ -0,0 +1,29 @@ +--- +# Minimal override for DR cluster names, versions, and regions. +# This file is applied after chart defaults (override merged last). It only contains overlay keys +# (clusterOverrides, drpc) so chart regionalDR defaults are preserved. 
+# Also works if merged first: -f overrides/values-cluster-names.yaml -f charts/hub/rdr/values.yaml + +clusterOverrides: + primary: + name: ocp-primary + version: 4.18.7 + install_config: + metadata: + name: ocp-primary + platform: + aws: + region: us-west-1 + secondary: + name: ocp-secondary + version: 4.18.7 + install_config: + metadata: + name: ocp-secondary + platform: + aws: + region: us-east-1 + +# DRPC overrides (preferredCluster defaults to primary if unset) +drpc: + preferredCluster: ocp-primary diff --git a/overrides/values-minimal-regional-dr.yaml b/overrides/values-minimal-regional-dr.yaml new file mode 100644 index 0000000..5a572bf --- /dev/null +++ b/overrides/values-minimal-regional-dr.yaml @@ -0,0 +1,10 @@ +--- +# Simulates old values-hub when it had a minimal regionalDR (no install_config). +# Used only for testing: chart falls back to files/default-*-install-config.json. +regionalDR: + - name: resilient + clusters: + primary: + name: ocp-primary + secondary: + name: ocp-secondary diff --git a/scripts/cleanup-gitops-vms-non-primary.sh b/scripts/cleanup-gitops-vms-non-primary.sh index 9035233..43f120c 100755 --- a/scripts/cleanup-gitops-vms-non-primary.sh +++ b/scripts/cleanup-gitops-vms-non-primary.sh @@ -3,7 +3,7 @@ set -euo pipefail # Script to manually cleanup gitops-vms namespace on the non-primary cluster # This script will: -# 1. Determine the non-primary cluster (ocp-secondary by default) +# 1. Determine the non-primary cluster (discovered from DR policy; override with PRIMARY_CLUSTER/SECONDARY_CLUSTER env if needed) # 2. Render the helm template with the same chart version and values # 3. Extract resource kinds and names # 4. 
Delete them from the gitops-vms namespace diff --git a/scripts/download-kubeconfigs.sh b/scripts/download-kubeconfigs.sh index 7f03484..26ca345 100755 --- a/scripts/download-kubeconfigs.sh +++ b/scripts/download-kubeconfigs.sh @@ -120,12 +120,14 @@ show_usage() { echo "" echo "Examples:" echo " $0 # Download all managed cluster kubeconfigs" - echo " $0 -c ocp-primary # Download only ocp-primary kubeconfig" + echo " $0 -c \${PRIMARY_CLUSTER:-ocp-primary} # Download only primary cluster kubeconfig (set PRIMARY_CLUSTER to match values.yaml)" echo " $0 -o /tmp/kubeconfigs # Download to /tmp/kubeconfigs directory" echo " $0 --dry-run # Show what would be downloaded" echo "" echo "Environment variables:" echo " KUBECONFIG # Kubeconfig for hub cluster (if not using current context)" + echo " PRIMARY_CLUSTER # Primary DR cluster name (default: ocp-primary; match values.yaml)" + echo " SECONDARY_CLUSTER # Secondary DR cluster name (default: ocp-secondary; match values.yaml)" } # Main function diff --git a/scripts/extract-cluster-cas.sh b/scripts/extract-cluster-cas.sh index 94f3843..69c076b 100755 --- a/scripts/extract-cluster-cas.sh +++ b/scripts/extract-cluster-cas.sh @@ -108,18 +108,19 @@ main() { ((cluster_index++)) done - # For the specific clusters in your configuration (ocp-primary, ocp-secondary) - # These would need to be extracted when the clusters are available + # For the specific clusters in your configuration (set PRIMARY_CLUSTER/SECONDARY_CLUSTER to match values.yaml) + PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" + SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" echo "" echo "CA certificate extraction completed." 
echo "Certificates are stored in: $CA_OUTPUT_DIR" echo "" echo "To extract CAs from your specific clusters, run:" - echo " # For ocp-primary cluster:" - echo " oc --kubeconfig= get configmap -n openshift-config-managed trusted-ca-bundle -o jsonpath=\"{.data['ca-bundle\\.crt']}\" > $CA_OUTPUT_DIR/ocp-primary-ca.crt" + echo " # For $PRIMARY_CLUSTER cluster:" + echo " oc --kubeconfig= get configmap -n openshift-config-managed trusted-ca-bundle -o jsonpath=\"{.data['ca-bundle\\.crt']}\" > $CA_OUTPUT_DIR/${PRIMARY_CLUSTER}-ca.crt" echo "" - echo " # For ocp-secondary cluster:" - echo " oc --kubeconfig= get configmap -n openshift-config-managed trusted-ca-bundle -o jsonpath=\"{.data['ca-bundle\\.crt']}\" > $CA_OUTPUT_DIR/ocp-secondary-ca.crt" + echo " # For $SECONDARY_CLUSTER cluster:" + echo " oc --kubeconfig= get configmap -n openshift-config-managed trusted-ca-bundle -o jsonpath=\"{.data['ca-bundle\\.crt']}\" > $CA_OUTPUT_DIR/${SECONDARY_CLUSTER}-ca.crt" echo "" echo "Then update your values files with the CA data." } diff --git a/scripts/test-rdr-install-config.sh b/scripts/test-rdr-install-config.sh new file mode 100755 index 0000000..f294bdc --- /dev/null +++ b/scripts/test-rdr-install-config.sh @@ -0,0 +1,239 @@ +#!/usr/bin/env bash +# Test rdr chart install_config rendering in multiple merge scenarios. +# +# Scenarios: +# 1. Baseline: chart values only β†’ full install_config from chart regionalDR. +# 2. Chart + overrides/values-cluster-names.yaml β†’ overridden names/regions, full structure. +# 3. Chart + values-hub + overrides β†’ values-hub has no regionalDR, so chart regionalDR kept; overrides apply. +# 4. values-hub + overrides (no explicit chart -f) β†’ chart defaults still load; same as 3. +# 5. Minimal regionalDR + overrides β†’ simulates old values-hub with minimal regionalDR; chart uses +# files/default-*-install-config.json so install_config is still full; overrides apply. 
#
# Ensures all required fields are present (metadata, controlPlane, compute, networking, platform)
# in every scenario, and that overridden fields (metadata.name, platform.aws.region) match overrides when used.

set -e

# All paths are resolved relative to this script so it can be run from any directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
CHART="$REPO_ROOT/charts/hub/rdr"
RDR_VALUES="$CHART/values.yaml"
OVERRIDES="$REPO_ROOT/overrides/values-cluster-names.yaml"
VALUES_HUB="$REPO_ROOT/values-hub.yaml"
# Passed to helm as global.clusterDomain; override with TEST_CLUSTER_DOMAIN.
DOMAIN="${TEST_CLUSTER_DOMAIN:-example.com}"

# Minimal regionalDR (simulates old values-hub that replaced full regionalDR)
# This triggers the chart's default install_config files.
MINIMAL_REGIONAL_DR="$REPO_ROOT/overrides/values-minimal-regional-dr.yaml"

#######################################
# Render the rdr chart with extra helm arguments; helm's stderr is discarded.
# Globals:   CHART, DOMAIN (read)
# Arguments: any additional `helm template` arguments (e.g. -f values.yaml)
# Outputs:   rendered manifests on stdout
# Returns:   helm's exit status
#######################################
run_helm() {
  helm template rdr "$CHART" "$@" --set "global.clusterDomain=$DOMAIN" 2>/dev/null
}

#######################################
# Same as run_helm, but stderr is merged into stdout so the caller can keep
# helm's diagnostics together with its exit status.
# Globals:   CHART, DOMAIN (read)
# Arguments: any additional `helm template` arguments
# Outputs:   rendered manifests and helm diagnostics on stdout
# Returns:   helm's exit status
#######################################
run_helm_capture() {
  helm template rdr "$CHART" "$@" --set "global.clusterDomain=$DOMAIN" 2>&1
}

#######################################
# Extract and decode install-config.yaml from the Nth Secret (1=primary, 2=secondary).
# Arguments: $1 - rendered manifests
#            $2 - 1-based index of the "install-config.yaml:" key (default 1)
# Outputs:   decoded install-config on stdout; nothing if the key is absent
#            or its value is not valid base64
# Returns:   always 0 (decode errors are deliberately ignored)
#######################################
get_install_config() {
  local out="$1"
  local nth="${2:-1}"
  # One awk pass picks the value of the Nth matching key. Only lines that
  # carry the key itself are counted, so neighbouring lines cannot shift
  # the index.
  printf '%s\n' "$out" \
    | awk -v n="$nth" '/install-config\.yaml:/ && ++seen == n { print $2 }' \
    | base64 -d 2>/dev/null || true
}

# Validate decoded install_config has required structure (no empty compute, no null networking, etc.)
# Return 0 if the YAML text in $1 contains a line matching the basic regex in $2.
_yaml_has() {
  grep -q -- "$2" <<<"$1"
}

#######################################
# Check that a decoded install_config has every required section.
# Arguments: $1 - decoded install-config YAML
#            $2 - label used in the report lines
# Outputs:   one FAIL line per problem, or a single OK line
# Returns:   0 if all checks pass, 1 otherwise
#######################################
validate_install_config() {
  local yaml="$1"
  local label="$2"
  local err=0

  if _yaml_has "$yaml" 'compute: \[\]'; then
    echo " FAIL $label: compute is empty []"
    err=1
  fi
  if _yaml_has "$yaml" 'networking: null'; then
    echo " FAIL $label: networking is null"
    err=1
  fi
  if _yaml_has "$yaml" 'publish: null'; then
    echo " FAIL $label: publish is null"
    err=1
  fi
  # The first grep must print its matches (no -q): the second grep inspects
  # the line that follows each "platform:" key.
  if grep -E -A1 'platform:[[:space:]]*$' <<<"$yaml" | grep -q 'aws: {}'; then
    echo " FAIL $label: platform.aws is empty {}"
    err=1
  fi
  if ! _yaml_has "$yaml" 'controlPlane:'; then
    echo " FAIL $label: missing controlPlane"
    err=1
  fi
  if ! _yaml_has "$yaml" 'type: m5'; then
    echo " FAIL $label: missing machine type (m5.4xlarge or m5.metal)"
    err=1
  fi
  if ! _yaml_has "$yaml" 'metadata:'; then
    echo " FAIL $label: missing metadata"
    err=1
  fi
  if ! _yaml_has "$yaml" 'platform:'; then
    echo " FAIL $label: missing platform"
    err=1
  fi
  if ! _yaml_has "$yaml" 'region:'; then
    echo " FAIL $label: missing platform.aws.region"
    err=1
  fi
  if [[ $err -eq 0 ]]; then
    echo " OK $label: required fields present"
  fi
  return "$err"
}

# Return 0 if install_config YAML has nulled/empty sections (broken); 1 if full
is_install_config_broken() {
  local yaml="$1"
  if _yaml_has "$yaml" 'compute: \[\]'; then return 0; fi
  if _yaml_has "$yaml" 'networking: null'; then return 0; fi
  if ! _yaml_has "$yaml" 'controlPlane:'; then return 0; fi
  if ! _yaml_has "$yaml" 'type: m5'; then return 0; fi
  return 1
}

# Create minimal regionalDR override once (used for scenario 5).
# Does nothing when the file already exists.
ensure_minimal_regional_dr() {
  if [[ -f "$MINIMAL_REGIONAL_DR" ]]; then
    return 0
  fi
  cat > "$MINIMAL_REGIONAL_DR" << 'EOF'
---
# Simulates old values-hub when it had a minimal regionalDR (no install_config).
# Used only for testing: chart falls back to files/default-*-install-config.json.
regionalDR:
  - name: resilient
    clusters:
      primary:
        name: ocp-primary
      secondary:
        name: ocp-secondary
EOF
  echo "Created $MINIMAL_REGIONAL_DR for testing."
}

# Print the value of the "name:" key found on the line directly after a
# top-level "metadata:" line (first occurrence only).
_metadata_name() {
  awk '
    after_metadata && /name:/ { print $2; exit }
    { after_metadata = /^metadata:/ }
  ' <<<"$1"
}

# Print the value of the first "region:" key in the YAML text in $1.
_first_region() {
  awk '/region:/ { print $2; exit }' <<<"$1"
}

#######################################
# Render one scenario and validate both decoded install-configs.
# Globals:   primary, secondary (written: decoded install-configs, for the caller)
# Arguments: $1 - scenario label; remaining args are passed to run_helm
# Outputs:   validation report lines on stdout
# Returns:   number of failed validations (0-2)
#######################################
_check_scenario() {
  local label="$1"
  shift
  local rendered failures=0
  # A helm failure must not abort the run under `set -e`; the validations
  # below report whatever is missing from the (possibly empty) output.
  rendered=$(run_helm "$@") || echo " WARN $label: helm template exited non-zero" >&2
  primary=$(get_install_config "$rendered" 1)
  secondary=$(get_install_config "$rendered" 2)
  validate_install_config "$primary" "primary ($label)" || failures=$((failures + 1))
  validate_install_config "$secondary" "secondary ($label)" || failures=$((failures + 1))
  return "$failures"
}

# Put the chart's default install-config JSON files back if they are still
# hidden as *.bak. Safe to call more than once; always returns 0.
_restore_default_json() {
  local f
  for f in "$DEFAULT_PRIMARY" "$DEFAULT_SECONDARY"; do
    if [[ -f "${f}.bak" ]]; then
      mv -f "${f}.bak" "$f" 2>/dev/null || true
    fi
  done
  return 0
}

#######################################
# Run every rendering scenario and the default-JSON validation.
# Failures are tallied with plain arithmetic assignments: `((n++))` returns
# status 1 when n is 0, which would end the script under `set -e` on the
# very first failure.
# Globals:   DEFAULT_PRIMARY, DEFAULT_SECONDARY (written)
# Outputs:   report on stdout
# Returns:   does not return; exits 0 if everything passed, 1 otherwise
#######################################
main() {
  local total_fail=0
  local primary="" secondary=""
  local pname sname preg sreg rendered helm_ret

  echo "=== RDR install_config rendering tests (domain=$DOMAIN) ==="
  echo ""

  # Scenario 1: Baseline - chart defaults only
  echo "--- Scenario 1: Baseline (chart values only) ---"
  _check_scenario "baseline" -f "$RDR_VALUES" || total_fail=$((total_fail + $?))
  pname=$(_metadata_name "$primary")
  sname=$(_metadata_name "$secondary")
  echo " Primary metadata.name: $pname"
  echo " Secondary metadata.name: $sname"
  if [[ "$pname" != "ocp-primary" || "$sname" != "ocp-secondary" ]]; then
    echo " FAIL baseline: expected ocp-primary / ocp-secondary"
    total_fail=$((total_fail + 1))
  fi
  echo ""

  # Scenario 2: Chart + overrides (values-cluster-names)
  echo "--- Scenario 2: Chart + overrides/values-cluster-names.yaml ---"
  _check_scenario "chart+overrides" -f "$RDR_VALUES" -f "$OVERRIDES" \
    || total_fail=$((total_fail + $?))
  pname=$(_metadata_name "$primary")
  sname=$(_metadata_name "$secondary")
  preg=$(_first_region "$primary")
  sreg=$(_first_region "$secondary")
  echo " Primary metadata.name: $pname"
  echo " Primary region: $preg"
  echo " Secondary metadata.name: $sname"
  echo " Secondary region: $sreg"
  # Override file may use ocp-p/ocp-s or other names; just ensure regions are set
  if [[ -z "$preg" || -z "$sreg" ]]; then
    echo " FAIL chart+overrides: regions should be set"
    total_fail=$((total_fail + 1))
  fi
  echo ""

  # Scenario 3: Chart + values-hub (no regionalDR) + overrides
  echo "--- Scenario 3: Chart + values-hub + overrides ---"
  _check_scenario "chart+hub+overrides" -f "$RDR_VALUES" -f "$VALUES_HUB" -f "$OVERRIDES" \
    || total_fail=$((total_fail + $?))
  echo " Primary metadata.name: $(_metadata_name "$primary")"
  echo " Secondary metadata.name: $(_metadata_name "$secondary")"
  echo ""

  # Scenario 4: values-hub + overrides only (no explicit chart values file; chart defaults still load)
  echo "--- Scenario 4: values-hub + overrides (chart defaults implicit) ---"
  _check_scenario "hub+overrides" -f "$VALUES_HUB" -f "$OVERRIDES" \
    || total_fail=$((total_fail + $?))
  echo " Primary metadata.name: $(_metadata_name "$primary")"
  echo " Secondary metadata.name: $(_metadata_name "$secondary")"
  echo ""

  # Scenario 5: Minimal regionalDR (simulates old values-hub with regionalDR) + overrides -> uses default files
  ensure_minimal_regional_dr
  echo "--- Scenario 5: Minimal regionalDR + overrides (uses chart default install_config files) ---"
  _check_scenario "minimal regionalDR+overrides" -f "$MINIMAL_REGIONAL_DR" -f "$OVERRIDES" \
    || total_fail=$((total_fail + $?))
  echo " Primary metadata.name: $(_metadata_name "$primary")"
  echo " Primary region: $(_first_region "$primary")"
  echo " Secondary metadata.name: $(_metadata_name "$secondary")"
  echo " Secondary region: $(_first_region "$secondary")"
  echo ""

  # --- Validate default JSON files are required when regionalDR is minimal ---
  echo "--- Validate: default JSON files prevent nulled install_config when regionalDR is minimal ---"
  DEFAULT_PRIMARY="$REPO_ROOT/charts/hub/rdr/files/default-primary-install-config.json"
  DEFAULT_SECONDARY="$REPO_ROOT/charts/hub/rdr/files/default-secondary-install-config.json"
  if [[ ! -f "$DEFAULT_PRIMARY" || ! -f "$DEFAULT_SECONDARY" ]]; then
    echo " SKIP default JSON files not found (cannot run validation)"
  else
    ensure_minimal_regional_dr
    # Install the restore trap BEFORE hiding the files, so an abort between
    # the two mv calls cannot leave a default file stranded as *.bak.
    trap _restore_default_json EXIT
    # Temporarily hide default files so the chart cannot use them
    mv "$DEFAULT_PRIMARY" "${DEFAULT_PRIMARY}.bak"
    mv "$DEFAULT_SECONDARY" "${DEFAULT_SECONDARY}.bak"
    # `|| helm_ret=$?` records the status without tripping `set -e`; a helm
    # failure is one of the two acceptable outcomes here.
    helm_ret=0
    rendered=$(run_helm_capture -f "$MINIMAL_REGIONAL_DR" -f "$OVERRIDES") || helm_ret=$?
    # Restore files immediately so later tests or reruns work
    _restore_default_json
    trap - EXIT

    if [[ $helm_ret -ne 0 ]]; then
      echo " OK Without default JSON files: helm template fails (exit $helm_ret) as expected."
    else
      primary=$(get_install_config "$rendered" 1)
      if is_install_config_broken "$primary"; then
        echo " OK Without default JSON files: install_config has nulled/empty sections (compute: [], networking: null, or missing types) as expected."
      else
        echo " FAIL Without default JSON files: install_config was still full; default files may be redundant."
        total_fail=$((total_fail + 1))
      fi
    fi
    echo " => Default JSON files are required when regionalDR is minimal (no install_config in base)."
  fi
  echo ""

  if [[ $total_fail -gt 0 ]]; then
    echo "=== RESULT: $total_fail validation(s) failed ==="
    exit 1
  fi
  echo "=== RESULT: All scenarios passed ==="
  exit 0
}

main "$@"
diff --git a/scripts/update-ca-bundle.sh b/scripts/update-ca-bundle.sh index 9fc30b4..f0bfc3b 100755 --- a/scripts/update-ca-bundle.sh +++ b/scripts/update-ca-bundle.sh @@ -164,7 +164,7 @@ if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then echo "Examples:" echo " $0 status # Check current status" echo " $0 add /path/to/ca.crt # Add CA from file" - echo " $0 extract ocp-primary # Extract CA from managed cluster" + echo " $0 extract \${PRIMARY_CLUSTER:-ocp-primary} # Extract CA from primary managed cluster (set PRIMARY_CLUSTER to match values.yaml)" echo " $0 update-all # Update with all managed cluster CAs" exit 0 fi diff --git a/scripts/update-rdr-default-install-config-json.sh b/scripts/update-rdr-default-install-config-json.sh new file mode 100755 index 0000000..71572be --- /dev/null +++ b/scripts/update-rdr-default-install-config-json.sh @@ -0,0 +1,136 @@ +#!/usr/bin/env bash +# Update 
# charts/hub/rdr/files/default-*-install-config.json from the install_config
# sections in charts/hub/rdr/values.yaml. Run this when you change machine types,
# networking, platform, or other install_config in the rdr chart values.
#
# Requires: Python 3 with PyYAML (pip install pyyaml) or yq.

set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
RDR_CHART="$REPO_ROOT/charts/hub/rdr"
VALUES_YAML="$RDR_CHART/values.yaml"
# Static value written in place of a templated ("{{ ... }}") baseDomain.
DEFAULT_BASE_DOMAIN="cluster.example.com"
OUT_PRIMARY="$RDR_CHART/files/default-primary-install-config.json"
OUT_SECONDARY="$RDR_CHART/files/default-secondary-install-config.json"

# Print usage and exit successfully.
usage() {
  echo "Usage: $0 [--dry-run]"
  echo " Updates $OUT_PRIMARY and $OUT_SECONDARY from $VALUES_YAML"
  echo " --dry-run Print what would be written, do not overwrite files."
  exit 0
}

DRY_RUN=
for arg in "$@"; do
  case "$arg" in
    --dry-run) DRY_RUN=1 ;;
    -h|--help) usage ;;
    *)
      # Reject typos such as --dryrun instead of silently overwriting files.
      echo "Error: unknown argument: $arg (see $0 --help)" >&2
      exit 2
      ;;
  esac
done

[[ -f "$VALUES_YAML" ]] || { echo "Error: $VALUES_YAML not found." >&2; exit 1; }

# Prefer Python so we have one code path and predictable JSON formatting.
# Reads values.yaml, normalizes both install_config sections and writes (or,
# with --dry-run, prints) the two JSON files.
run_python() {
  python3 - "$VALUES_YAML" "$DEFAULT_BASE_DOMAIN" "$OUT_PRIMARY" "$OUT_SECONDARY" "$DRY_RUN" << 'PY'
import json
import sys
import yaml

def main():
    values_path = sys.argv[1]
    default_base_domain = sys.argv[2]
    out_primary = sys.argv[3]
    out_secondary = sys.argv[4]
    dry_run = sys.argv[5] == "1"

    with open(values_path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    try:
        clusters = data["regionalDR"][0]["clusters"]
        primary_ic = clusters["primary"]["install_config"]
        secondary_ic = clusters["secondary"]["install_config"]
    except (KeyError, IndexError, TypeError):
        # IndexError covers an empty regionalDR list; TypeError covers an
        # empty file (safe_load returns None) or a non-mapping level.
        sys.stderr.write("Error: could not find regionalDR[0].clusters.primary/secondary.install_config in values.yaml\n")
        sys.exit(1)

    def normalize(ic):
        # Deep copy and replace template baseDomain with static default
        out = json.loads(json.dumps(ic))
        if isinstance(out.get("baseDomain"), str) and "{{" in out["baseDomain"]:
            out["baseDomain"] = default_base_domain
        return out

    primary = normalize(primary_ic)
    secondary = normalize(secondary_ic)

    opts = {"indent": 2, "sort_keys": False}
    primary_json = json.dumps(primary, **opts)
    secondary_json = json.dumps(secondary, **opts)

    if dry_run:
        print("--- primary (would write to", out_primary, ") ---")
        print(primary_json)
        print("--- secondary (would write to", out_secondary, ") ---")
        print(secondary_json)
        return

    with open(out_primary, "w", encoding="utf-8") as f:
        f.write(primary_json)
        f.write("\n")
    with open(out_secondary, "w", encoding="utf-8") as f:
        f.write(secondary_json)
        f.write("\n")
    print("Wrote", out_primary)
    print("Wrote", out_secondary)

if __name__ == "__main__":
    main()
PY
}

#######################################
# yq fallback: extract one cluster's install_config as JSON.
# Globals:   VALUES_YAML, DEFAULT_BASE_DOMAIN, DRY_RUN (read)
# Arguments: $1 - cluster key under regionalDR[0].clusters (primary|secondary)
#            $2 - output JSON file
# Outputs:   the JSON (dry-run) or a "Wrote" line on stdout; errors on stderr
# Returns:   0 on success; exits 1 if extraction fails
#######################################
extract_one() {
  local path="$1"
  local out="$2"
  local tmp
  tmp=$(mktemp)
  # Embed the shell-quoted names in the trap text so cleanup does not depend
  # on these locals still being in scope when the trap runs.
  trap "$(printf 'rm -f -- %q %q' "$tmp" "${tmp}.2")" EXIT

  # yq v4 syntax first, then the yq v3 `r -j` form.
  if ! yq eval ".regionalDR[0].clusters.${path}.install_config" "$VALUES_YAML" -o=json > "$tmp" 2>/dev/null \
    && ! yq r -j "$VALUES_YAML" "regionalDR.0.clusters.$path.install_config" > "$tmp" 2>/dev/null; then
    echo "Error: yq could not extract install_config. Try: pip install pyyaml && $0" >&2
    exit 1
  fi
  # A missing path can come back as the literal `null` with a zero exit
  # status; never write that out as a default install-config.
  if [[ ! -s "$tmp" || "$(tr -d '[:space:]' < "$tmp")" == "null" ]]; then
    echo "Error: could not find regionalDR[0].clusters.$path.install_config in values.yaml" >&2
    exit 1
  fi

  # Replace template baseDomain only (same rule as the Python path): a
  # literal baseDomain in values.yaml is kept as-is.
  if command -v jq &>/dev/null; then
    if jq --arg dom "$DEFAULT_BASE_DOMAIN" \
      'if (.baseDomain | type) == "string" and (.baseDomain | contains("{{")) then .baseDomain = $dom else . end' \
      "$tmp" > "${tmp}.2"; then
      mv "${tmp}.2" "$tmp"
    else
      echo "Error: jq failed to process install_config for $path" >&2
      exit 1
    fi
  else
    # Best effort without jq; `sed -i` without a suffix is GNU syntax, so a
    # failure here is tolerated rather than fatal.
    sed -i "s/\"{{ join.*}}\"/\"$DEFAULT_BASE_DOMAIN\"/" "$tmp" 2>/dev/null || true
  fi

  if [[ -n "$DRY_RUN" ]]; then
    echo "--- $out (dry-run) ---"
    cat "$tmp"
  else
    cp "$tmp" "$out"
    echo "Wrote $out"
  fi
  rm -f -- "$tmp" "${tmp}.2"
  trap - EXIT
}

if command -v python3 &>/dev/null; then
  if python3 -c "import yaml" 2>/dev/null; then
    run_python
    exit 0
  fi
fi

# Fallback: yq (if available)
if command -v yq &>/dev/null; then
  echo "Using yq (Python/PyYAML not available)."
  extract_one "primary" "$OUT_PRIMARY"
  extract_one "secondary" "$OUT_SECONDARY"
  exit 0
fi

echo "Error: Need Python 3 with PyYAML (pip install pyyaml) or yq to run this script." >&2
exit 1
diff --git a/values-hub.yaml b/values-hub.yaml index 8775dce..c06fbf5 100644 --- a/values-hub.yaml +++ b/values-hub.yaml @@ -12,12 +12,11 @@ clusterGroup: - openshift-dr-ops - openshift-storage - policies - # - regional-dr-trigger subscriptions: acm: name: advanced-cluster-management namespace: open-cluster-management - channel: release-2.13 + channel: release-2.14 odf-multicluster-orchestrator: name: odf-multicluster-orchestrator @@ -97,6 +96,8 @@ clusterGroup: - PrunePropagationPolicy=foreground - RespectIgnoreDifferences=true - ServerSideApply=true + extraValueFiles: + - '/overrides/values-cluster-names.yaml' ignoreDifferences: - group: operators.openshift.io kind: Console @@ -121,6 +122,8 @@ clusterGroup: maxDuration: 3m syncOptions: - RespectIgnoreDifferences=true + extraValueFiles: + - '/overrides/values-cluster-names.yaml' ignoreDifferences: # Prevent ArgoCD from pruning dynamically created OCM resources # These resources are created by OCM operators and should never be pruned