diff --git a/Changes.md b/Changes.md index cc629ec..8c2f5f5 100644 --- a/Changes.md +++ b/Changes.md @@ -4,3 +4,10 @@ v1.0 - November 2025 * Arrange to default baseDomain settings appropriately so that forking the pattern is not a hard requirement * Initial release + +v1.0 - February 2026 + +* The names ocp-primary and ocp-secondary were hardcoded in various places, which caused issues when trying +to install two copies of this pattern into the same DNS domain. +* Also parameterize the version of edge-gitops-vms chart in case it needs to get updated. It too was hardcoded. +* Update to ACM 2.14 in prep for OCP 4.20+ testing. diff --git a/README.md b/README.md index 0242fc0..758eec0 100644 --- a/README.md +++ b/README.md @@ -6,5 +6,5 @@ ## Start Here If you've followed a link to this repository, but are not really sure what it contains -or how to use it, head over to [Ansible Edge GitOps](https://validatedpatterns.io/patterns/ansible-edge-gitops/) +or how to use it, head over to [RamenDR Starter Kit](https://validatedpatterns.io/patterns/ramendr-starter-kit/) for additional context and installation instructions diff --git a/charts/hub/opp/scripts/argocd-health-monitor-cron.sh b/charts/hub/opp/scripts/argocd-health-monitor-cron.sh index 7ba0ef8..a069570 100755 --- a/charts/hub/opp/scripts/argocd-health-monitor-cron.sh +++ b/charts/hub/opp/scripts/argocd-health-monitor-cron.sh @@ -1,12 +1,26 @@ #!/bin/bash +# ArgoCD health monitor - CronJob (runs every 15 min). +# Why two scripts? The Job (argocd-health-monitor.sh) runs once at deploy, retries until both clusters are +# healthy then exits. This CronJob runs periodically to detect and remediate wedged clusters after deploy. +# Both use the same remediation: force-sync Namespace ramendr-starter-kit-resilient in Application ramendr-starter-kit-resilient. set -euo pipefail echo "Starting ArgoCD health monitoring and remediation..." 
+# Primary and secondary managed cluster names (from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Configuration MAX_ATTEMPTS=270 # Check 270 times (90 minutes with 20s intervals) before failing SLEEP_INTERVAL=20 ARGOCD_NAMESPACE="openshift-gitops" +# Namespace where the Application to force-sync lives (parameterized; default openshift-gitops) +FORCE_SYNC_APP_NAMESPACE="${FORCE_SYNC_APP_NAMESPACE:-openshift-gitops}" +# Application and specific resource to force-sync when remediating (Namespace ramendr-starter-kit-resilient in Application ramendr-starter-kit-resilient) +FORCE_SYNC_APP_NAME="${FORCE_SYNC_APP_NAME:-ramendr-starter-kit-resilient}" +FORCE_SYNC_RESOURCE_KIND="${FORCE_SYNC_RESOURCE_KIND:-Namespace}" +FORCE_SYNC_RESOURCE_NAME="${FORCE_SYNC_RESOURCE_NAME:-ramendr-starter-kit-resilient}" HEALTH_CHECK_TIMEOUT=30 # Function to check if a cluster is wedged @@ -26,11 +40,11 @@ check_cluster_wedged() { local cluster_argocd_namespace="" local cluster_argocd_instance="" case "$cluster" in - "ocp-primary") + "$PRIMARY_CLUSTER") cluster_argocd_namespace="ramendr-starter-kit-resilient" cluster_argocd_instance="resilient-gitops-server" ;; - "ocp-secondary") + "$SECONDARY_CLUSTER") cluster_argocd_namespace="ramendr-starter-kit-resilient" cluster_argocd_instance="resilient-gitops-server" ;; @@ -96,81 +110,40 @@ check_cluster_wedged() { fi } -# Function to remediate a wedged cluster +# Function to remediate a wedged cluster (force sync a known resource instead of restarting Argo CD) remediate_wedged_cluster() { local cluster="$1" local kubeconfig="$2" - echo "🔧 Remediating wedged cluster: $cluster" - - # Stop all ArgoCD instances by scaling down deployments - echo " Stopping all ArgoCD instances on $cluster..."
- oc --kubeconfig="$kubeconfig" scale deployment --all -n "$ARGOCD_NAMESPACE" --replicas=0 &>/dev/null || true - oc --kubeconfig="$kubeconfig" scale statefulset --all -n "$ARGOCD_NAMESPACE" --replicas=0 &>/dev/null || true - - # If scaling doesn't work, try more aggressive cleanup - echo " Attempting aggressive cleanup for stuck deployments..." - oc --kubeconfig="$kubeconfig" delete deployment --all -n "$ARGOCD_NAMESPACE" --grace-period=0 --force &>/dev/null || true - oc --kubeconfig="$kubeconfig" delete statefulset --all -n "$ARGOCD_NAMESPACE" --grace-period=0 --force &>/dev/null || true - oc --kubeconfig="$kubeconfig" delete pods --all -n "$ARGOCD_NAMESPACE" --grace-period=0 --force &>/dev/null || true - - # Wait for all instances to stop - echo " Waiting for ArgoCD instances to stop..." - local attempt=1 - while [[ $attempt -le 30 ]]; do - local running_pods=$(oc --kubeconfig="$kubeconfig" get pods -n "$ARGOCD_NAMESPACE" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) - if [[ $running_pods -eq 0 ]]; then - echo " ✅ All ArgoCD instances stopped on $cluster" - break - fi - echo " Waiting for instances to stop... (attempt $attempt/30)" - sleep 5 - ((attempt++)) - done + echo "🔧 Remediating wedged cluster: $cluster (forcibly resyncing resource $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME in Application $FORCE_SYNC_APP_NAME)" - # Restart all ArgoCD instances by scaling up deployments - echo " Restarting all ArgoCD instances on $cluster..." - oc --kubeconfig="$kubeconfig" scale deployment --all -n "$ARGOCD_NAMESPACE" --replicas=1 &>/dev/null || true - oc --kubeconfig="$kubeconfig" scale statefulset --all -n "$ARGOCD_NAMESPACE" --replicas=1 &>/dev/null || true - - # Wait for pods to restart - echo " Waiting for ArgoCD pods to restart on $cluster..."
- local attempt=1 - while [[ $attempt -le 20 ]]; do - local running_pods=$(oc --kubeconfig="$kubeconfig" get pods -n "$ARGOCD_NAMESPACE" --field-selector=status.phase=Running --no-headers 2>/dev/null | wc -l) - local total_pods=$(oc --kubeconfig="$kubeconfig" get pods -n "$ARGOCD_NAMESPACE" --no-headers 2>/dev/null | wc -l) - - if [[ $running_pods -gt 0 && $running_pods -eq $total_pods ]]; then - echo " ✅ ArgoCD pods restarted successfully on $cluster" - break - fi - - echo " Waiting for pods to restart... (attempt $attempt/20)" - sleep 10 - ((attempt++)) - done - - if [[ $attempt -gt 20 ]]; then - echo " ⚠️ ArgoCD pods may not have fully restarted on $cluster" + # Forcibly resync the specific resource (e.g. Namespace ramendr-starter-kit-resilient) in the Application (no Argo CD restart) + if oc --kubeconfig="$kubeconfig" get application "$FORCE_SYNC_APP_NAME" -n "$FORCE_SYNC_APP_NAMESPACE" &>/dev/null; then + echo " Force syncing resource $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME in Application $FORCE_SYNC_APP_NAME (namespace $FORCE_SYNC_APP_NAMESPACE) on $cluster..." + oc --kubeconfig="$kubeconfig" patch application "$FORCE_SYNC_APP_NAME" -n "$FORCE_SYNC_APP_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"$FORCE_SYNC_RESOURCE_KIND\",\"name\":\"$FORCE_SYNC_RESOURCE_NAME\"}],\"syncOptions\":[\"Force=true\"]}}}" &>/dev/null || true + echo " ✅ Triggered force sync for $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME" + else + echo " ⚠️ Application $FORCE_SYNC_APP_NAME not found in $FORCE_SYNC_APP_NAMESPACE on $cluster - cannot force sync" fi - # Trigger ArgoCD refresh/sync + # Trigger ArgoCD refresh/sync (argocd CLI needs --server when run inside the pod) echo " Triggering ArgoCD refresh on $cluster..."
local server_pod=$(oc --kubeconfig="$kubeconfig" get pods -n "$ARGOCD_NAMESPACE" -l app.kubernetes.io/name=openshift-gitops-server --field-selector=status.phase=Running -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + local argocd_server="localhost:8080" if [[ -n "$server_pod" ]]; then # Trigger refresh of all applications - oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app list -o name | while read app; do + oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app list --server "$argocd_server" -o name 2>/dev/null | while read app; do if [[ -n "$app" ]]; then echo " Refreshing $app..." - oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app get "$app" --refresh &>/dev/null || true + oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app get "$app" --server "$argocd_server" --refresh &>/dev/null || true fi done - + # Trigger hard refresh - oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app list -o name | while read app; do + oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app list --server "$argocd_server" -o name 2>/dev/null | while read app; do if [[ -n "$app" ]]; then echo " Hard refreshing $app..." - oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app get "$app" --hard-refresh &>/dev/null || true + oc --kubeconfig="$kubeconfig" exec -n "$ARGOCD_NAMESPACE" "$server_pod" -- argocd app get "$app" --server "$argocd_server" --hard-refresh &>/dev/null || true fi done fi diff --git a/charts/hub/opp/scripts/argocd-health-monitor.sh b/charts/hub/opp/scripts/argocd-health-monitor.sh index 9c6b9bf..4f3ea48 100755 --- a/charts/hub/opp/scripts/argocd-health-monitor.sh +++ b/charts/hub/opp/scripts/argocd-health-monitor.sh @@ -1,12 +1,26 @@ #!/bin/bash +# ArgoCD health monitor - Job (one-shot, long-running). +# Why two scripts? 
This Job runs once at deploy time (sync-wave 0), retries for up to ~90 min until both +# primary and secondary Argo CD instances are healthy, then exits. The CronJob (argocd-health-monitor-cron.sh) +# runs every 15 min to catch wedged clusters after deploy. Both use the same remediation: force-sync the +# specific resource (Namespace ramendr-starter-kit-resilient) in Application ramendr-starter-kit-resilient. set -euo pipefail echo "Starting ArgoCD health monitoring and remediation..." +# Primary and secondary managed cluster names (from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Configuration MAX_ATTEMPTS=180 # Check 180 times (90 minutes with 30s intervals) before failing SLEEP_INTERVAL=30 ARGOCD_NAMESPACE="openshift-gitops" +# Same as cron: force-sync this specific resource in this Application when remediating (parameterized) +FORCE_SYNC_APP_NAMESPACE="${FORCE_SYNC_APP_NAMESPACE:-openshift-gitops}" +FORCE_SYNC_APP_NAME="${FORCE_SYNC_APP_NAME:-ramendr-starter-kit-resilient}" +FORCE_SYNC_RESOURCE_KIND="${FORCE_SYNC_RESOURCE_KIND:-Namespace}" +FORCE_SYNC_RESOURCE_NAME="${FORCE_SYNC_RESOURCE_NAME:-ramendr-starter-kit-resilient}" HEALTH_CHECK_TIMEOUT=60 # Function to check if a cluster is wedged @@ -26,11 +40,11 @@ check_cluster_wedged() { local cluster_argocd_namespace="" local cluster_argocd_instance="" case "$cluster" in - "ocp-primary") + "$PRIMARY_CLUSTER") cluster_argocd_namespace="ramendr-starter-kit-resilient" cluster_argocd_instance="resilient-gitops-server" ;; - "ocp-secondary") + "$SECONDARY_CLUSTER") cluster_argocd_namespace="ramendr-starter-kit-resilient" cluster_argocd_instance="resilient-gitops-server" ;; @@ -61,7 +75,12 @@ check_cluster_wedged() { return 0 fi fi - + + # For primary/secondary we require the cluster-specific Argo CD instance; missing = not healthy (job must not succeed) + if [[ "$cluster" == "$PRIMARY_CLUSTER" || "$cluster" == "$SECONDARY_CLUSTER" 
]]; then + echo "❌ Required Argo CD instance ($cluster_argocd_instance in $cluster_argocd_namespace) not found on $cluster - job will retry or fail" + return 0 + fi echo "✅ $cluster appears healthy (no ArgoCD instances installed yet)" return 1 fi @@ -135,7 +154,12 @@ check_cluster_wedged() { fi fi fi - + + # For primary/secondary we require the Argo CD instance to be running; missing = not healthy (job must not succeed) + if [[ "$cluster" == "$PRIMARY_CLUSTER" || "$cluster" == "$SECONDARY_CLUSTER" ]]; then + echo "❌ Required Argo CD instance ($cluster_argocd_instance) not running in $cluster_argocd_namespace on $cluster - job will retry or fail" + return 0 + fi echo "✅ $cluster appears healthy (no ArgoCD instances running yet)" return 1 elif [[ $cluster_argocd_pods -eq 1 ]]; then @@ -170,240 +194,20 @@ check_cluster_wedged() { fi } -# Function to remediate a wedged cluster using ArgoCD sync mechanisms +# Function to remediate a wedged cluster (force sync the specific resource in the specific Application, same as cron) remediate_wedged_cluster() { local cluster="$1" local kubeconfig="$2" - echo "🔧 Remediating wedged cluster: $cluster using ArgoCD sync mechanisms" - echo " 🎯 Focus: Force sync namespace policies and specific resources in stuck applications" + echo "🔧 Remediating wedged cluster: $cluster (forcibly resyncing resource $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME in Application $FORCE_SYNC_APP_NAME)" - # STEP 1: Find and sync namespace policies - echo " 🔍 STEP 1: Finding and syncing namespace policies..."
- - # Get all ArgoCD applications - local applications=$(oc --kubeconfig="$kubeconfig" get applications -n "$ARGOCD_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || echo "") - - if [[ -n "$applications" ]]; then - echo " Found ArgoCD applications: $applications" - - for app in $applications; do - echo " 🔄 Processing application: $app" - - # Get application status - local app_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local app_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Application $app status: sync=$app_status, health=$app_health" - - # If application is out of sync or unhealthy, force sync it - if [[ "$app_status" != "Synced" || "$app_health" != "Healthy" ]]; then - echo " 🔄 Application $app is not in sync - forcing sync..." - - # Force sync the application - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"sync":{"syncOptions":["CreateNamespace=true","PrunePropagationPolicy=foreground","PruneLast=true"]}}}' &>/dev/null || true - - # Wait a moment for sync to start - sleep 5 - - # Check if there are specific resources that need to be synced - echo " 🔍 Checking for specific resources that need sync in $app..."
- - # Get resources that are out of sync - local out_of_sync_resources=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.resources[?(@.status=="OutOfSync")].name}' 2>/dev/null || echo "") - - if [[ -n "$out_of_sync_resources" ]]; then - echo " 📋 Found out-of-sync resources: $out_of_sync_resources" - - # Force sync specific resources - for resource in $out_of_sync_resources; do - echo " 🔄 Force syncing resource: $resource" - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"$(echo $resource | cut -d'/' -f1)\",\"name\":\"$(echo $resource | cut -d'/' -f2)\"}]}}}" &>/dev/null || true - done - fi - - # Check for namespace policies specifically - echo " 🔍 Looking for namespace policies in $app..." - local namespace_policies=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.resources[?(@.kind=="Policy")].name}' 2>/dev/null || echo "") - - if [[ -n "$namespace_policies" ]]; then - echo " 📋 Found namespace policies: $namespace_policies" - - # Force sync namespace policies - for policy in $namespace_policies; do - echo " 🔄 Force syncing namespace policy: $policy" - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"Policy\",\"name\":\"$policy\"}]}}}" &>/dev/null || true - done - fi - else - echo " ✅ Application $app is already in sync and healthy" - fi - done + if oc --kubeconfig="$kubeconfig" get application "$FORCE_SYNC_APP_NAME" -n "$FORCE_SYNC_APP_NAMESPACE" &>/dev/null; then + echo " Force syncing resource $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME in Application $FORCE_SYNC_APP_NAME (namespace $FORCE_SYNC_APP_NAMESPACE) on $cluster..."
+ oc --kubeconfig="$kubeconfig" patch application "$FORCE_SYNC_APP_NAME" -n "$FORCE_SYNC_APP_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"$FORCE_SYNC_RESOURCE_KIND\",\"name\":\"$FORCE_SYNC_RESOURCE_NAME\"}],\"syncOptions\":[\"Force=true\"]}}}" &>/dev/null || true + echo " ✅ Triggered force sync for $FORCE_SYNC_RESOURCE_KIND/$FORCE_SYNC_RESOURCE_NAME" else - echo " ⚠️ No ArgoCD applications found in $ARGOCD_NAMESPACE namespace" + echo " ⚠️ Application $FORCE_SYNC_APP_NAME not found in $FORCE_SYNC_APP_NAMESPACE on $cluster - cannot force sync" fi - - # STEP 2: Force refresh and hard refresh of applications - echo " 🔄 STEP 2: Force refreshing applications..." - - for app in $applications; do - echo " 🔄 Force refreshing application: $app" - - # Force refresh the application - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"initiatedBy":{"username":"argocd-health-monitor"},"info":[{"name":"refresh","value":"hard"}]}}' &>/dev/null || true - - # Wait for refresh to complete - sleep 10 - - # Check application status after refresh - local app_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local app_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Application $app status after refresh: sync=$app_status, health=$app_health" - done - - # STEP 3: Check for stuck applications and force sync them - echo " 🔍 STEP 3: Checking for stuck applications..."
- - for app in $applications; do - local app_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local app_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - if [[ "$app_status" != "Synced" || "$app_health" != "Healthy" ]]; then - echo " 🔄 Application $app is still stuck - attempting final sync..." - - # Final attempt to sync the application - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"sync":{"syncOptions":["CreateNamespace=true","PrunePropagationPolicy=foreground","PruneLast=true","Force=true"]}}}' &>/dev/null || true - - # Wait for sync to complete - sleep 15 - - # Check final status - local final_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local final_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Final status for $app: sync=$final_status, health=$final_health" - fi - done - - echo " ✅ ArgoCD sync-based remediation completed for $cluster" - echo " 🎯 Remediated: Used ArgoCD sync mechanisms to force sync stuck applications and namespace policies" - echo " ⚠️ Note: This approach preserves existing resources while forcing proper synchronization" -} - -# Function to apply aggressive ArgoCD sync for wedged openshift-gitops namespace -apply_aggressive_argocd_sync() { - local cluster="$1" - local kubeconfig="$2" - - echo "🔄🔄🔄 APPLYING AGGRESSIVE ARGOCD SYNC TO OPENSHIFT-GITOPS NAMESPACE 🔄🔄🔄" - echo " 🎯 Target: $ARGOCD_NAMESPACE namespace on $cluster" - echo " ⚠️ This will force sync all applications and namespace policies using ArgoCD mechanisms" - - #
Check if openshift-gitops namespace exists - if ! oc --kubeconfig="$kubeconfig" get namespace "$ARGOCD_NAMESPACE" &>/dev/null; then - echo " ✅ $ARGOCD_NAMESPACE namespace does not exist - nothing to sync" - return 0 - fi - - echo " 🔍 $ARGOCD_NAMESPACE namespace exists - proceeding with aggressive ArgoCD sync" - - # STEP 1: Get all ArgoCD applications - echo " 🔍 STEP 1: Finding all ArgoCD applications..." - local applications=$(oc --kubeconfig="$kubeconfig" get applications -n "$ARGOCD_NAMESPACE" -o jsonpath='{.items[*].metadata.name}' 2>/dev/null || echo "") - - if [[ -z "$applications" ]]; then - echo " ⚠️ No ArgoCD applications found in $ARGOCD_NAMESPACE namespace" - return 0 - fi - - echo " Found ArgoCD applications: $applications" - - # STEP 2: Force sync all applications with aggressive options - echo " 🔄 STEP 2: Force syncing all applications with aggressive options..." - - for app in $applications; do - echo " 🔄 Aggressively syncing application: $app" - - # Get current application status - local app_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local app_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Application $app current status: sync=$app_status, health=$app_health" - - # Force sync with aggressive options - echo " 🔄 Force syncing $app with aggressive options..." - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"sync":{"syncOptions":["CreateNamespace=true","PrunePropagationPolicy=foreground","PruneLast=true","Force=true","Replace=true"]}}}' &>/dev/null || true - - # Wait for sync to start - sleep 5 - - # Check for specific resources that need aggressive sync - echo " 🔍 Checking for specific resources that need aggressive sync in $app..."
- - # Get all resources in the application - local all_resources=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.resources[*].name}' 2>/dev/null || echo "") - - if [[ -n "$all_resources" ]]; then - echo " 📋 Found resources in $app: $all_resources" - - # Force sync each resource individually - for resource in $all_resources; do - echo " 🔄 Force syncing resource: $resource" - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"$(echo $resource | cut -d'/' -f1)\",\"name\":\"$(echo $resource | cut -d'/' -f2)\"}],\"syncOptions\":[\"Force=true\",\"Replace=true\"]}}}" &>/dev/null || true - done - fi - - # Check for namespace policies specifically - echo " 🔍 Looking for namespace policies in $app..." - local namespace_policies=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.resources[?(@.kind=="Policy")].name}' 2>/dev/null || echo "") - - if [[ -n "$namespace_policies" ]]; then - echo " 📋 Found namespace policies: $namespace_policies" - - # Force sync namespace policies with aggressive options - for policy in $namespace_policies; do - echo " 🔄 Aggressively syncing namespace policy: $policy" - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p="{\"operation\":{\"sync\":{\"resources\":[{\"kind\":\"Policy\",\"name\":\"$policy\"}],\"syncOptions\":[\"Force=true\",\"Replace=true\",\"PrunePropagationPolicy=foreground\"]}}}" &>/dev/null || true - done - fi - done - - # STEP 3: Force refresh all applications - echo " 🔄 STEP 3: Force refreshing all applications..."
- - for app in $applications; do - echo " 🔄 Force refreshing application: $app" - - # Force hard refresh - oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"initiatedBy":{"username":"argocd-health-monitor"},"info":[{"name":"refresh","value":"hard"}]}}' &>/dev/null || true - - # Wait for refresh to complete - sleep 10 - done - - # STEP 4: Final verification and sync - echo " 🔍 STEP 4: Final verification and sync..." - - for app in $applications; do - echo " 🔍 Final check for application: $app" - - # Get final status - local final_status=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.sync.status}' 2>/dev/null || echo "Unknown") - local final_health=$(oc --kubeconfig="$kubeconfig" get application "$app" -n "$ARGOCD_NAMESPACE" -o jsonpath='{.status.health.status}' 2>/dev/null || echo "Unknown") - - echo " Final status for $app: sync=$final_status, health=$final_health" - - # If still not healthy, try one more aggressive sync - if [[ "$final_status" != "Synced" || "$final_health" != "Healthy" ]]; then - echo " 🔄 Application $app still not healthy - attempting final aggressive sync..."
- oc --kubeconfig="$kubeconfig" patch application "$app" -n "$ARGOCD_NAMESPACE" --type=merge -p='{"operation":{"sync":{"syncOptions":["CreateNamespace=true","PrunePropagationPolicy=foreground","PruneLast=true","Force=true","Replace=true","Prune=true"]}}}' &>/dev/null || true - fi - done - - echo " 🔄🔄🔄 AGGRESSIVE ARGOCD SYNC COMPLETED FOR $ARGOCD_NAMESPACE NAMESPACE 🔄🔄🔄" - echo " 🎯 Result: Used ArgoCD sync mechanisms to force sync all applications and namespace policies" - echo " ⚠️ Note: This approach preserves existing resources while forcing proper synchronization" } # Function to download kubeconfig for a cluster (using same logic as download-kubeconfigs.sh) @@ -618,22 +422,14 @@ while [[ $attempt -le $MAX_ATTEMPTS ]]; do fi fi - # Remediate wedged clusters + # Remediate wedged clusters (same targeted force-sync for all: Namespace in Application ramendr-starter-kit-resilient) if [[ ${#wedged_clusters[@]} -gt 0 ]]; then echo "Found wedged clusters: ${wedged_clusters[*]}" for cluster in "${wedged_clusters[@]}"; do kubeconfig_path="/tmp/${cluster}-kubeconfig.yaml" - - # Apply aggressive ArgoCD sync specifically for ocp-secondary if it's wedged - if [[ "$cluster" == "ocp-secondary" ]]; then - echo "🔄🔄🔄 APPLYING AGGRESSIVE ARGOCD SYNC TO WEDGED OCP-SECONDARY 🔄🔄🔄" - echo " 🎯 Target: Force sync all applications and namespace policies on ocp-secondary" - apply_aggressive_argocd_sync "$cluster" "$kubeconfig_path" - else - echo "🔧 Applying standard ArgoCD sync remediation to wedged cluster: $cluster" - remediate_wedged_cluster "$cluster" "$kubeconfig_path" - fi + echo "🔧 Applying remediation to wedged cluster: $cluster" + remediate_wedged_cluster "$cluster" "$kubeconfig_path" done echo "✅ Remediation completed for wedged clusters" @@ -647,5 +443,7 @@ while [[ $attempt -le $MAX_ATTEMPTS ]]; do fi done -echo "🎉 ArgoCD health monitoring completed" - +# Exited loop by exhausting attempts (did not exit 0 from "all healthy") +echo
"❌ ArgoCD health monitoring did not complete successfully within $MAX_ATTEMPTS attempts" +echo " One or both required Argo CD instances (on $PRIMARY_CLUSTER and $SECONDARY_CLUSTER) were not running correctly." +exit 1 diff --git a/charts/hub/opp/scripts/odf-ssl-certificate-extraction.sh b/charts/hub/opp/scripts/odf-ssl-certificate-extraction.sh index 7225d0e..d43ea93 100755 --- a/charts/hub/opp/scripts/odf-ssl-certificate-extraction.sh +++ b/charts/hub/opp/scripts/odf-ssl-certificate-extraction.sh @@ -212,9 +212,13 @@ else echo " Found managed clusters: $MANAGED_CLUSTERS" fi +# Primary and secondary managed cluster names (from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Extract CA from each managed cluster CA_FILES=() -REQUIRED_CLUSTERS=("hub" "ocp-primary" "ocp-secondary") +REQUIRED_CLUSTERS=("hub" "$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") EXTRACTED_CLUSTERS=() # Track hub cluster CA extraction @@ -291,8 +295,8 @@ if [[ ${#MISSING_CLUSTERS[@]} -gt 0 ]]; then echo "" echo "The ODF SSL certificate extractor job requires CA material from ALL three clusters:" echo " - hub (hub cluster)" - echo " - ocp-primary (primary managed cluster)" - echo " - ocp-secondary (secondary managed cluster)" + echo " - $PRIMARY_CLUSTER (primary managed cluster)" + echo " - $SECONDARY_CLUSTER (secondary managed cluster)" echo "" echo "Without CA material from all clusters, the DR setup will fail." echo "Please ensure all clusters are accessible and have proper kubeconfigs."
@@ -450,14 +454,17 @@ if oc get configmap ramen-hub-operator-config -n openshift-operators &>/dev/null # Get existing ramen_manager_config.yaml content EXISTING_YAML=$(oc get configmap ramen-hub-operator-config -n openshift-operators -o jsonpath='{.data.ramen_manager_config\.yaml}' 2>/dev/null || echo "") - # CRITICAL: Verify at least 2 S3profiles exist before attempting update + # Patch existing s3StoreProfiles only: add/update caCertificates on each existing profile. + # We do NOT create new profiles or delete/overwrite profile names. At least 2 existing profiles required. MIN_REQUIRED_PROFILES=2 if [[ -n "$EXISTING_YAML" ]]; then - # Use yq to properly parse YAML and count profiles if command -v yq &>/dev/null; then - EXISTING_PROFILE_COUNT=$(echo "$EXISTING_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") + COUNT_KOP=$(echo "$EXISTING_YAML" | yq eval '.kubeObjectProtection.s3StoreProfiles | length' 2>/dev/null || echo "0") + COUNT_TOP=$(echo "$EXISTING_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") + COUNT_KOP=$((10#${COUNT_KOP:-0})) + COUNT_TOP=$((10#${COUNT_TOP:-0})) + EXISTING_PROFILE_COUNT=$(( COUNT_KOP >= COUNT_TOP ? COUNT_KOP : COUNT_TOP )) else - # Fallback to grep if yq is not available EXISTING_PROFILE_COUNT=$(echo "$EXISTING_YAML" | grep -c "s3ProfileName:" 2>/dev/null || echo "0") if [[ $EXISTING_PROFILE_COUNT -eq 0 ]]; then EXISTING_PROFILE_COUNT=$(echo "$EXISTING_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") @@ -472,252 +479,106 @@ if oc get configmap ramen-hub-operator-config -n openshift-operators &>/dev/null echo " Current YAML content (first 50 lines):" echo "$EXISTING_YAML" | head -n 50 echo "" - echo " The certificate extractor requires at least $MIN_REQUIRED_PROFILES S3profiles to add CA certificates." - echo " Please ensure the ramen-hub-operator-config ConfigMap has at least $MIN_REQUIRED_PROFILES s3StoreProfiles configured." 
+ echo " The certificate extractor only patches existing s3StoreProfiles with caCertificates." + echo " Please ensure ramen-hub-operator-config has at least $MIN_REQUIRED_PROFILES s3StoreProfiles configured." handle_error "Insufficient s3StoreProfiles found: found $EXISTING_PROFILE_COUNT profile(s), but at least $MIN_REQUIRED_PROFILES are required" else - echo " βœ… Found $EXISTING_PROFILE_COUNT s3StoreProfiles (minimum required: $MIN_REQUIRED_PROFILES)" + echo " βœ… Found $EXISTING_PROFILE_COUNT s3StoreProfiles (will patch caCertificates into existing profiles only)" fi fi - - # Create updated YAML with caCertificates in each s3StoreProfiles item + + # Patch existing profiles with caCertificates using yq only (env var avoids embedding base64 in expression) + PATCHED_VIA_YQ=false if [[ -n "$EXISTING_YAML" ]]; then - # Create a temporary YAML file with the update echo "$EXISTING_YAML" > "$WORK_DIR/existing-ramen-config.yaml" - echo " Existing YAML content (first 20 lines):" echo "$EXISTING_YAML" | head -n 20 - - # Try to install PyYAML first, or use alternative methods - echo " Attempting to update s3StoreProfiles with caCertificates..." - - # Method 1: Try Python with PyYAML first (most reliable) - PYTHON_SUCCESS=false - if python3 -c "import yaml" 2>/dev/null || python3 -m pip install --user PyYAML 2>&1 | grep -q "Successfully installed\|Requirement already satisfied"; then - echo " Using Python with PyYAML to update s3StoreProfiles..." - export CA_BUNDLE_BASE64 - if python3 -c " -import yaml -import sys -import os + echo " Patching s3StoreProfiles with caCertificates using yq..." -ca_bundle = os.environ.get('CA_BUNDLE_BASE64', '') + if ! command -v yq &>/dev/null; then + echo " ❌ yq is required but not found in PATH" + handle_error "yq is required to patch ramen_manager_config with caCertificates; please install yq (e.g. 
mikefarah/yq)" + fi -try: - with open('$WORK_DIR/existing-ramen-config.yaml', 'r') as f: - config = yaml.safe_load(f) or {} - - if config is None: - config = {} - - if 's3StoreProfiles' not in config: - config['s3StoreProfiles'] = [] - - updated_count = 0 - for profile in config.get('s3StoreProfiles', []): - if isinstance(profile, dict): - profile['caCertificates'] = ca_bundle - updated_count += 1 - - print(f'Updated {updated_count} s3StoreProfiles with caCertificates', file=sys.stderr) - - with open('$WORK_DIR/existing-ramen-config.yaml', 'w') as f: - yaml.dump(config, f, default_flow_style=False, sort_keys=False, allow_unicode=True) - - print('SUCCESS', file=sys.stderr) - sys.exit(0) -except Exception as e: - print(f'ERROR: {e}', file=sys.stderr) - import traceback - traceback.print_exc(file=sys.stderr) - sys.exit(1) -" 2>&1; then - echo " βœ… Successfully updated s3StoreProfiles with caCertificates using Python" - PYTHON_SUCCESS=true - else - echo " ⚠️ Python update failed, trying yq..." - fi + export CA_BUNDLE_BASE64 + YQ_PATCHED=false + # Use strenv() so the base64 value is passed as a string without embedding in the expression (avoids quoting/special-char issues) + if yq eval -i '.s3StoreProfiles[]? |= . + {"caCertificates": strenv(CA_BUNDLE_BASE64)}' "$WORK_DIR/existing-ramen-config.yaml" 2>/dev/null; then + YQ_PATCHED=true fi - - # Method 2: Try yq if Python failed - if [[ "$PYTHON_SUCCESS" != "true" ]] && command -v yq &>/dev/null; then - echo " Using yq to update s3StoreProfiles..." - # Use yq to update each profile individually - if yq eval '(.s3StoreProfiles[] | select(has("name"))) |= . + {"caCertificates": "'"$CA_BUNDLE_BASE64"'"}' -i "$WORK_DIR/existing-ramen-config.yaml" 2>&1; then - echo " βœ… Successfully updated s3StoreProfiles with caCertificates using yq" - PYTHON_SUCCESS=true - else - echo " ⚠️ yq failed, trying awk-based approach..." - PYTHON_SUCCESS=false - fi + if yq eval -i '.kubeObjectProtection.s3StoreProfiles[]? |= . 
+ {"caCertificates": strenv(CA_BUNDLE_BASE64)}' "$WORK_DIR/existing-ramen-config.yaml" 2>/dev/null; then + YQ_PATCHED=true fi - - # Method 3: Fallback to awk/sed if both Python and yq failed - if [[ "$PYTHON_SUCCESS" != "true" ]]; then - echo " Using awk-based approach as fallback..." - { - # Use awk to update or add caCertificates to each s3StoreProfiles item - awk -v ca_bundle="$CA_BUNDLE_BASE64" ' - BEGIN { in_profile=0; ca_added=0 } - /^s3StoreProfiles:/ { - print - next - } - /^ - name:/ { - in_profile=1 - ca_added=0 - print - next - } - in_profile && /^ caCertificates:/ { - print " caCertificates: \"" ca_bundle "\"" - ca_added=1 - in_profile=0 - next - } - in_profile && /^ [a-zA-Z]/ && !/^ caCertificates:/ { - if (!ca_added) { - print " caCertificates: \"" ca_bundle "\"" - ca_added=1 - } - print - next - } - in_profile && /^ -/ { - if (!ca_added) { - print " caCertificates: \"" ca_bundle "\"" - ca_added=1 - } - in_profile=0 - print - next - } - in_profile && /^$/ { - if (!ca_added) { - print " caCertificates: \"" ca_bundle "\"" - ca_added=1 - } - in_profile=0 - print - next - } - { print } - ' "$WORK_DIR/existing-ramen-config.yaml" > "$WORK_DIR/existing-ramen-config.yaml.tmp" && \ - mv "$WORK_DIR/existing-ramen-config.yaml.tmp" "$WORK_DIR/existing-ramen-config.yaml" && \ - echo " βœ… Updated s3StoreProfiles using awk" || { - echo " ❌ awk-based approach failed" - PYTHON_SUCCESS=false - } - } + if [[ "$YQ_PATCHED" != "true" ]]; then + echo " ❌ yq failed to patch s3StoreProfiles (no top-level or kubeObjectProtection.s3StoreProfiles found?)" + echo " yq version: $(yq --version 2>/dev/null || true)" + handle_error "yq could not update s3StoreProfiles with caCertificates" fi - - # Clean up temporary files + echo " βœ… Patched existing s3StoreProfiles with caCertificates using yq" + rm -f "$WORK_DIR/existing-ramen-config.yaml.bak" "$WORK_DIR/existing-ramen-config.yaml.tmp" - - # Verify the update + + # Verify patch (grep in file; do NOT load full content into shell 
variable - base64 can exceed ARG_MAX and truncate) if [[ -f "$WORK_DIR/existing-ramen-config.yaml" ]]; then - UPDATED_YAML=$(cat "$WORK_DIR/existing-ramen-config.yaml") - echo " Updated YAML content (first 20 lines):" - echo "$UPDATED_YAML" | head -n 20 - - # Verify caCertificates was added - if echo "$UPDATED_YAML" | grep -q "caCertificates"; then - echo " βœ… Verified: caCertificates found in updated YAML" - else - echo " ⚠️ Warning: caCertificates not found in updated YAML" + if ! grep -q "caCertificates" "$WORK_DIR/existing-ramen-config.yaml" 2>/dev/null; then + echo " ❌ No caCertificates in updated YAML (patch failed or no s3StoreProfiles to patch)" + handle_error "Failed to patch ramen_manager_config with caCertificates - update produced no caCertificates" fi + echo " Updated YAML content (first 20 lines):" + head -n 20 "$WORK_DIR/existing-ramen-config.yaml" + echo " βœ… Verified: caCertificates found in updated YAML" + # Copy file directly; do NOT use a shell variable (large base64 would truncate and break the applied ConfigMap) + cp "$WORK_DIR/existing-ramen-config.yaml" "$WORK_DIR/ramen_manager_config.yaml" + PATCHED_VIA_YQ=true else echo " ❌ Error: Updated YAML file not found" - UPDATED_YAML="$EXISTING_YAML" + PATCHED_VIA_YQ=false fi - - rm -f "$WORK_DIR/update_ramen_config.py" else - # No existing YAML, create new one with s3StoreProfiles containing caCertificates - UPDATED_YAML="s3StoreProfiles: - - name: default + # No existing YAML (ConfigMap exists but ramen_manager_config.yaml empty): create minimal config with 2 profiles (parameterized by cluster name) + UPDATED_YAML="kubeObjectProtection: + s3StoreProfiles: + - s3ProfileName: $PRIMARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" + - s3ProfileName: $SECONDARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" +s3StoreProfiles: + - s3ProfileName: $PRIMARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" + - s3ProfileName: $SECONDARY_CLUSTER caCertificates: \"$CA_BUNDLE_BASE64\"" fi - - # Save updated 
YAML to a file for use with oc set data - echo "$UPDATED_YAML" > "$WORK_DIR/ramen_manager_config.yaml" + + # Save updated YAML for apply (only write from variable when we did not already copy the patched file) + if [[ "$PATCHED_VIA_YQ" != "true" ]]; then + echo "$UPDATED_YAML" > "$WORK_DIR/ramen_manager_config.yaml" + fi echo " Preparing to update ConfigMap with YAML content..." echo " YAML file size: $(wc -c < "$WORK_DIR/ramen_manager_config.yaml") bytes" echo " YAML file preview (first 10 lines):" head -n 10 "$WORK_DIR/ramen_manager_config.yaml" - # Update the ConfigMap using oc create with --dry-run=client and oc apply - # This is more reliable than oc set data for multiline content + # Build ConfigMap manifest: use literal-block method first (reliable, no yq/Python dependency) echo " Creating ConfigMap manifest with updated data..." oc get configmap ramen-hub-operator-config -n openshift-operators -o yaml > "$WORK_DIR/ramen-configmap-template.yaml" 2>/dev/null if [[ -f "$WORK_DIR/ramen-configmap-template.yaml" ]]; then - # Update the data section using yq or python - if command -v yq &>/dev/null; then - yq eval ".data.\"ramen_manager_config.yaml\" = load(\"$WORK_DIR/ramen_manager_config.yaml\") | .data.\"ramen_manager_config.yaml\" style=\"literal\"" -i "$WORK_DIR/ramen-configmap-template.yaml" 2>/dev/null || { - # Fallback: use python to update - python3 -c " -import yaml -import sys - -# Read the ConfigMap -with open('$WORK_DIR/ramen-configmap-template.yaml', 'r') as f: - cm = yaml.safe_load(f) - -# Read the updated YAML content -with open('$WORK_DIR/ramen_manager_config.yaml', 'r') as f: - updated_yaml = f.read() - -# Update the data section -if 'data' not in cm: - cm['data'] = {} - -cm['data']['ramen_manager_config.yaml'] = updated_yaml - -# Keep metadata but remove fields that Kubernetes manages (oc apply will update these) -if 'metadata' in cm: - # Remove only the fields that Kubernetes manages and shouldn't be in the apply - 
cm['metadata'].pop('resourceVersion', None) - cm['metadata'].pop('managedFields', None) - -# Write back -with open('$WORK_DIR/ramen-configmap-updated.yaml', 'w') as f: - yaml.dump(cm, f, default_flow_style=False, sort_keys=False, allow_unicode=True) -" 2>/dev/null - } - else - # Use python to update - python3 -c " -import yaml -import sys - -# Read the ConfigMap -with open('$WORK_DIR/ramen-configmap-template.yaml', 'r') as f: - cm = yaml.safe_load(f) - -# Read the updated YAML content -with open('$WORK_DIR/ramen_manager_config.yaml', 'r') as f: - updated_yaml = f.read() + # Always use the canonical name so we update the expected ConfigMap and verification finds it + METADATA_NAMESPACE=openshift-operators + METADATA_NAME=ramen-hub-operator-config + echo " Building ConfigMap manifest (literal block for ramen_manager_config.yaml)..." + { + echo "apiVersion: v1" + echo "kind: ConfigMap" + echo "metadata:" + echo " name: $METADATA_NAME" + echo " namespace: $METADATA_NAMESPACE" + echo "data:" + echo " ramen_manager_config.yaml: |" + sed 's/^/ /' "$WORK_DIR/ramen_manager_config.yaml" + } > "$WORK_DIR/ramen-configmap-updated.yaml" -# Update the data section -if 'data' not in cm: - cm['data'] = {} - -cm['data']['ramen_manager_config.yaml'] = updated_yaml - -# Keep metadata but remove fields that Kubernetes manages (oc apply will update these) -if 'metadata' in cm: - # Remove only the fields that Kubernetes manages and shouldn't be in the apply - cm['metadata'].pop('resourceVersion', None) - cm['metadata'].pop('managedFields', None) - -# Write back -with open('$WORK_DIR/ramen-configmap-updated.yaml', 'w') as f: - yaml.dump(cm, f, default_flow_style=False, sort_keys=False, allow_unicode=True) -" 2>/dev/null - fi - if [[ -f "$WORK_DIR/ramen-configmap-updated.yaml" ]]; then echo " Applying updated ConfigMap..." 
UPDATE_OUTPUT=$(oc apply -f "$WORK_DIR/ramen-configmap-updated.yaml" 2>&1) @@ -758,26 +619,35 @@ with open('$WORK_DIR/ramen-configmap-updated.yaml', 'w') as f: VERIFICATION_ERRORS+=("caCertificates not found in ConfigMap") fi - if ! echo "$VERIFIED_YAML" | grep -q "$CA_BUNDLE_BASE64"; then - VERIFICATION_PASSED=false - VERIFICATION_ERRORS+=("CA bundle base64 data not found in ConfigMap") + # Optional: exact base64 match can fail due to encoding/line wrap in stored ConfigMap + # Prefer verifying profile/caCertificates counts below; only warn if base64 substring missing + if [[ -n "$CA_BUNDLE_BASE64" ]] && [[ ${#CA_BUNDLE_BASE64} -gt 20 ]]; then + CA_PREFIX="${CA_BUNDLE_BASE64:0:80}" + if ! echo "$VERIFIED_YAML" | grep -qF "$CA_PREFIX"; then + echo " ⚠️ Note: CA bundle prefix not found in retrieved ConfigMap (encoding may differ); relying on profile/caCertificates count" + fi fi - - # Additional check: verify that each s3StoreProfiles item has caCertificates - # CRITICAL: Must find at least 2 S3profiles + + # Verify structure: s3StoreProfiles under kubeObjectProtection or at top level (match script output) MIN_REQUIRED_PROFILES=2 if echo "$VERIFIED_YAML" | grep -q "s3StoreProfiles"; then - # Use yq to properly parse YAML and count profiles if command -v yq &>/dev/null; then - PROFILE_COUNT=$(echo "$VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") - # Count profiles that have caCertificates field - CA_CERT_COUNT=$(echo "$VERIFIED_YAML" | yq eval '[.s3StoreProfiles[] | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + PK=$(echo "$VERIFIED_YAML" | yq eval '.kubeObjectProtection.s3StoreProfiles | length' 2>/dev/null || echo "0") + PT=$(echo "$VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") + CK=$(echo "$VERIFIED_YAML" | yq eval '[.kubeObjectProtection.s3StoreProfiles[]? 
| select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + CT=$(echo "$VERIFIED_YAML" | yq eval '[.s3StoreProfiles[]? | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + # Normalize: yq can return "null" or empty; treat as 0 + PK=$((10#${PK:-0})); PT=$((10#${PT:-0})); CK=$((10#${CK:-0})); CT=$((10#${CT:-0})) + PROFILE_COUNT=$(( PK >= PT ? PK : PT )) + CA_CERT_COUNT=$(( CK >= CT ? CK : CT )) else - # Fallback to grep if yq is not available + PROFILE_COUNT=0 + CA_CERT_COUNT=0 + fi + # If yq returned 0/0 but YAML clearly has content, use grep-based counts (works regardless of yq version/parsing) + if [[ $PROFILE_COUNT -lt $MIN_REQUIRED_PROFILES || $CA_CERT_COUNT -lt $MIN_REQUIRED_PROFILES ]]; then PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3ProfileName:" 2>/dev/null || echo "0") - if [[ $PROFILE_COUNT -eq 0 ]]; then - PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") - fi + [[ "${PROFILE_COUNT:-0}" -eq 0 ]] && PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") CA_CERT_COUNT=$(echo "$VERIFIED_YAML" | grep -c "caCertificates:" 2>/dev/null || echo "0") fi PROFILE_COUNT=$(echo "$PROFILE_COUNT" | tr -d ' \n\r' | grep -E '^[0-9]+$' || echo "0") @@ -904,26 +774,24 @@ with open('$WORK_DIR/ramen-patch.json', 'w') as f: VERIFICATION_ERRORS+=("caCertificates not found") fi - if ! 
echo "$VERIFIED_YAML" | grep -q "$CA_BUNDLE_BASE64"; then - VERIFICATION_PASSED=false - VERIFICATION_ERRORS+=("CA bundle base64 data not found") - fi - - # Verify each profile has caCertificates - # CRITICAL: Must find at least 2 S3profiles + # Verify structure: s3StoreProfiles under kubeObjectProtection or at top level (match script output) MIN_REQUIRED_PROFILES=2 if echo "$VERIFIED_YAML" | grep -q "s3StoreProfiles"; then - # Use yq to properly parse YAML and count profiles if command -v yq &>/dev/null; then - PROFILE_COUNT=$(echo "$VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") - # Count profiles that have caCertificates field - CA_CERT_COUNT=$(echo "$VERIFIED_YAML" | yq eval '[.s3StoreProfiles[] | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + PK=$(echo "$VERIFIED_YAML" | yq eval '.kubeObjectProtection.s3StoreProfiles | length' 2>/dev/null || echo "0") + PT=$(echo "$VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") + CK=$(echo "$VERIFIED_YAML" | yq eval '[.kubeObjectProtection.s3StoreProfiles[]? | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + CT=$(echo "$VERIFIED_YAML" | yq eval '[.s3StoreProfiles[]? | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + PK=$((10#${PK:-0})); PT=$((10#${PT:-0})); CK=$((10#${CK:-0})); CT=$((10#${CT:-0})) + PROFILE_COUNT=$(( PK >= PT ? PK : PT )) + CA_CERT_COUNT=$(( CK >= CT ? 
CK : CT )) else - # Fallback to grep if yq is not available + PROFILE_COUNT=0 + CA_CERT_COUNT=0 + fi + if [[ $PROFILE_COUNT -lt $MIN_REQUIRED_PROFILES || $CA_CERT_COUNT -lt $MIN_REQUIRED_PROFILES ]]; then PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3ProfileName:" 2>/dev/null || echo "0") - if [[ $PROFILE_COUNT -eq 0 ]]; then - PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") - fi + [[ "${PROFILE_COUNT:-0}" -eq 0 ]] && PROFILE_COUNT=$(echo "$VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") CA_CERT_COUNT=$(echo "$VERIFIED_YAML" | grep -c "caCertificates:" 2>/dev/null || echo "0") fi PROFILE_COUNT=$(echo "$PROFILE_COUNT" | tr -d ' \n\r' | grep -E '^[0-9]+$' || echo "0") @@ -998,10 +866,18 @@ with open('$WORK_DIR/ramen-patch.json', 'w') as f: rm -f "$WORK_DIR/existing-ramen-config.yaml" "$WORK_DIR/ramen_manager_config.yaml" else - echo " ConfigMap does not exist, creating with ramen_manager_config.yaml containing s3StoreProfiles with caCertificates..." + echo " ConfigMap does not exist, creating with ramen_manager_config.yaml containing 2 s3StoreProfiles (${PRIMARY_CLUSTER}, ${SECONDARY_CLUSTER}) with caCertificates..." oc create configmap ramen-hub-operator-config -n openshift-operators \ - --from-literal=ramen_manager_config.yaml="s3StoreProfiles: - - name: default + --from-literal=ramen_manager_config.yaml="kubeObjectProtection: + s3StoreProfiles: + - s3ProfileName: $PRIMARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" + - s3ProfileName: $SECONDARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" +s3StoreProfiles: + - s3ProfileName: $PRIMARY_CLUSTER + caCertificates: \"$CA_BUNDLE_BASE64\" + - s3ProfileName: $SECONDARY_CLUSTER caCertificates: \"$CA_BUNDLE_BASE64\"" || { echo " Warning: Could not create ramen-hub-operator-config" } @@ -1160,7 +1036,7 @@ done # Verify distribution to managed clusters echo "9. Verifying certificate distribution to managed clusters..." 
verification_failed=false -REQUIRED_VERIFICATION_CLUSTERS=("ocp-primary" "ocp-secondary") +REQUIRED_VERIFICATION_CLUSTERS=("$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") VERIFIED_CLUSTERS=() for cluster in $MANAGED_CLUSTERS; do @@ -1213,7 +1089,7 @@ if [[ ${#MISSING_VERIFICATION_CLUSTERS[@]} -gt 0 ]]; then done echo "" echo "The ODF SSL certificate extractor job requires successful certificate distribution" - echo "to ALL managed clusters (ocp-primary and ocp-secondary)." + echo "to ALL managed clusters ($PRIMARY_CLUSTER and $SECONDARY_CLUSTER)." echo "" echo "Without proper certificate distribution, the DR setup will fail." echo "Please check cluster connectivity and kubeconfig availability." @@ -1245,47 +1121,47 @@ if [[ -z "$FINAL_VERIFIED_YAML" ]]; then handle_error "ramen-hub-operator-config ConfigMap is missing or empty - CA material not configured" fi +# Write to file to avoid ARG_MAX when content is large (big base64 certs); grep/yq on file are reliable +FINAL_VERIFIED_FILE="${WORK_DIR:-/tmp/odf-ssl-certs}/final_verified_ramen.yaml" +mkdir -p "$(dirname "$FINAL_VERIFIED_FILE")" +printf '%s' "$FINAL_VERIFIED_YAML" > "$FINAL_VERIFIED_FILE" + FINAL_VERIFICATION_PASSED=true FINAL_VERIFICATION_ERRORS=() -if ! echo "$FINAL_VERIFIED_YAML" | grep -q "s3StoreProfiles"; then +if ! grep -q "s3StoreProfiles" "$FINAL_VERIFIED_FILE" 2>/dev/null; then FINAL_VERIFICATION_PASSED=false FINAL_VERIFICATION_ERRORS+=("s3StoreProfiles not found in final verification") fi -if ! echo "$FINAL_VERIFIED_YAML" | grep -q "caCertificates"; then +if ! grep -q "caCertificates" "$FINAL_VERIFIED_FILE" 2>/dev/null; then FINAL_VERIFICATION_PASSED=false FINAL_VERIFICATION_ERRORS+=("caCertificates not found in final verification") fi -if ! 
echo "$FINAL_VERIFIED_YAML" | grep -q "$CA_BUNDLE_BASE64"; then - FINAL_VERIFICATION_PASSED=false - FINAL_VERIFICATION_ERRORS+=("CA bundle base64 data not found in final verification") -fi - -# Verify each profile has caCertificates -# CRITICAL: Must find at least 2 S3profiles +# Verify structure: s3StoreProfiles under kubeObjectProtection or at top level (match script output) MIN_REQUIRED_PROFILES=2 -if echo "$FINAL_VERIFIED_YAML" | grep -q "s3StoreProfiles"; then - # Use yq to properly parse YAML and count profiles +if grep -q "s3StoreProfiles" "$FINAL_VERIFIED_FILE" 2>/dev/null; then if command -v yq &>/dev/null; then - FINAL_PROFILE_COUNT=$(echo "$FINAL_VERIFIED_YAML" | yq eval '.s3StoreProfiles | length' 2>/dev/null || echo "0") - # Count profiles that have caCertificates field - FINAL_CA_CERT_COUNT=$(echo "$FINAL_VERIFIED_YAML" | yq eval '[.s3StoreProfiles[] | select(has("caCertificates"))] | length' 2>/dev/null || echo "0") + PK=$(yq eval '.kubeObjectProtection.s3StoreProfiles | length' "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + PT=$(yq eval '.s3StoreProfiles | length' "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + CK=$(yq eval '[.kubeObjectProtection.s3StoreProfiles[]? | select(has("caCertificates"))] | length' "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + CT=$(yq eval '[.s3StoreProfiles[]? | select(has("caCertificates"))] | length' "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + PK=$((10#${PK:-0})); PT=$((10#${PT:-0})); CK=$((10#${CK:-0})); CT=$((10#${CT:-0})) + FINAL_PROFILE_COUNT=$(( PK >= PT ? PK : PT )) + FINAL_CA_CERT_COUNT=$(( CK >= CT ? 
CK : CT )) else - # Fallback to grep if yq is not available - FINAL_PROFILE_COUNT=$(echo "$FINAL_VERIFIED_YAML" | grep -c "s3ProfileName:" 2>/dev/null || echo "0") - if [[ $FINAL_PROFILE_COUNT -eq 0 ]]; then - FINAL_PROFILE_COUNT=$(echo "$FINAL_VERIFIED_YAML" | grep -c "s3Bucket:" 2>/dev/null || echo "0") - fi - FINAL_CA_CERT_COUNT=$(echo "$FINAL_VERIFIED_YAML" | grep -c "caCertificates:" 2>/dev/null || echo "0") + FINAL_PROFILE_COUNT=0 + FINAL_CA_CERT_COUNT=0 + fi + if [[ $FINAL_PROFILE_COUNT -lt $MIN_REQUIRED_PROFILES || $FINAL_CA_CERT_COUNT -lt $MIN_REQUIRED_PROFILES ]]; then + FINAL_PROFILE_COUNT=$(grep -c "s3ProfileName:" "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + [[ "${FINAL_PROFILE_COUNT:-0}" -eq 0 ]] && FINAL_PROFILE_COUNT=$(grep -c "s3Bucket:" "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") + FINAL_CA_CERT_COUNT=$(grep -c "caCertificates:" "$FINAL_VERIFIED_FILE" 2>/dev/null || echo "0") fi - # Remove any whitespace/newlines and ensure numeric FINAL_PROFILE_COUNT=$(echo "$FINAL_PROFILE_COUNT" | tr -d ' \n\r' | grep -E '^[0-9]+$' || echo "0") FINAL_CA_CERT_COUNT=$(echo "$FINAL_CA_CERT_COUNT" | tr -d ' \n\r' | grep -E '^[0-9]+$' || echo "0") - - # Force to integer (remove leading zeros) FINAL_PROFILE_COUNT=$((10#$FINAL_PROFILE_COUNT)) FINAL_CA_CERT_COUNT=$((10#$FINAL_CA_CERT_COUNT)) @@ -1355,9 +1231,14 @@ if [[ "$FINAL_VERIFICATION_PASSED" != "true" ]]; then echo " - $error" done echo " Current ConfigMap YAML content:" - echo "$FINAL_VERIFIED_YAML" + cat "$FINAL_VERIFIED_FILE" echo "" - echo " The ConfigMap edit is not complete and correct until the CA material has been added to the S3profiles." + if [[ $FINAL_PROFILE_COUNT -eq 0 ]]; then + echo " s3StoreProfiles is empty ([]). Configure at least 2 S3 store profiles in ramen-hub-operator-config" + echo " (via Ramen hub operator or ODF) before this job can add CA certificates. This job cannot create profiles." 
+ else + echo " The ConfigMap edit is not complete until CA material has been added to all S3 profiles." + fi echo " This is a CRITICAL error - the job cannot complete successfully." handle_error "Final verification failed - ramen-hub-operator-config is not complete and correct - CA material not in s3StoreProfiles" # After handle_error, return failure to trigger retry in main loop diff --git a/charts/hub/opp/scripts/odf-ssl-precheck.sh b/charts/hub/opp/scripts/odf-ssl-precheck.sh index 3201937..b31c270 100755 --- a/charts/hub/opp/scripts/odf-ssl-precheck.sh +++ b/charts/hub/opp/scripts/odf-ssl-precheck.sh @@ -54,12 +54,16 @@ cleanup_placeholder_configmaps() { return 0 } +# Primary and secondary managed cluster names (from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Function to wait for required clusters to be available and joined wait_for_cluster_readiness() { - echo "πŸ” Waiting for required clusters (ocp-primary and ocp-secondary) to be available and joined..." + echo "πŸ” Waiting for required clusters ($PRIMARY_CLUSTER and $SECONDARY_CLUSTER) to be available and joined..." 
echo " This may take several minutes during initial cluster deployment" - REQUIRED_CLUSTERS=("ocp-primary" "ocp-secondary") + REQUIRED_CLUSTERS=("$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") attempt=1 while [[ $attempt -le $CLUSTER_READINESS_MAX_ATTEMPTS ]]; do @@ -154,12 +158,12 @@ check_certificate_distribution() { fi hub_certs=$(echo "$bundle_content" | grep -c "hub" || echo "0") - ocp_primary_certs=$(echo "$bundle_content" | grep -c "ocp-primary" || echo "0") - ocp_secondary_certs=$(echo "$bundle_content" | grep -c "ocp-secondary" || echo "0") + ocp_primary_certs=$(echo "$bundle_content" | grep -c "$PRIMARY_CLUSTER" || echo "0") + ocp_secondary_certs=$(echo "$bundle_content" | grep -c "$SECONDARY_CLUSTER" || echo "0") echo " Hub cluster certificates: $hub_certs" - echo " ocp-primary certificates: $ocp_primary_certs" - echo " ocp-secondary certificates: $ocp_secondary_certs" + echo " $PRIMARY_CLUSTER certificates: $ocp_primary_certs" + echo " $SECONDARY_CLUSTER certificates: $ocp_secondary_certs" if [[ $hub_certs -lt 2 || $ocp_primary_certs -lt 2 || $ocp_secondary_certs -lt 2 ]]; then echo "❌ Missing certificates from one or more clusters" @@ -361,7 +365,7 @@ spec: echo " Added hub ingress CA to bundle" # Track required clusters - REQUIRED_CLUSTERS=("hub" "ocp-primary" "ocp-secondary") + REQUIRED_CLUSTERS=("hub" "$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") EXTRACTED_CLUSTERS=() if [[ "$hub_ca_extracted" == "true" ]]; then EXTRACTED_CLUSTERS+=("hub") @@ -369,7 +373,7 @@ spec: cluster_count=0 for cluster in $managed_clusters; do - if [[ "$cluster" == "ocp-primary" || "$cluster" == "ocp-secondary" ]]; then + if [[ "$cluster" == "$PRIMARY_CLUSTER" || "$cluster" == "$SECONDARY_CLUSTER" ]]; then cluster_count=$((cluster_count + 1)) echo "3.$cluster_count Extracting CA from $cluster..." 
@@ -411,8 +415,8 @@ spec: echo "" echo "The ODF SSL certificate extractor job requires CA material from ALL three clusters:" echo " - hub (hub cluster)" - echo " - ocp-primary (primary managed cluster)" - echo " - ocp-secondary (secondary managed cluster)" + echo " - $PRIMARY_CLUSTER (primary managed cluster)" + echo " - $SECONDARY_CLUSTER (secondary managed cluster)" echo "" echo "Without CA material from all clusters, the DR setup will fail." echo "Please ensure all clusters are accessible and have proper kubeconfigs." @@ -841,7 +845,7 @@ with open('existing-ramen-config.yaml', 'w') as f: echo "9. Verifying certificate distribution to managed clusters..." verification_failed=false - REQUIRED_VERIFICATION_CLUSTERS=("ocp-primary" "ocp-secondary") + REQUIRED_VERIFICATION_CLUSTERS=("$PRIMARY_CLUSTER" "$SECONDARY_CLUSTER") VERIFIED_CLUSTERS=() for cluster in $MANAGED_CLUSTERS; do @@ -893,7 +897,7 @@ with open('existing-ramen-config.yaml', 'w') as f: done echo "" echo "The ODF SSL certificate extractor job requires successful certificate distribution" - echo "to ALL managed clusters (ocp-primary and ocp-secondary)." + echo "to ALL managed clusters ($PRIMARY_CLUSTER and $SECONDARY_CLUSTER)." echo "" echo "Without proper certificate distribution, the DR setup will fail." echo "Please check cluster connectivity and kubeconfig availability." 
diff --git a/charts/hub/opp/templates/_helpers.tpl b/charts/hub/opp/templates/_helpers.tpl new file mode 100644 index 0000000..6e11da2 --- /dev/null +++ b/charts/hub/opp/templates/_helpers.tpl @@ -0,0 +1,13 @@ +{{/* Primary cluster name: clusterOverrides.primary.name else regionalDR[0].clusters.primary.name else ocp-primary */}} +{{- define "opp.primaryClusterName" -}} +{{- $over := index (.Values.clusterOverrides | default dict) "primary" | default dict -}} +{{- $fromOver := index $over "name" -}} +{{- if $fromOver }}{{ $fromOver }}{{- else if and .Values.regionalDR (index .Values.regionalDR 0) }}{{ (index .Values.regionalDR 0).clusters.primary.name | default "ocp-primary" }}{{- else }}ocp-primary{{ end -}} +{{- end -}} + +{{/* Secondary cluster name */}} +{{- define "opp.secondaryClusterName" -}} +{{- $over := index (.Values.clusterOverrides | default dict) "secondary" | default dict -}} +{{- $fromOver := index $over "name" -}} +{{- if $fromOver }}{{ $fromOver }}{{- else if and .Values.regionalDR (index .Values.regionalDR 0) }}{{ (index .Values.regionalDR 0).clusters.secondary.name | default "ocp-secondary" }}{{- else }}ocp-secondary{{ end -}} +{{- end -}} diff --git a/charts/hub/opp/templates/cronjob-argocd-health-monitor.yaml b/charts/hub/opp/templates/cronjob-argocd-health-monitor.yaml index 3e63f87..d3da73d 100644 --- a/charts/hub/opp/templates/cronjob-argocd-health-monitor.yaml +++ b/charts/hub/opp/templates/cronjob-argocd-health-monitor.yaml @@ -25,6 +25,19 @@ spec: containers: - name: argocd-health-monitor image: registry.redhat.io/openshift4/ose-cli:latest + env: + - name: PRIMARY_CLUSTER + value: {{ include "opp.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "opp.secondaryClusterName" . 
| quote }} + - name: FORCE_SYNC_APP_NAMESPACE + value: {{ .Values.argocdHealthMonitor.forceSyncAppNamespace | default "openshift-gitops" | quote }} + - name: FORCE_SYNC_APP_NAME + value: {{ .Values.argocdHealthMonitor.forceSyncAppName | default "ramendr-starter-kit-resilient" | quote }} + - name: FORCE_SYNC_RESOURCE_KIND + value: {{ .Values.argocdHealthMonitor.forceSyncResourceKind | default "Namespace" | quote }} + - name: FORCE_SYNC_RESOURCE_NAME + value: {{ .Values.argocdHealthMonitor.forceSyncResourceName | default "ramendr-starter-kit-resilient" | quote }} command: - /bin/bash - -c diff --git a/charts/hub/opp/templates/job-argocd-health-monitor.yaml b/charts/hub/opp/templates/job-argocd-health-monitor.yaml index 42de71e..0469958 100644 --- a/charts/hub/opp/templates/job-argocd-health-monitor.yaml +++ b/charts/hub/opp/templates/job-argocd-health-monitor.yaml @@ -18,6 +18,19 @@ spec: containers: - name: argocd-health-monitor image: registry.redhat.io/openshift4/ose-cli:latest + env: + - name: PRIMARY_CLUSTER + value: {{ include "opp.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "opp.secondaryClusterName" . 
| quote }} + - name: FORCE_SYNC_APP_NAMESPACE + value: {{ .Values.argocdHealthMonitor.forceSyncAppNamespace | default "openshift-gitops" | quote }} + - name: FORCE_SYNC_APP_NAME + value: {{ .Values.argocdHealthMonitor.forceSyncAppName | default "ramendr-starter-kit-resilient" | quote }} + - name: FORCE_SYNC_RESOURCE_KIND + value: {{ .Values.argocdHealthMonitor.forceSyncResourceKind | default "Namespace" | quote }} + - name: FORCE_SYNC_RESOURCE_NAME + value: {{ .Values.argocdHealthMonitor.forceSyncResourceName | default "ramendr-starter-kit-resilient" | quote }} command: - /bin/bash - -c diff --git a/charts/hub/opp/templates/job-odf-ssl-certificate-extraction.yaml b/charts/hub/opp/templates/job-odf-ssl-certificate-extraction.yaml index fe50de5..70ead86 100644 --- a/charts/hub/opp/templates/job-odf-ssl-certificate-extraction.yaml +++ b/charts/hub/opp/templates/job-odf-ssl-certificate-extraction.yaml @@ -31,6 +31,10 @@ spec: env: - name: KUBECONFIG value: "" + - name: PRIMARY_CLUSTER + value: {{ include "opp.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "opp.secondaryClusterName" . | quote }} restartPolicy: Never serviceAccountName: odf-ssl-extractor-sa backoffLimit: 10 diff --git a/charts/hub/opp/templates/job-odf-ssl-certificate-precheck.yaml b/charts/hub/opp/templates/job-odf-ssl-certificate-precheck.yaml index d8b65c2..5a9b4d0 100644 --- a/charts/hub/opp/templates/job-odf-ssl-certificate-precheck.yaml +++ b/charts/hub/opp/templates/job-odf-ssl-certificate-precheck.yaml @@ -30,6 +30,11 @@ spec: limits: memory: "128Mi" cpu: "100m" + env: + - name: PRIMARY_CLUSTER + value: {{ include "opp.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "opp.secondaryClusterName" . 
| quote }} command: - /bin/bash - -c diff --git a/charts/hub/opp/values.yaml b/charts/hub/opp/values.yaml index 09a27d0..050c3e4 100644 --- a/charts/hub/opp/values.yaml +++ b/charts/hub/opp/values.yaml @@ -1,8 +1,19 @@ -# No Values -# Need to use fromSecret for these keys. problems ith json output. -## accessKey: '{{ `{{ fromSecret "openshift-storage" "noobaa-admin" "AWS_ACCESS_KEY_ID" }}` }}' -## secretKey: '{{ `{{ fromSecret "openshift-storage" "noobaa-admin" "AWS_SECRET_ACCESS_KEY" }}` }}' +--- +# DR pair cluster names - same structure as rdr chart (regionalDR). +# Override via values-hub or overrides so opp-policy and rdr use the same names. +regionalDR: + - name: resilient + clusters: + primary: + name: ocp-primary + secondary: + name: ocp-secondary argocdHealthMonitor: enabled: true - + # When remediating a wedged cluster, force-sync this specific resource in this Application (instead of restarting Argo CD) + forceSyncAppNamespace: openshift-gitops + forceSyncAppName: ramendr-starter-kit-resilient + # Specific resource to sync: the Namespace (kind/name) in the application above + forceSyncResourceKind: Namespace + forceSyncResourceName: ramendr-starter-kit-resilient diff --git a/charts/hub/rdr/README.md b/charts/hub/rdr/README.md new file mode 100644 index 0000000..465b9e1 --- /dev/null +++ b/charts/hub/rdr/README.md @@ -0,0 +1,61 @@ +# RDR (Regional DR) chart + +Helm chart for Regional DR configuration (cluster pair, install_config, DRPC, etc.). + +## Updating the default install_config JSON files + +When `values.yaml` is changed (e.g. machine types, networking CIDRs, platform settings) under `regionalDR[0].clusters.primary.install_config` or `secondary.install_config`, the chart’s fallback files must be kept in sync so minimal `regionalDR` values still produce a full install_config. 
+ +From the **repository root**: + +```bash +./scripts/update-rdr-default-install-config-json.sh +``` + +- **What it does:** Reads `charts/hub/rdr/values.yaml`, extracts both `install_config` sections, and overwrites: + - `charts/hub/rdr/files/default-primary-install-config.json` + - `charts/hub/rdr/files/default-secondary-install-config.json` +- **When to run:** After editing `install_config` in this chart’s `values.yaml`. +- **Dry-run:** To print the generated JSON without writing files: + + ```bash + ./scripts/update-rdr-default-install-config-json.sh --dry-run + ``` + +- **Requirements:** Python 3 with PyYAML (`pip install pyyaml`), or `yq` (and optionally `jq`). + +Then run the install_config tests to confirm nothing is broken: + +```bash +./scripts/test-rdr-install-config.sh +``` + +## Troubleshooting: DRCluster validation — "DRClusterConfig is not applied to cluster" + +The DRCluster validation job (sync-wave 8) waits until each DRCluster’s status shows `Validated=True`. If you see: + +```text +DRCluster ocp-p: Not validated yet (status: False) + Message: DRClusterConfig is not applied to cluster (ocp-p) +``` + +then the Ramen/ODF DR controller has not yet applied the DR config to that managed cluster (usually via a ManifestWork). + +**Checks:** + +1. **Hub operator** – ODF Multicluster Orchestrator / Ramen DR is installed on the hub and DRPolicy + DRCluster resources exist and are correct. +2. **Clusters joined** – Both clusters appear as ManagedClusters and are available: + + ```bash + oc get managedcluster ocp-p ocp-s + ``` + +3. **ManifestWorks** – Ramen creates ManifestWorks in each cluster’s namespace to deploy the DR cluster operator. On the hub: + + ```bash + oc get manifestwork -n ocp-p + oc get manifestwork -n ocp-s + ``` + + If these are missing or not applied, check Ramen controller logs on the hub. +4. 
**Cluster readiness** – Clusters must be reachable from the hub so the hub can apply and reconcile the ManifestWork; ensure they are not degraded or not ready. diff --git a/charts/hub/rdr/files/default-primary-install-config.json b/charts/hub/rdr/files/default-primary-install-config.json new file mode 100644 index 0000000..98a3db2 --- /dev/null +++ b/charts/hub/rdr/files/default-primary-install-config.json @@ -0,0 +1,55 @@ +{ + "apiVersion": "v1", + "baseDomain": "cluster.example.com", + "metadata": { + "name": "ocp-primary" + }, + "controlPlane": { + "name": "master", + "replicas": 3, + "platform": { + "aws": { + "type": "m5.4xlarge" + } + } + }, + "compute": [ + { + "name": "worker", + "replicas": 3, + "platform": { + "aws": { + "type": "m5.metal" + } + } + } + ], + "networking": { + "clusterNetwork": [ + { + "cidr": "10.132.0.0/14", + "hostPrefix": 23 + } + ], + "machineNetwork": [ + { + "cidr": "10.1.0.0/16" + } + ], + "networkType": "OVNKubernetes", + "serviceNetwork": [ + "172.20.0.0/16" + ] + }, + "platform": { + "aws": { + "region": "us-west-1", + "userTags": { + "project": "ValidatedPatterns" + } + } + }, + "publish": "External", + "sshKey": "", + "pullSecret": "" +} diff --git a/charts/hub/rdr/files/default-secondary-install-config.json b/charts/hub/rdr/files/default-secondary-install-config.json new file mode 100644 index 0000000..8fca7d1 --- /dev/null +++ b/charts/hub/rdr/files/default-secondary-install-config.json @@ -0,0 +1,55 @@ +{ + "apiVersion": "v1", + "baseDomain": "cluster.example.com", + "metadata": { + "name": "ocp-secondary" + }, + "controlPlane": { + "name": "master", + "replicas": 3, + "platform": { + "aws": { + "type": "m5.4xlarge" + } + } + }, + "compute": [ + { + "name": "worker", + "replicas": 3, + "platform": { + "aws": { + "type": "m5.metal" + } + } + } + ], + "networking": { + "clusterNetwork": [ + { + "cidr": "10.136.0.0/14", + "hostPrefix": 23 + } + ], + "machineNetwork": [ + { + "cidr": "10.2.0.0/16" + } + ], + "networkType": 
"OVNKubernetes", + "serviceNetwork": [ + "172.21.0.0/16" + ] + }, + "platform": { + "aws": { + "region": "us-east-1", + "userTags": { + "project": "ValidatedPatterns" + } + } + }, + "publish": "External", + "sshKey": "", + "pullSecret": "" +} diff --git a/charts/hub/rdr/scripts/edge-gitops-vms-deploy.sh b/charts/hub/rdr/scripts/edge-gitops-vms-deploy.sh index 9acd452..6467942 100755 --- a/charts/hub/rdr/scripts/edge-gitops-vms-deploy.sh +++ b/charts/hub/rdr/scripts/edge-gitops-vms-deploy.sh @@ -25,8 +25,9 @@ display_apply_error() { echo "Starting Edge GitOps VMs deployment check and deployment..." echo "This job will check for existing VMs, Services, Routes, and ExternalSecrets before applying the helm template" -# Configuration -HELM_CHART_URL="https://github.com/validatedpatterns/helm-charts/releases/download/main/edge-gitops-vms-0.2.10.tgz" +# Configuration (HELM_CHART_VERSION from values/env, default 0.2.10) +HELM_CHART_VERSION="${HELM_CHART_VERSION:-0.2.10}" +HELM_CHART_URL="https://github.com/validatedpatterns/helm-charts/releases/download/main/edge-gitops-vms-${HELM_CHART_VERSION}.tgz" WORK_DIR="/tmp/edge-gitops-vms" VALUES_FILE="$WORK_DIR/values-egv-dr.yaml" VM_NAMESPACE="gitops-vms" @@ -73,8 +74,8 @@ get_target_cluster_from_placement() { if [[ -z "$PLACEMENT_DECISION" ]]; then echo " ⚠️ Warning: Could not find PlacementDecision for $PLACEMENT_NAME" - echo " Will default to primary cluster (ocp-primary)" - TARGET_CLUSTER="ocp-primary" + echo " Will default to primary cluster (${PRIMARY_CLUSTER:-ocp-primary})" + TARGET_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" return 1 fi @@ -84,8 +85,8 @@ get_target_cluster_from_placement() { if [[ -z "$TARGET_CLUSTER" ]]; then echo " ⚠️ Warning: Could not determine target cluster from PlacementDecision" - echo " Will default to primary cluster (ocp-primary)" - TARGET_CLUSTER="ocp-primary" + echo " Will default to primary cluster (${PRIMARY_CLUSTER:-ocp-primary})" + TARGET_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" return 1 
fi @@ -93,51 +94,77 @@ get_target_cluster_from_placement() { return 0 } -# Function to get kubeconfig for target managed cluster +# Function to get kubeconfig for target managed cluster (run from hub; secrets are in hub namespace ) get_target_cluster_kubeconfig() { local cluster="$1" - echo "Getting kubeconfig for target managed cluster: $cluster" + echo "Getting kubeconfig for target managed cluster: $cluster (from hub cluster)" + + # Try known secret names used by ACM for managed cluster kubeconfig + local secret_names=("${cluster}-admin-kubeconfig" "admin-kubeconfig" "import-kubeconfig") + local got_kubeconfig=false + + for secret_name in "${secret_names[@]}"; do + if oc get secret "$secret_name" -n "$cluster" -o jsonpath='{.data.kubeconfig}' 2>/dev/null | \ + base64 -d > "$WORK_DIR/target-kubeconfig.yaml" 2>/dev/null && [[ -s "$WORK_DIR/target-kubeconfig.yaml" ]]; then + got_kubeconfig=true + echo " βœ… Retrieved kubeconfig from secret $secret_name (namespace $cluster)" + break + fi + done + + if [[ "$got_kubeconfig" != "true" ]]; then + # Fallback: any secret in namespace $cluster with kubeconfig data + if oc get secret -n "$cluster" -o name | grep -E "(admin-kubeconfig|kubeconfig)" | head -1 | \ + xargs -I {} oc get {} -n "$cluster" -o jsonpath='{.data.kubeconfig}' | \ + base64 -d > "$WORK_DIR/target-kubeconfig.yaml" 2>/dev/null && [[ -s "$WORK_DIR/target-kubeconfig.yaml" ]]; then + got_kubeconfig=true + echo " βœ… Retrieved kubeconfig for $cluster" + fi + fi - # Try to get kubeconfig from secret - if oc get secret -n "$cluster" -o name | grep -E "(admin-kubeconfig|kubeconfig)" | head -1 | \ - xargs -I {} oc get {} -n "$cluster" -o jsonpath='{.data.kubeconfig}' | \ - base64 -d > "$WORK_DIR/target-kubeconfig.yaml" 2>/dev/null; then - echo " βœ… Retrieved kubeconfig for $cluster" + if [[ "$got_kubeconfig" == "true" ]]; then export KUBECONFIG="$WORK_DIR/target-kubeconfig.yaml" - - # Verify we can connect to the target cluster if oc get nodes &>/dev/null; then 
echo " βœ… Successfully connected to target managed cluster: $cluster" return 0 - else - echo " ⚠️ Warning: Could not verify connection to target cluster" - return 1 fi - else - echo " ⚠️ Could not get kubeconfig for $cluster" - echo " Will use current context (assuming we're already on the target cluster)" + echo " ⚠️ Warning: Kubeconfig retrieved but could not verify connection to $cluster" return 1 fi + + echo " ⚠️ Could not get kubeconfig for $cluster" + return 1 } +# Primary/secondary cluster names (from regionalDR via env when run by the rdr chart Job) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" + # Get target cluster from Placement resource -TARGET_CLUSTER="ocp-primary" # Default to primary +TARGET_CLUSTER="$PRIMARY_CLUSTER" # Default to primary if get_target_cluster_from_placement; then echo " Target cluster: $TARGET_CLUSTER" else echo " Using default target cluster: $TARGET_CLUSTER" fi -# Get kubeconfig for target cluster +# Get kubeconfig for target cluster (must succeed so we do not deploy to hub by mistake) if ! get_target_cluster_kubeconfig "$TARGET_CLUSTER"; then - echo " ⚠️ Warning: Could not get kubeconfig for target cluster" - echo " Continuing with current context..." + echo " ❌ Error: Could not get kubeconfig for target cluster $TARGET_CLUSTER" + echo " Deployment must run against the primary/target cluster, not the hub." + echo " Ensure the hub can read the kubeconfig secret for $TARGET_CLUSTER (e.g. admin-kubeconfig in namespace $TARGET_CLUSTER)." 
+ exit 1 fi -# Check if we're on the right cluster +# Verify we're on the target cluster, not the hub CURRENT_CLUSTER=$(oc config view --minify -o jsonpath='{.contexts[0].context.cluster}' 2>/dev/null || echo "") echo "Current cluster context: $CURRENT_CLUSTER" echo "Target cluster for deployment: $TARGET_CLUSTER" +# Refuse if we're still on hub (in-cluster or local-cluster context) +if [[ "$CURRENT_CLUSTER" == "in-cluster" || "$CURRENT_CLUSTER" == "local-cluster" ]]; then + echo " ❌ Error: Current context is the hub (${CURRENT_CLUSTER}), not target $TARGET_CLUSTER. Refusing to deploy." + exit 1 +fi # Ensure the gitops-vms namespace exists on the target cluster echo "" @@ -495,16 +522,14 @@ else echo "" echo " Applying template to namespace: $VM_NAMESPACE..." - # Verify we're using the correct kubeconfig (target cluster) - if [[ -n "${KUBECONFIG:-}" && -f "$KUBECONFIG" ]]; then - CURRENT_CLUSTER=$(oc config view --minify -o jsonpath='{.contexts[0].context.cluster}' 2>/dev/null || echo "") - echo " Using kubeconfig: $KUBECONFIG" - echo " Current cluster context: $CURRENT_CLUSTER" - echo " Target cluster: $TARGET_CLUSTER" - else - echo " ⚠️ Warning: KUBECONFIG not set or file not found, using default context" - echo " This may apply to the wrong cluster!" + # Require that we are using the target cluster's kubeconfig (never apply to hub) + if [[ "${KUBECONFIG:-}" != "$WORK_DIR/target-kubeconfig.yaml" || ! -f "$WORK_DIR/target-kubeconfig.yaml" ]]; then + echo " ❌ Error: KUBECONFIG must point to target cluster ($TARGET_CLUSTER). Refusing to apply to avoid deploying to hub." 
+ exit 1 fi + CURRENT_CLUSTER=$(oc config view --minify -o jsonpath='{.contexts[0].context.cluster}' 2>/dev/null || echo "") + echo " Using kubeconfig: $KUBECONFIG (target: $TARGET_CLUSTER)" + echo " Current cluster context: $CURRENT_CLUSTER" # Now apply the template and capture both stdout, stderr, and exit code # The oc apply will use the KUBECONFIG set earlier (target cluster's kubeconfig) diff --git a/charts/hub/rdr/scripts/odf-dr-prerequisites-check.sh b/charts/hub/rdr/scripts/odf-dr-prerequisites-check.sh index b34ecc3..40c5cd2 100755 --- a/charts/hub/rdr/scripts/odf-dr-prerequisites-check.sh +++ b/charts/hub/rdr/scripts/odf-dr-prerequisites-check.sh @@ -3,10 +3,10 @@ set -euo pipefail echo "Starting ODF DR prerequisites check..." -# Configuration +# Configuration (PRIMARY_CLUSTER and SECONDARY_CLUSTER from values.yaml via env) HUB_CLUSTER="local-cluster" -PRIMARY_CLUSTER="ocp-primary" -SECONDARY_CLUSTER="ocp-secondary" +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" KUBECONFIG_DIR="/tmp/kubeconfigs" MAX_ATTEMPTS=120 # 2 hours with 1 minute intervals SLEEP_INTERVAL=60 # 1 minute between checks @@ -306,35 +306,35 @@ check_ca_material_completeness() { return 1 fi - # Look for primary cluster certificates - if [[ "$hub_ca_bundle" != *"# CA from ocp-primary-ca"* ]]; then - echo "Hub cluster CA bundle missing ocp-primary-ca certificate" + # Look for primary cluster certificates (marker from odf-ssl-certificate-extraction.sh) + if [[ "$hub_ca_bundle" != *"# CA from ${PRIMARY_CLUSTER}-ca"* ]]; then + echo "Hub cluster CA bundle missing ${PRIMARY_CLUSTER}-ca certificate" return 1 fi - if [[ "$primary_ca_bundle" != *"# CA from ocp-primary-ca"* ]]; then - echo "Primary cluster CA bundle missing ocp-primary-ca certificate" + if [[ "$primary_ca_bundle" != *"# CA from ${PRIMARY_CLUSTER}-ca"* ]]; then + echo "Primary cluster CA bundle missing ${PRIMARY_CLUSTER}-ca certificate" return 1 fi - if [[ 
"$secondary_ca_bundle" != *"# CA from ocp-primary-ca"* ]]; then - echo "Secondary cluster CA bundle missing ocp-primary-ca certificate" + if [[ "$secondary_ca_bundle" != *"# CA from ${PRIMARY_CLUSTER}-ca"* ]]; then + echo "Secondary cluster CA bundle missing ${PRIMARY_CLUSTER}-ca certificate" return 1 fi # Look for secondary cluster certificates - if [[ "$hub_ca_bundle" != *"# CA from ocp-secondary-ca"* ]]; then - echo "Hub cluster CA bundle missing ocp-secondary-ca certificate" + if [[ "$hub_ca_bundle" != *"# CA from ${SECONDARY_CLUSTER}-ca"* ]]; then + echo "Hub cluster CA bundle missing ${SECONDARY_CLUSTER}-ca certificate" return 1 fi - if [[ "$primary_ca_bundle" != *"# CA from ocp-secondary-ca"* ]]; then - echo "Primary cluster CA bundle missing ocp-secondary-ca certificate" + if [[ "$primary_ca_bundle" != *"# CA from ${SECONDARY_CLUSTER}-ca"* ]]; then + echo "Primary cluster CA bundle missing ${SECONDARY_CLUSTER}-ca certificate" return 1 fi - if [[ "$secondary_ca_bundle" != *"# CA from ocp-secondary-ca"* ]]; then - echo "Secondary cluster CA bundle missing ocp-secondary-ca certificate" + if [[ "$secondary_ca_bundle" != *"# CA from ${SECONDARY_CLUSTER}-ca"* ]]; then + echo "Secondary cluster CA bundle missing ${SECONDARY_CLUSTER}-ca certificate" return 1 fi diff --git a/charts/hub/rdr/scripts/submariner-prerequisites-check.sh b/charts/hub/rdr/scripts/submariner-prerequisites-check.sh index cdcb518..389c85e 100755 --- a/charts/hub/rdr/scripts/submariner-prerequisites-check.sh +++ b/charts/hub/rdr/scripts/submariner-prerequisites-check.sh @@ -3,9 +3,9 @@ set -euo pipefail echo "Starting Submariner prerequisites check..." 
-# Configuration -PRIMARY_CLUSTER="ocp-primary" -SECONDARY_CLUSTER="ocp-secondary" +# Configuration (PRIMARY_CLUSTER and SECONDARY_CLUSTER from values.yaml via env) +PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" +SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" KUBECONFIG_DIR="/tmp/kubeconfigs" MAX_ATTEMPTS=120 # 2 hours with 1 minute intervals SLEEP_INTERVAL=60 # 1 minute between checks diff --git a/charts/hub/rdr/templates/_helpers.tpl b/charts/hub/rdr/templates/_helpers.tpl new file mode 100644 index 0000000..3f80695 --- /dev/null +++ b/charts/hub/rdr/templates/_helpers.tpl @@ -0,0 +1,150 @@ +{{/* + Sanitize install_config for OpenShift installer: ensure apiVersion, pass through all + install-config fields (including full platform.aws: region, subnets, userTags, amiID, + defaultMachinePlatform, serviceEndpoints, etc.) so regionalDR and clusterOverrides + can override platform/region effectively. Only strip keys known invalid for the + installer (e.g. vpc in platform.aws). +*/}} +{{- define "rdr.sanitizeInstallConfig" -}} +{{- $raw := . 
-}} +{{- $withVersion := merge (dict "apiVersion" "v1") $raw -}} +{{- $platform := index $withVersion "platform" | default dict -}} +{{- $aws := index $platform "aws" | default dict -}} +{{- /* Pass through full platform.aws (region, subnets, userTags, amiID, defaultMachinePlatform, serviceEndpoints, etc.); omit only known-invalid keys like vpc */ -}} +{{- $awsSafe := ternary (omit $aws "vpc") $aws (and (kindIs "map" $aws) (hasKey $aws "vpc")) -}} +{{- $platformSafe := merge $platform (dict "aws" $awsSafe) -}} +{{- $allowed := dict "apiVersion" (index $withVersion "apiVersion") "baseDomain" (index $withVersion "baseDomain") "metadata" (index $withVersion "metadata") "controlPlane" (index $withVersion "controlPlane") "compute" (index $withVersion "compute") "networking" (index $withVersion "networking") "platform" $platformSafe "publish" (index $withVersion "publish") "pullSecret" (index $withVersion "pullSecret") "sshKey" (index $withVersion "sshKey") -}} +{{- $allowed | toJson -}} +{{- end -}} + +{{/* + Deep-merge install_config so clusterOverrides can override only platform/region, + metadata, or any subset without replacing the rest of base install_config. + Call with dict "base" "over" . 
+*/}} +{{- define "rdr.mergeInstallConfig" -}} +{{- $base := .base | default dict -}} +{{- $over := .over | default dict -}} +{{- /* Sprig merge: first dict wins; put over first so override wins */ -}} +{{- $merged := merge $over $base -}} +{{- $metadataMerged := merge (index $over "metadata" | default dict) (index $base "metadata" | default dict) -}} +{{- $merged := merge $merged (dict "metadata" $metadataMerged) -}} +{{- $platformBase := index $base "platform" | default dict -}} +{{- $platformOver := index $over "platform" | default dict -}} +{{- $awsBase := index $platformBase "aws" | default dict -}} +{{- $awsOver := index $platformOver "aws" | default dict -}} +{{- $awsMerged := merge $awsOver $awsBase -}} +{{- $platformFinal := merge $platformBase (dict "aws" $awsMerged) -}} +{{- merge $merged (dict "platform" $platformFinal) | toJson -}} +{{- end -}} + +{{/* + Effective primary cluster: merge of regionalDR[0].clusters.primary and clusterOverrides.primary. + Use when clusterOverrides is set to avoid replacing full regionalDR in override file. + Call with a context that has .Values and optionally .primaryOverrideInstallConfig (override install_config); + if primaryOverrideInstallConfig is not provided, falls back to .Values.clusterOverrides.primary.install_config. +*/}} +{{- define "rdr.effectivePrimaryCluster" -}} +{{- $dr := index .Values.regionalDR 0 -}} +{{- $over := index (.Values.clusterOverrides | default dict) "primary" | default dict -}} +{{- $base := $dr.clusters.primary -}} +{{- $baseIC := $base.install_config | default dict -}} +{{- /* When values-hub (or similar) replaces regionalDR with minimal structure, base has no install_config; use chart default */ -}} +{{- if and (index . "Files") (not (hasKey $baseIC "controlPlane")) -}} +{{- $baseIC = fromJson ((index . "Files").Get "files/default-primary-install-config.json") | default dict -}} +{{- end -}} +{{- $overIC := index . 
"primaryOverrideInstallConfig" | default $over.install_config | default dict -}} +{{- /* Shallow merge: over wins. Deep-merge metadata, platform.aws, controlPlane, compute so over wins but base keeps machine types when over is partial. */ -}} +{{- $merged := merge $overIC $baseIC -}} +{{- $metadataMerged := merge (index $overIC "metadata" | default dict) (index $baseIC "metadata" | default dict) -}} +{{- $merged := merge $merged (dict "metadata" $metadataMerged) -}} +{{- $platformBase := index $baseIC "platform" | default dict -}} +{{- $awsBase := index $platformBase "aws" | default dict -}} +{{- $awsOver := index (index $overIC "platform" | default dict) "aws" | default dict -}} +{{- $awsMerged := merge $awsOver $awsBase -}} +{{- $platformFinal := merge $platformBase (dict "aws" $awsMerged) -}} +{{- $merged := merge $merged (dict "platform" $platformFinal) -}} +{{- /* Deep-merge controlPlane so override can set platform.aws.type without dropping base name/replicas */ -}} +{{- $cpBase := index $baseIC "controlPlane" | default dict -}} +{{- $cpOver := index $overIC "controlPlane" | default dict -}} +{{- $cpMerged := merge $cpOver $cpBase -}} +{{- $cpPlatformBase := index $cpBase "platform" | default dict -}} +{{- $cpPlatformOver := index $cpOver "platform" | default dict -}} +{{- $cpAwsBase := index $cpPlatformBase "aws" | default dict -}} +{{- $cpAwsOver := index $cpPlatformOver "aws" | default dict -}} +{{- $cpAwsMerged := merge $cpAwsOver $cpAwsBase -}} +{{- $cpPlatformFinal := merge $cpPlatformBase (dict "aws" $cpAwsMerged) -}} +{{- $controlPlaneFinal := merge $cpMerged (dict "platform" $cpPlatformFinal) -}} +{{- $merged := merge $merged (dict "controlPlane" $controlPlaneFinal) -}} +{{- /* Compute: override list wins if non-empty; else use base so base machine types are kept */ -}} +{{- $computeBase := index $baseIC "compute" | default list -}} +{{- $computeOver := index $overIC "compute" | default list -}} +{{- $computeFinal := ternary $computeOver $computeBase 
(gt (len $computeOver) 0) -}} +{{- $installConfig := merge $merged (dict "compute" $computeFinal) -}} +{{- $installConfigSafe := fromJson (include "rdr.sanitizeInstallConfig" $installConfig) -}} +{{- $defaultBaseDomain := join "." (slice (splitList "." (.Values.global.clusterDomain | default "cluster.example.com")) 1) -}} +{{- $installConfigWithBase := merge $installConfigSafe (dict "baseDomain" ($defaultBaseDomain | default (index $installConfigSafe "baseDomain"))) -}} +{{- $clusterGroup := index $over "clusterGroup" | default $base.clusterGroup | default $dr.name -}} +{{- dict "name" (index $over "name" | default $base.name) "version" (index $over "version" | default $base.version) "clusterGroup" $clusterGroup "install_config" $installConfigWithBase | toJson -}} +{{- end -}} + +{{/* + Effective secondary cluster: merge of regionalDR[0].clusters.secondary and clusterOverrides.secondary. + Call with a context that has .Values and optionally .secondaryOverrideInstallConfig. +*/}} +{{- define "rdr.effectiveSecondaryCluster" -}} +{{- $dr := index .Values.regionalDR 0 -}} +{{- $over := index (.Values.clusterOverrides | default dict) "secondary" | default dict -}} +{{- $base := $dr.clusters.secondary -}} +{{- $baseIC := $base.install_config | default dict -}} +{{- if and (index . "Files") (not (hasKey $baseIC "controlPlane")) -}} +{{- $baseIC = fromJson ((index . "Files").Get "files/default-secondary-install-config.json") | default dict -}} +{{- end -}} +{{- $overIC := index . 
"secondaryOverrideInstallConfig" | default $over.install_config | default dict -}} +{{- $merged := merge $overIC $baseIC -}} +{{- $metadataMerged := merge (index $overIC "metadata" | default dict) (index $baseIC "metadata" | default dict) -}} +{{- $merged := merge $merged (dict "metadata" $metadataMerged) -}} +{{- $platformBase := index $baseIC "platform" | default dict -}} +{{- $awsBase := index $platformBase "aws" | default dict -}} +{{- $awsOver := index (index $overIC "platform" | default dict) "aws" | default dict -}} +{{- $awsMerged := merge $awsOver $awsBase -}} +{{- $platformFinal := merge $platformBase (dict "aws" $awsMerged) -}} +{{- $merged := merge $merged (dict "platform" $platformFinal) -}} +{{- $cpBase := index $baseIC "controlPlane" | default dict -}} +{{- $cpOver := index $overIC "controlPlane" | default dict -}} +{{- $cpMerged := merge $cpOver $cpBase -}} +{{- $cpPlatformBase := index $cpBase "platform" | default dict -}} +{{- $cpPlatformOver := index $cpOver "platform" | default dict -}} +{{- $cpAwsBase := index $cpPlatformBase "aws" | default dict -}} +{{- $cpAwsOver := index $cpPlatformOver "aws" | default dict -}} +{{- $cpAwsMerged := merge $cpAwsOver $cpAwsBase -}} +{{- $cpPlatformFinal := merge $cpPlatformBase (dict "aws" $cpAwsMerged) -}} +{{- $controlPlaneFinal := merge $cpMerged (dict "platform" $cpPlatformFinal) -}} +{{- $merged := merge $merged (dict "controlPlane" $controlPlaneFinal) -}} +{{- $computeBase := index $baseIC "compute" | default list -}} +{{- $computeOver := index $overIC "compute" | default list -}} +{{- $computeFinal := ternary $computeOver $computeBase (gt (len $computeOver) 0) -}} +{{- $installConfig := merge $merged (dict "compute" $computeFinal) -}} +{{- $installConfigSafe := fromJson (include "rdr.sanitizeInstallConfig" $installConfig) -}} +{{- $defaultBaseDomain := join "." (slice (splitList "." 
(.Values.global.clusterDomain | default "cluster.example.com")) 1) -}} +{{- $installConfigWithBase := merge $installConfigSafe (dict "baseDomain" ($defaultBaseDomain | default (index $installConfigSafe "baseDomain"))) -}} +{{- $clusterGroup := index $over "clusterGroup" | default $base.clusterGroup | default $dr.name -}} +{{- dict "name" (index $over "name" | default $base.name) "version" (index $over "version" | default $base.version) "clusterGroup" $clusterGroup "install_config" $installConfigWithBase | toJson -}} +{{- end -}} + +{{/* Primary cluster name for use in drpc, jobs, etc. */}} +{{- define "rdr.primaryClusterName" -}} +{{- $dr := index .Values.regionalDR 0 -}} +{{- index (index (.Values.clusterOverrides | default dict) "primary" | default dict) "name" | default $dr.clusters.primary.name -}} +{{- end -}} + +{{/* Secondary cluster name */}} +{{- define "rdr.secondaryClusterName" -}} +{{- $dr := index .Values.regionalDR 0 -}} +{{- index (index (.Values.clusterOverrides | default dict) "secondary" | default dict) "name" | default $dr.clusters.secondary.name -}} +{{- end -}} + +{{/* Preferred cluster for DRPC (default: primary). Override via values.drpc.preferredCluster. */}} +{{- define "rdr.preferredClusterName" -}} +{{- (index (.Values.drpc | default dict) "preferredCluster") | default (include "rdr.primaryClusterName" .) -}} +{{- end -}} diff --git a/charts/hub/rdr/templates/cluster_deployments.yaml b/charts/hub/rdr/templates/cluster_deployments.yaml index 4ad029b..f1221fb 100644 --- a/charts/hub/rdr/templates/cluster_deployments.yaml +++ b/charts/hub/rdr/templates/cluster_deployments.yaml @@ -1,7 +1,14 @@ -{{- range .Values.regionalDR }} -{{ $clusterSet := .name }} -{{- range list .clusters.primary .clusters.secondary }} -{{ $cluster := . 
}} +{{- $dr := index .Values.regionalDR 0 }} +{{- $clusterSet := $dr.name }} +{{- $co := .Values.clusterOverrides | default dict -}} +{{- $ctx := dict "Values" .Values "Files" .Files "primaryOverrideInstallConfig" (index (index $co "primary" | default dict) "install_config" | default dict) "secondaryOverrideInstallConfig" (index (index $co "secondary" | default dict) "install_config" | default dict) }} +{{- $effectivePrimary := include "rdr.effectivePrimaryCluster" $ctx | fromJson }} +{{- $effectiveSecondary := include "rdr.effectiveSecondaryCluster" $ctx | fromJson }} +{{- $defaultBaseDomain := join "." (slice (splitList "." (.Values.global.clusterDomain | default "cluster.example.com")) 1) }} +{{- range list $effectivePrimary $effectiveSecondary }} +{{- $cluster := . }} +{{- $baseDomainRaw := index $cluster.install_config "baseDomain" }} +{{- $baseDomainStr := default $defaultBaseDomain (and (kindIs "string" $baseDomainRaw) $baseDomainRaw) }} --- apiVersion: v1 kind: Namespace @@ -104,7 +111,7 @@ data: install-config.yaml: {{ tpl $install_config $ | b64enc }} --- -{{- $domainLabels := splitList "." (tpl $cluster.install_config.baseDomain $) }} +{{- $domainLabels := splitList "." (tpl $baseDomainStr $) }} {{- $baseDomain := join "." 
(slice $domainLabels 1) }} apiVersion: hive.openshift.io/v1 kind: ClusterDeployment @@ -115,6 +122,7 @@ metadata: purpose: regionalDR vendor: OpenShift cluster.open-cluster-management.io/clusterset: {{ $clusterSet }} + clusterGroup: {{ $cluster.clusterGroup | default $clusterSet }} annotations: argocd.argoproj.io/sync-wave: "1" spec: @@ -146,7 +154,7 @@ metadata: name: {{ $cluster.name }} vendor: OpenShift cluster.open-cluster-management.io/clusterset: {{ $clusterSet }} - clusterGroup: {{ $cluster.clusterGroup }} + clusterGroup: {{ $cluster.clusterGroup | default $clusterSet }} purpose: regionalDR name: {{ $cluster.name }} annotations: @@ -167,6 +175,9 @@ spec: clusterNamespace: {{ $cluster.name }} clusterLabels: vendor: OpenShift + cluster.open-cluster-management.io/clusterset: {{ $clusterSet }} + purpose: regionalDR + clusterGroup: {{ $cluster.clusterGroup | default $clusterSet }} applicationManager: enabled: true policyController: @@ -178,4 +189,3 @@ spec: iamPolicyController: enabled: true {{- end }} -{{- end }} diff --git a/charts/hub/rdr/templates/dr_policy.yaml b/charts/hub/rdr/templates/dr_policy.yaml index e33cb09..9b4aaaf 100644 --- a/charts/hub/rdr/templates/dr_policy.yaml +++ b/charts/hub/rdr/templates/dr_policy.yaml @@ -16,8 +16,8 @@ metadata: cluster.open-cluster-management.io/backup: ramen spec: drClusters: - - {{ $clusters.primary.name }} - - {{ $clusters.secondary.name }} + - {{ include "rdr.primaryClusterName" $ }} + - {{ include "rdr.secondaryClusterName" $ }} schedulingInterval: {{ .interval }} {{- if .vmSupport }} replicationClassSelector: diff --git a/charts/hub/rdr/templates/drpc.yaml b/charts/hub/rdr/templates/drpc.yaml index 0f59902..2fef27a 100644 --- a/charts/hub/rdr/templates/drpc.yaml +++ b/charts/hub/rdr/templates/drpc.yaml @@ -1,38 +1,51 @@ +{{- $drpc := .Values.drpc | default dict }} --- apiVersion: ramendr.openshift.io/v1alpha1 kind: DRPlacementControl metadata: annotations: - drplacementcontrol.ramendr.openshift.io/app-namespace: 
openshift-dr-ops + drplacementcontrol.ramendr.openshift.io/app-namespace: {{ $drpc.namespace | default "openshift-dr-ops" }} argocd.argoproj.io/sync-wave: "9" labels: cluster.open-cluster-management.io/backup: ramen - name: gitops-vm-protection - namespace: openshift-dr-ops + name: {{ $drpc.name | default "gitops-vm-protection" }} + namespace: {{ $drpc.namespace | default "openshift-dr-ops" }} spec: drPolicyRef: apiVersion: ramendr.openshift.io/v1alpha1 kind: DRPolicy - name: 2m-vm + name: {{ index $drpc "drPolicyRef" "name" | default "2m-vm" }} kubeObjectProtection: - captureInterval: 2m0s + captureInterval: {{ index $drpc "kubeObjectProtection" "captureInterval" | default "2m0s" }} kubeObjectSelector: matchExpressions: + {{- with index $drpc "kubeObjectProtection" "kubeObjectSelector" "matchExpressions" }} + {{- toYaml . | nindent 6 }} + {{- else }} - key: drprotection operator: In values: - "true" + {{- end }} placementRef: apiVersion: cluster.open-cluster-management.io/v1beta1 kind: Placement - name: gitops-vm-protection-placement-1 - namespace: openshift-dr-ops - preferredCluster: ocp-primary + name: {{ index $drpc "placementRef" "name" | default "gitops-vm-protection-placement-1" }} + namespace: {{ index $drpc "placementRef" "namespace" | default "openshift-dr-ops" }} + preferredCluster: {{ include "rdr.preferredClusterName" . }} protectedNamespaces: + {{- with index $drpc "protectedNamespaces" }} + {{- toYaml . | nindent 2 }} + {{- else }} - gitops-vms + {{- end }} pvcSelector: matchExpressions: + {{- with index $drpc "pvcSelector" "matchExpressions" }} + {{- toYaml . 
| nindent 6 }} + {{- else }} - key: app.kubernetes.io/component operator: In values: - storage + {{- end }} \ No newline at end of file diff --git a/charts/hub/rdr/templates/job-drcluster-validation.yaml b/charts/hub/rdr/templates/job-drcluster-validation.yaml index 9da52ba..18260d3 100644 --- a/charts/hub/rdr/templates/job-drcluster-validation.yaml +++ b/charts/hub/rdr/templates/job-drcluster-validation.yaml @@ -34,9 +34,9 @@ spec: echo "Starting DRCluster validation check..." - # Configuration - PRIMARY_CLUSTER="{{ $clusters.primary.name }}" - SECONDARY_CLUSTER="{{ $clusters.secondary.name }}" + # Configuration (clusterOverrides or regionalDR) + PRIMARY_CLUSTER="{{ include "rdr.primaryClusterName" $ }}" + SECONDARY_CLUSTER="{{ include "rdr.secondaryClusterName" $ }}" DRPOLICY_NAME="{{ $name }}" MAX_ATTEMPTS=120 # 2 hours with 1 minute intervals SLEEP_INTERVAL=60 # 1 minute between checks @@ -105,10 +105,20 @@ spec: done echo "❌ DRCluster validation check failed after $MAX_ATTEMPTS attempts" - echo "Please ensure:" + echo "" + echo "If the message was 'DRClusterConfig is not applied to cluster':" + echo " Ramen has not yet applied the DR config to that managed cluster (e.g. via ManifestWork)." + echo " - Ensure the ODF/Ramen DR operator is installed on the hub and DRPolicy/DRCluster exist." + echo " - Ensure both clusters are joined (ManagedCluster) and available (hub can reach them)." + echo " - On the hub, check ManifestWorks in each cluster namespace for Ramen/DRClusterConfig:" + echo " oc get manifestwork -n $PRIMARY_CLUSTER" + echo " oc get manifestwork -n $SECONDARY_CLUSTER" + echo " - Check Ramen controller logs on the hub if ManifestWorks are missing or failing." + echo "" + echo "General checks:" echo "1. DRPolicy $DRPOLICY_NAME is created and properly configured" echo "2. Both clusters ($PRIMARY_CLUSTER and $SECONDARY_CLUSTER) are available and joined" - echo "3. DR operator is installed and running on both clusters" + echo "3. 
DR operator (Ramen/ODF) is installed and running on the hub and on both clusters" echo "4. ODF is properly configured on both clusters" echo "" echo "Current DRCluster status:" diff --git a/charts/hub/rdr/templates/job-edge-gitops-vms-deploy.yaml b/charts/hub/rdr/templates/job-edge-gitops-vms-deploy.yaml index 294ca3e..9ba9910 100644 --- a/charts/hub/rdr/templates/job-edge-gitops-vms-deploy.yaml +++ b/charts/hub/rdr/templates/job-edge-gitops-vms-deploy.yaml @@ -60,6 +60,12 @@ spec: env: - name: KUBECONFIG value: "" + - name: PRIMARY_CLUSTER + value: {{ include "rdr.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "rdr.secondaryClusterName" . | quote }} + - name: HELM_CHART_VERSION + value: {{ index (index .Values "edgeGitopsVms" | default dict) "chartVersion" | default "0.2.10" | quote }} volumes: - name: values configMap: diff --git a/charts/hub/rdr/templates/job-odf-dr-prerequisites.yaml b/charts/hub/rdr/templates/job-odf-dr-prerequisites.yaml index 37f4dfc..dab05e0 100644 --- a/charts/hub/rdr/templates/job-odf-dr-prerequisites.yaml +++ b/charts/hub/rdr/templates/job-odf-dr-prerequisites.yaml @@ -17,6 +17,11 @@ spec: containers: - name: prerequisites-checker image: registry.redhat.io/openshift4/ose-cli:latest + env: + - name: PRIMARY_CLUSTER + value: {{ include "rdr.primaryClusterName" . | quote }} + - name: SECONDARY_CLUSTER + value: {{ include "rdr.secondaryClusterName" . | quote }} command: - /bin/bash - -c diff --git a/charts/hub/rdr/templates/job-submariner-prerequisites.yaml b/charts/hub/rdr/templates/job-submariner-prerequisites.yaml index 59159a0..5c3dd7a 100644 --- a/charts/hub/rdr/templates/job-submariner-prerequisites.yaml +++ b/charts/hub/rdr/templates/job-submariner-prerequisites.yaml @@ -17,6 +17,11 @@ spec: containers: - name: prerequisites-checker image: registry.redhat.io/openshift4/ose-cli:latest + env: + - name: PRIMARY_CLUSTER + value: {{ include "rdr.primaryClusterName" . 
| quote }} + - name: SECONDARY_CLUSTER + value: {{ include "rdr.secondaryClusterName" . | quote }} command: - /bin/bash - -c diff --git a/charts/hub/rdr/templates/mirrorpeer_create.yaml b/charts/hub/rdr/templates/mirrorpeer_create.yaml index 59635b7..3d01617 100644 --- a/charts/hub/rdr/templates/mirrorpeer_create.yaml +++ b/charts/hub/rdr/templates/mirrorpeer_create.yaml @@ -12,11 +12,11 @@ metadata: argocd.argoproj.io/sync-wave: "8" spec: items: - - clusterName: {{ $clusters.primary.name }} + - clusterName: {{ include "rdr.primaryClusterName" $ }} storageClusterRef: name: ocs-storagecluster namespace: openshift-storage - - clusterName: {{ $clusters.secondary.name }} + - clusterName: {{ include "rdr.secondaryClusterName" $ }} storageClusterRef: name: ocs-storagecluster namespace: openshift-storage diff --git a/charts/hub/rdr/templates/submariner_addon_install.yaml b/charts/hub/rdr/templates/submariner_addon_install.yaml index 0e25a5a..2b82b4a 100644 --- a/charts/hub/rdr/templates/submariner_addon_install.yaml +++ b/charts/hub/rdr/templates/submariner_addon_install.yaml @@ -1,18 +1,19 @@ -{{- range .Values.regionalDR }} -{{ $globalnetEnabled := .globalnetEnabled }} +{{- $dr := index .Values.regionalDR 0 }} +{{- $clusterSet := $dr.name }} +{{- $globalnetEnabled := $dr.globalnetEnabled }} --- apiVersion: v1 kind: Namespace metadata: annotations: argocd.argoproj.io/sync-wave: "3" - name: {{ .name }}-broker + name: {{ $clusterSet }}-broker --- apiVersion: submariner.io/v1alpha1 kind: Broker metadata: name: submariner-broker - namespace: {{ .name }}-broker + namespace: {{ $clusterSet }}-broker labels: cluster.open-cluster-management.io/backup: submariner annotations: @@ -21,8 +22,10 @@ spec: globalnetEnabled: {{ $globalnetEnabled | default false }} -{{- range list .clusters.primary .clusters.secondary }} -{{ $cluster := . }} +{{- $effectivePrimary := include "rdr.effectivePrimaryCluster" . 
| fromJson }} +{{- $effectiveSecondary := include "rdr.effectiveSecondaryCluster" . | fromJson }} +{{- range list $effectivePrimary $effectiveSecondary }} +{{- $cluster := . }} --- apiVersion: addon.open-cluster-management.io/v1alpha1 kind: ManagedClusterAddOn @@ -54,4 +57,3 @@ spec: name: {{ $cluster.name }}-cluster-aws-creds {{- end }} -{{- end }} diff --git a/charts/hub/rdr/values.yaml b/charts/hub/rdr/values.yaml index 9e1f5e5..67f0f5b 100644 --- a/charts/hub/rdr/values.yaml +++ b/charts/hub/rdr/values.yaml @@ -2,6 +2,53 @@ global: clusterDomain: cluster.example.com +# Edge GitOps VMs deployment (script and helm chart) +edgeGitopsVms: + chartVersion: "0.2.10" + +# Minimal overrides for cluster names, versions, regions (e.g. overrides/values-cluster-names.yaml). +# Merge override file FIRST so full regionalDR stays here and only clusterOverrides are applied. +# clusterOverrides: +# primary: +# name: ocp-primary +# version: 4.18.7 +# install_config: +# metadata: +# name: ocp-primary +# platform: +# aws: +# region: us-west-1 +# secondary: +# name: ocp-secondary +# ... 
+
+# DRPlacementControl (gitops-vm-protection) - override in values-cluster-names or values
+drpc:
+  name: gitops-vm-protection
+  namespace: openshift-dr-ops
+  drPolicyRef:
+    name: 2m-vm
+  kubeObjectProtection:
+    captureInterval: 2m0s
+    kubeObjectSelector:
+      matchExpressions:
+        - key: drprotection
+          operator: In
+          values:
+            - "true"
+  placementRef:
+    name: gitops-vm-protection-placement-1
+    namespace: openshift-dr-ops
+  protectedNamespaces:
+    - gitops-vms
+  pvcSelector:
+    matchExpressions:
+      - key: app.kubernetes.io/component
+        operator: In
+        values:
+          - storage
+  # preferredCluster: ocp-primary  # leave unset so the DRPC follows the effective primary cluster name (rdr.primaryClusterName); set only to pin a specific cluster
+
 regionalDR:
   - name: resilient # Matches with ClusterSet
     globalnetEnabled: false # Support for overlapping CIDR
diff --git a/overrides/values-cluster-names.yaml b/overrides/values-cluster-names.yaml
new file mode 100644
index 0000000..31e1a1f
--- /dev/null
+++ b/overrides/values-cluster-names.yaml
@@ -0,0 +1,29 @@
+---
+# Minimal override for DR cluster names, versions, and regions.
+# This file is applied after chart defaults (override merged last). It only contains overlay keys
+# (clusterOverrides, drpc) so chart regionalDR defaults are preserved.
+# Also works if merged first: -f overrides/values-cluster-names.yaml -f charts/hub/rdr/values.yaml + +clusterOverrides: + primary: + name: ocp-primary + version: 4.18.7 + install_config: + metadata: + name: ocp-primary + platform: + aws: + region: us-west-1 + secondary: + name: ocp-secondary + version: 4.18.7 + install_config: + metadata: + name: ocp-secondary + platform: + aws: + region: us-east-1 + +# DRPC overrides (preferredCluster defaults to primary if unset) +drpc: + preferredCluster: ocp-primary diff --git a/overrides/values-minimal-regional-dr.yaml b/overrides/values-minimal-regional-dr.yaml new file mode 100644 index 0000000..5a572bf --- /dev/null +++ b/overrides/values-minimal-regional-dr.yaml @@ -0,0 +1,10 @@ +--- +# Simulates old values-hub when it had a minimal regionalDR (no install_config). +# Used only for testing: chart falls back to files/default-*-install-config.json. +regionalDR: + - name: resilient + clusters: + primary: + name: ocp-primary + secondary: + name: ocp-secondary diff --git a/scripts/cleanup-gitops-vms-non-primary.sh b/scripts/cleanup-gitops-vms-non-primary.sh index 9035233..43f120c 100755 --- a/scripts/cleanup-gitops-vms-non-primary.sh +++ b/scripts/cleanup-gitops-vms-non-primary.sh @@ -3,7 +3,7 @@ set -euo pipefail # Script to manually cleanup gitops-vms namespace on the non-primary cluster # This script will: -# 1. Determine the non-primary cluster (ocp-secondary by default) +# 1. Determine the non-primary cluster (discovered from DR policy; override with PRIMARY_CLUSTER/SECONDARY_CLUSTER env if needed) # 2. Render the helm template with the same chart version and values # 3. Extract resource kinds and names # 4. 
Delete them from the gitops-vms namespace diff --git a/scripts/download-kubeconfigs.sh b/scripts/download-kubeconfigs.sh index 7f03484..26ca345 100755 --- a/scripts/download-kubeconfigs.sh +++ b/scripts/download-kubeconfigs.sh @@ -120,12 +120,14 @@ show_usage() { echo "" echo "Examples:" echo " $0 # Download all managed cluster kubeconfigs" - echo " $0 -c ocp-primary # Download only ocp-primary kubeconfig" + echo " $0 -c \${PRIMARY_CLUSTER:-ocp-primary} # Download only primary cluster kubeconfig (set PRIMARY_CLUSTER to match values.yaml)" echo " $0 -o /tmp/kubeconfigs # Download to /tmp/kubeconfigs directory" echo " $0 --dry-run # Show what would be downloaded" echo "" echo "Environment variables:" echo " KUBECONFIG # Kubeconfig for hub cluster (if not using current context)" + echo " PRIMARY_CLUSTER # Primary DR cluster name (default: ocp-primary; match values.yaml)" + echo " SECONDARY_CLUSTER # Secondary DR cluster name (default: ocp-secondary; match values.yaml)" } # Main function diff --git a/scripts/extract-cluster-cas.sh b/scripts/extract-cluster-cas.sh index 94f3843..69c076b 100755 --- a/scripts/extract-cluster-cas.sh +++ b/scripts/extract-cluster-cas.sh @@ -108,18 +108,19 @@ main() { ((cluster_index++)) done - # For the specific clusters in your configuration (ocp-primary, ocp-secondary) - # These would need to be extracted when the clusters are available + # For the specific clusters in your configuration (set PRIMARY_CLUSTER/SECONDARY_CLUSTER to match values.yaml) + PRIMARY_CLUSTER="${PRIMARY_CLUSTER:-ocp-primary}" + SECONDARY_CLUSTER="${SECONDARY_CLUSTER:-ocp-secondary}" echo "" echo "CA certificate extraction completed." 
echo "Certificates are stored in: $CA_OUTPUT_DIR" echo "" echo "To extract CAs from your specific clusters, run:" - echo " # For ocp-primary cluster:" - echo " oc --kubeconfig= get configmap -n openshift-config-managed trusted-ca-bundle -o jsonpath=\"{.data['ca-bundle\\.crt']}\" > $CA_OUTPUT_DIR/ocp-primary-ca.crt" + echo " # For $PRIMARY_CLUSTER cluster:" + echo " oc --kubeconfig= get configmap -n openshift-config-managed trusted-ca-bundle -o jsonpath=\"{.data['ca-bundle\\.crt']}\" > $CA_OUTPUT_DIR/${PRIMARY_CLUSTER}-ca.crt" echo "" - echo " # For ocp-secondary cluster:" - echo " oc --kubeconfig= get configmap -n openshift-config-managed trusted-ca-bundle -o jsonpath=\"{.data['ca-bundle\\.crt']}\" > $CA_OUTPUT_DIR/ocp-secondary-ca.crt" + echo " # For $SECONDARY_CLUSTER cluster:" + echo " oc --kubeconfig= get configmap -n openshift-config-managed trusted-ca-bundle -o jsonpath=\"{.data['ca-bundle\\.crt']}\" > $CA_OUTPUT_DIR/${SECONDARY_CLUSTER}-ca.crt" echo "" echo "Then update your values files with the CA data." } diff --git a/scripts/test-rdr-install-config.sh b/scripts/test-rdr-install-config.sh new file mode 100755 index 0000000..f294bdc --- /dev/null +++ b/scripts/test-rdr-install-config.sh @@ -0,0 +1,239 @@ +#!/usr/bin/env bash +# Test rdr chart install_config rendering in multiple merge scenarios. +# +# Scenarios: +# 1. Baseline: chart values only β†’ full install_config from chart regionalDR. +# 2. Chart + overrides/values-cluster-names.yaml β†’ overridden names/regions, full structure. +# 3. Chart + values-hub + overrides β†’ values-hub has no regionalDR, so chart regionalDR kept; overrides apply. +# 4. values-hub + overrides (no explicit chart -f) β†’ chart defaults still load; same as 3. +# 5. Minimal regionalDR + overrides β†’ simulates old values-hub with minimal regionalDR; chart uses +# files/default-*-install-config.json so install_config is still full; overrides apply. 
#
# Ensures all required fields are present (metadata, controlPlane, compute, networking, platform)
# in every scenario, and that overridden fields (metadata.name, platform.aws.region) match overrides when used.

set -e
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
CHART="$REPO_ROOT/charts/hub/rdr"
RDR_VALUES="$REPO_ROOT/charts/hub/rdr/values.yaml"
OVERRIDES="$REPO_ROOT/overrides/values-cluster-names.yaml"
VALUES_HUB="$REPO_ROOT/values-hub.yaml"
DOMAIN="${TEST_CLUSTER_DOMAIN:-example.com}"

# Minimal regionalDR (simulates old values-hub that replaced full regionalDR)
# This triggers the chart's default install_config files.
MINIMAL_REGIONAL_DR="$REPO_ROOT/overrides/values-minimal-regional-dr.yaml"

# Render the rdr chart with the given extra helm arguments (typically -f <values>).
# Globals:   CHART, DOMAIN (read)
# Arguments: any additional `helm template` flags
# Outputs:   rendered manifests on stdout. Helm's stderr stays hidden on success so
#            warnings do not clutter the report, but is replayed on stderr when helm
#            fails: this script runs under `set -e`, and a silently discarded error
#            would make the resulting abort impossible to diagnose.
# Returns:   helm's exit status
run_helm() {
  local err_file rc=0
  err_file=$(mktemp) || return
  helm template rdr "$CHART" "$@" --set "global.clusterDomain=$DOMAIN" 2>"$err_file" || rc=$?
  if [[ $rc -ne 0 ]]; then
    printf 'helm template failed (exit %s):\n' "$rc" >&2
    cat -- "$err_file" >&2
  fi
  rm -f -- "$err_file"
  return "$rc"
}

# Run helm; stdout+stderr to stdout, exit code preserved (caller can redirect and check $?)
run_helm_capture() {
  helm template rdr "$CHART" "$@" --set "global.clusterDomain=$DOMAIN" 2>&1
}

# Extract and decode install-config.yaml from the Nth Secret (1=primary, 2=secondary)
# Arguments: $1 - rendered helm output
#            $2 - 1-based index of the install-config.yaml entry (default 1)
# Outputs:   decoded install-config on stdout; nothing when the entry is missing
# Returns:   always 0
get_install_config() {
  local out="$1"
  local nth="${2:-1}"
  # Take the second field (the base64 payload) of the Nth "install-config.yaml:" line.
  # Decoding is deliberately best-effort: a missing or undecodable entry yields empty
  # output, which the callers' validation then reports as a failure.
  awk -v n="$nth" '/install-config\.yaml:/ { if (++seen == n) print $2 }' <<<"$out" \
    | base64 -d 2>/dev/null || true
}

# Validate decoded install_config has required structure (no empty compute, no null networking, etc.)
+validate_install_config() { + local yaml="$1" + local label="$2" + local err=0 + if echo "$yaml" | grep -q "compute: \[\]"; then + echo " FAIL $label: compute is empty []" + err=1 + fi + if echo "$yaml" | grep -q "networking: null"; then + echo " FAIL $label: networking is null" + err=1 + fi + if echo "$yaml" | grep -q "publish: null"; then + echo " FAIL $label: publish is null" + err=1 + fi + if echo "$yaml" | grep -qE "platform:\s*$" -A1 | grep -q "aws: {}"; then + echo " FAIL $label: platform.aws is empty {}" + err=1 + fi + if ! echo "$yaml" | grep -q "controlPlane:"; then + echo " FAIL $label: missing controlPlane" + err=1 + fi + if ! echo "$yaml" | grep -q "type: m5"; then + echo " FAIL $label: missing machine type (m5.4xlarge or m5.metal)" + err=1 + fi + if ! echo "$yaml" | grep -q "metadata:"; then + echo " FAIL $label: missing metadata" + err=1 + fi + if ! echo "$yaml" | grep -q "platform:"; then + echo " FAIL $label: missing platform" + err=1 + fi + if ! echo "$yaml" | grep -q "region:"; then + echo " FAIL $label: missing platform.aws.region" + err=1 + fi + if [[ $err -eq 0 ]]; then + echo " OK $label: required fields present" + fi + return $err +} + +# Return 0 if install_config YAML has nulled/empty sections (broken); 1 if full +is_install_config_broken() { + local yaml="$1" + if echo "$yaml" | grep -q "compute: \[\]"; then return 0; fi + if echo "$yaml" | grep -q "networking: null"; then return 0; fi + if ! echo "$yaml" | grep -q "controlPlane:"; then return 0; fi + if ! echo "$yaml" | grep -q "type: m5"; then return 0; fi + return 1 +} + +# Create minimal regionalDR override once (used for scenario 5) +ensure_minimal_regional_dr() { + if [[ ! -f "$MINIMAL_REGIONAL_DR" ]]; then + cat > "$MINIMAL_REGIONAL_DR" << 'EOF' +--- +# Simulates old values-hub when it had a minimal regionalDR (no install_config). +# Used only for testing: chart falls back to files/default-*-install-config.json. 
+regionalDR: + - name: resilient + clusters: + primary: + name: ocp-primary + secondary: + name: ocp-secondary +EOF + echo "Created $MINIMAL_REGIONAL_DR for testing." + fi +} + +main() { + local total_fail=0 + echo "=== RDR install_config rendering tests (domain=$DOMAIN) ===" + echo "" + + # Scenario 1: Baseline – chart defaults only + echo "--- Scenario 1: Baseline (chart values only) ---" + out=$(run_helm -f "$RDR_VALUES") + primary=$(get_install_config "$out" 1) + secondary=$(get_install_config "$out" 2) + validate_install_config "$primary" "primary (baseline)" || ((total_fail++)) + validate_install_config "$secondary" "secondary (baseline)" || ((total_fail++)) + pname=$(echo "$primary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}') + sname=$(echo "$secondary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}') + echo " Primary metadata.name: $pname" + echo " Secondary metadata.name: $sname" + [[ "$pname" == "ocp-primary" && "$sname" == "ocp-secondary" ]] || { echo " FAIL baseline: expected ocp-primary / ocp-secondary"; ((total_fail++)); } + echo "" + + # Scenario 2: Chart + overrides (values-cluster-names) + echo "--- Scenario 2: Chart + overrides/values-cluster-names.yaml ---" + out=$(run_helm -f "$RDR_VALUES" -f "$OVERRIDES") + primary=$(get_install_config "$out" 1) + secondary=$(get_install_config "$out" 2) + validate_install_config "$primary" "primary (chart+overrides)" || ((total_fail++)) + validate_install_config "$secondary" "secondary (chart+overrides)" || ((total_fail++)) + pname=$(echo "$primary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}') + sname=$(echo "$secondary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}') + preg=$(echo "$primary" | grep 'region:' | head -1 | awk '{print $2}') + sreg=$(echo "$secondary" | grep 'region:' | head -1 | awk '{print $2}') + echo " Primary metadata.name: $pname" + echo " Primary region: $preg" + echo " Secondary metadata.name: 
$sname" + echo " Secondary region: $sreg" + # Override file may use ocp-p/ocp-s or other names; just ensure regions are set + [[ -n "$preg" && -n "$sreg" ]] || { echo " FAIL chart+overrides: regions should be set"; ((total_fail++)); } + echo "" + + # Scenario 3: Chart + values-hub (no regionalDR) + overrides + echo "--- Scenario 3: Chart + values-hub + overrides ---" + out=$(run_helm -f "$RDR_VALUES" -f "$VALUES_HUB" -f "$OVERRIDES") + primary=$(get_install_config "$out" 1) + secondary=$(get_install_config "$out" 2) + validate_install_config "$primary" "primary (chart+hub+overrides)" || ((total_fail++)) + validate_install_config "$secondary" "secondary (chart+hub+overrides)" || ((total_fail++)) + echo " Primary metadata.name: $(echo "$primary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}')" + echo " Secondary metadata.name: $(echo "$secondary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}')" + echo "" + + # Scenario 4: values-hub + overrides only (no explicit chart values file; chart defaults still load) + echo "--- Scenario 4: values-hub + overrides (chart defaults implicit) ---" + out=$(run_helm -f "$VALUES_HUB" -f "$OVERRIDES") + primary=$(get_install_config "$out" 1) + secondary=$(get_install_config "$out" 2) + validate_install_config "$primary" "primary (hub+overrides)" || ((total_fail++)) + validate_install_config "$secondary" "secondary (hub+overrides)" || ((total_fail++)) + echo " Primary metadata.name: $(echo "$primary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}')" + echo " Secondary metadata.name: $(echo "$secondary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}')" + echo "" + + # Scenario 5: Minimal regionalDR (simulates old values-hub with regionalDR) + overrides β†’ uses default files + ensure_minimal_regional_dr + echo "--- Scenario 5: Minimal regionalDR + overrides (uses chart default install_config files) ---" + out=$(run_helm -f "$MINIMAL_REGIONAL_DR" -f 
"$OVERRIDES") + primary=$(get_install_config "$out" 1) + secondary=$(get_install_config "$out" 2) + validate_install_config "$primary" "primary (minimal regionalDR+overrides)" || ((total_fail++)) + validate_install_config "$secondary" "secondary (minimal regionalDR+overrides)" || ((total_fail++)) + echo " Primary metadata.name: $(echo "$primary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}')" + echo " Primary region: $(echo "$primary" | grep 'region:' | head -1 | awk '{print $2}')" + echo " Secondary metadata.name: $(echo "$secondary" | grep -A1 '^metadata:' | grep 'name:' | head -1 | awk '{print $2}')" + echo " Secondary region: $(echo "$secondary" | grep 'region:' | head -1 | awk '{print $2}')" + echo "" + + # --- Validate default JSON files are required when regionalDR is minimal --- + echo "--- Validate: default JSON files prevent nulled install_config when regionalDR is minimal ---" + DEFAULT_PRIMARY="$REPO_ROOT/charts/hub/rdr/files/default-primary-install-config.json" + DEFAULT_SECONDARY="$REPO_ROOT/charts/hub/rdr/files/default-secondary-install-config.json" + if [[ ! -f "$DEFAULT_PRIMARY" || ! -f "$DEFAULT_SECONDARY" ]]; then + echo " SKIP default JSON files not found (cannot run validation)" + else + ensure_minimal_regional_dr + # Temporarily hide default files so the chart cannot use them + mv "$DEFAULT_PRIMARY" "${DEFAULT_PRIMARY}.bak" + mv "$DEFAULT_SECONDARY" "${DEFAULT_SECONDARY}.bak" + tmpout=$(mktemp) + trap "mv -f '${DEFAULT_PRIMARY}.bak' '$DEFAULT_PRIMARY' 2>/dev/null; mv -f '${DEFAULT_SECONDARY}.bak' '$DEFAULT_SECONDARY' 2>/dev/null; rm -f '$tmpout'" EXIT + run_helm_capture -f "$MINIMAL_REGIONAL_DR" -f "$OVERRIDES" >"$tmpout" + helm_ret=$? 
+ out=$(cat "$tmpout") + # Restore files immediately so later tests or reruns work + mv -f "${DEFAULT_PRIMARY}.bak" "$DEFAULT_PRIMARY" 2>/dev/null || true + mv -f "${DEFAULT_SECONDARY}.bak" "$DEFAULT_SECONDARY" 2>/dev/null || true + trap - EXIT + rm -f "$tmpout" + + if [[ $helm_ret -ne 0 ]]; then + echo " OK Without default JSON files: helm template fails (exit $helm_ret) as expected." + else + primary_nojson=$(get_install_config "$out" 1) + if is_install_config_broken "$primary_nojson"; then + echo " OK Without default JSON files: install_config has nulled/empty sections (compute: [], networking: null, or missing types) as expected." + else + echo " FAIL Without default JSON files: install_config was still full; default files may be redundant." + ((total_fail++)) + fi + fi + echo " => Default JSON files are required when regionalDR is minimal (no install_config in base)." + fi + echo "" + + if [[ $total_fail -gt 0 ]]; then + echo "=== RESULT: $total_fail validation(s) failed ===" + exit 1 + fi + echo "=== RESULT: All scenarios passed ===" + exit 0 +} + +main "$@" diff --git a/scripts/update-ca-bundle.sh b/scripts/update-ca-bundle.sh index 9fc30b4..f0bfc3b 100755 --- a/scripts/update-ca-bundle.sh +++ b/scripts/update-ca-bundle.sh @@ -164,7 +164,7 @@ if [[ "${1:-}" == "--help" || "${1:-}" == "-h" ]]; then echo "Examples:" echo " $0 status # Check current status" echo " $0 add /path/to/ca.crt # Add CA from file" - echo " $0 extract ocp-primary # Extract CA from managed cluster" + echo " $0 extract \${PRIMARY_CLUSTER:-ocp-primary} # Extract CA from primary managed cluster (set PRIMARY_CLUSTER to match values.yaml)" echo " $0 update-all # Update with all managed cluster CAs" exit 0 fi
diff --git a/scripts/update-rdr-default-install-config-json.sh b/scripts/update-rdr-default-install-config-json.sh
new file mode 100755
index 0000000..71572be
--- /dev/null
+++ b/scripts/update-rdr-default-install-config-json.sh
@@ -0,0 +1,183 @@
+#!/usr/bin/env bash
+# Update charts/hub/rdr/files/default-*-install-config.json from the install_config
+# sections in charts/hub/rdr/values.yaml. Run this when you change machine types,
+# networking, platform, or other install_config in the rdr chart values.
+#
+# Usage: update-rdr-default-install-config-json.sh [--dry-run]
+#
+# Requires: Python 3 with PyYAML (pip install pyyaml) or yq.
+# Exit status: 0 on success, 1 on runtime errors, 2 on invalid arguments.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+RDR_CHART="$REPO_ROOT/charts/hub/rdr"
+VALUES_YAML="$RDR_CHART/values.yaml"
+# Static domain written in place of a templated ("{{ ... }}") baseDomain.
+DEFAULT_BASE_DOMAIN="cluster.example.com"
+OUT_PRIMARY="$RDR_CHART/files/default-primary-install-config.json"
+OUT_SECONDARY="$RDR_CHART/files/default-secondary-install-config.json"
+
+# Print an error message to stderr and exit with status 1.
+die() {
+  printf 'Error: %s\n' "$*" >&2
+  exit 1
+}
+
+# Print the usage text to stdout.
+print_usage() {
+  echo "Usage: $0 [--dry-run]"
+  echo "  Updates $OUT_PRIMARY and $OUT_SECONDARY from $VALUES_YAML"
+  echo "  --dry-run   Print what would be written, do not overwrite files."
+}
+
+# Print the usage text and exit 0 (used for -h/--help).
+usage() {
+  print_usage
+  exit 0
+}
+
+DRY_RUN=
+for arg in "$@"; do
+  case "$arg" in
+    --dry-run) DRY_RUN=1 ;;
+    -h|--help) usage ;;
+    *)
+      # Reject typos such as --dryrun instead of silently overwriting the files.
+      printf 'Error: unknown argument: %s\n' "$arg" >&2
+      print_usage >&2
+      exit 2
+      ;;
+  esac
+done
+
+[[ -f "$VALUES_YAML" ]] || die "$VALUES_YAML not found."
+
+# Generate both JSON files with Python/PyYAML. Preferred so we have one code path
+# and predictable JSON formatting.
+run_python() {
+  python3 - "$VALUES_YAML" "$DEFAULT_BASE_DOMAIN" "$OUT_PRIMARY" "$OUT_SECONDARY" "$DRY_RUN" << 'PY'
+import json
+import sys
+import yaml
+
+
+def fail(message):
+    sys.stderr.write("Error: %s\n" % message)
+    sys.exit(1)
+
+
+def main():
+    values_path, default_base_domain, out_primary, out_secondary = sys.argv[1:5]
+    dry_run = sys.argv[5] == "1"
+
+    with open(values_path, encoding="utf-8") as f:
+        data = yaml.safe_load(f)
+
+    try:
+        clusters = data["regionalDR"][0]["clusters"]
+        primary_ic = clusters["primary"]["install_config"]
+        secondary_ic = clusters["secondary"]["install_config"]
+    except (KeyError, IndexError, TypeError):
+        # IndexError: empty regionalDR list; TypeError: empty file or non-mapping node.
+        fail("could not find regionalDR[0].clusters.primary/secondary.install_config in values.yaml")
+
+    for name, ic in (("primary", primary_ic), ("secondary", secondary_ic)):
+        if not isinstance(ic, dict):
+            fail("regionalDR[0].clusters.%s.install_config is not a mapping" % name)
+
+    def normalize(ic):
+        # Deep copy and replace template baseDomain with static default
+        out = json.loads(json.dumps(ic))
+        if isinstance(out.get("baseDomain"), str) and "{{" in out["baseDomain"]:
+            out["baseDomain"] = default_base_domain
+        return out
+
+    primary = normalize(primary_ic)
+    secondary = normalize(secondary_ic)
+
+    opts = {"indent": 2, "sort_keys": False}
+    primary_json = json.dumps(primary, **opts)
+    secondary_json = json.dumps(secondary, **opts)
+
+    if dry_run:
+        print("--- primary (would write to", out_primary, ") ---")
+        print(primary_json)
+        print("--- secondary (would write to", out_secondary, ") ---")
+        print(secondary_json)
+        return
+
+    with open(out_primary, "w") as f:
+        f.write(primary_json)
+        f.write("\n")
+    with open(out_secondary, "w") as f:
+        f.write(secondary_json)
+        f.write("\n")
+    print("Wrote", out_primary)
+    print("Wrote", out_secondary)
+
+
+if __name__ == "__main__":
+    main()
+PY
+}
+
+if command -v python3 &>/dev/null && python3 -c "import yaml" 2>/dev/null; then
+  run_python
+  exit 0
+fi
+
+# Fallback: yq (if available)
+if command -v yq &>/dev/null; then
+  echo "Using yq (Python/PyYAML not available)."
+
+  # Temp file of the extraction in progress; cleanup_tmp removes it on any exit.
+  tmp=""
+  cleanup_tmp() {
+    [[ -z "$tmp" ]] || rm -f -- "$tmp" "${tmp}.2"
+  }
+  trap cleanup_tmp EXIT
+
+  # Extract one cluster's install_config as JSON.
+  #   $1 - cluster key under regionalDR[0].clusters (primary or secondary)
+  #   $2 - output file (only written when not in dry-run mode)
+  extract_one() {
+    local path="$1"
+    local out="$2"
+    tmp=$(mktemp)
+    # yq v4 syntax first, then the yq v3 syntax.
+    yq eval '.regionalDR[0].clusters.'"$path"'.install_config' "$VALUES_YAML" -o=json > "$tmp" 2>/dev/null || \
+      yq r -j "$VALUES_YAML" "regionalDR.0.clusters.$path.install_config" > "$tmp" 2>/dev/null || \
+      die "yq could not extract install_config. Try: pip install pyyaml && $0"
+    # yq can exit 0 and print "null" (or nothing) when the path does not exist.
+    if [[ ! -s "$tmp" || "$(tr -d '[:space:]' < "$tmp")" == "null" ]]; then
+      die "could not find regionalDR[0].clusters.$path.install_config in values.yaml"
+    fi
+    # Replace a templated baseDomain only, matching the Python code path.
+    if command -v jq &>/dev/null; then
+      jq --arg dom "$DEFAULT_BASE_DOMAIN" \
+        'if (.baseDomain | type) == "string" and (.baseDomain | tostring | contains("{{")) then .baseDomain = $dom else . end' \
+        "$tmp" > "${tmp}.2"
+    else
+      # No "sed -i": its syntax differs between GNU and BSD sed.
+      sed "s/\"{{ join.*}}\"/\"$DEFAULT_BASE_DOMAIN\"/" "$tmp" > "${tmp}.2"
+    fi
+    mv -f -- "${tmp}.2" "$tmp"
+    if [[ -n "$DRY_RUN" ]]; then
+      echo "--- $out (dry-run) ---"
+      cat "$tmp"
+    else
+      cp -- "$tmp" "$out"
+      echo "Wrote $out"
+    fi
+    rm -f -- "$tmp"
+    tmp=""
+  }
+
+  extract_one "primary" "$OUT_PRIMARY"
+  extract_one "secondary" "$OUT_SECONDARY"
+  exit 0
+fi
+
+die "Need Python 3 with PyYAML (pip install pyyaml) or yq to run this script."
diff --git a/values-hub.yaml b/values-hub.yaml index 8775dce..c06fbf5 100644 --- a/values-hub.yaml +++ b/values-hub.yaml @@ -12,12 +12,11 @@ clusterGroup: - openshift-dr-ops - openshift-storage - policies - # - regional-dr-trigger subscriptions: acm: name: advanced-cluster-management namespace: open-cluster-management - channel: release-2.13 + channel: release-2.14 odf-multicluster-orchestrator: name: odf-multicluster-orchestrator @@ -97,6 +96,8 @@ clusterGroup: - PrunePropagationPolicy=foreground - RespectIgnoreDifferences=true - ServerSideApply=true + extraValueFiles: + - '/overrides/values-cluster-names.yaml' ignoreDifferences: - group: operators.openshift.io kind: Console @@ -121,6 +122,8 @@ clusterGroup: maxDuration: 3m syncOptions: - RespectIgnoreDifferences=true + extraValueFiles: + - '/overrides/values-cluster-names.yaml' ignoreDifferences: # Prevent ArgoCD from pruning dynamically created OCM resources # These resources are created by OCM operators and should never be pruned