From f02caedc153e4cfc8ac9a6097a9453de21f21dac Mon Sep 17 00:00:00 2001
From: Gal Amado
Date: Wed, 11 Feb 2026 21:48:05 +0200
Subject: [PATCH 1/4] Support instance type c5.metal

---
 deploy/aws-hypervisor/scripts/create.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deploy/aws-hypervisor/scripts/create.sh b/deploy/aws-hypervisor/scripts/create.sh
index 99d5737..1288d9c 100755
--- a/deploy/aws-hypervisor/scripts/create.sh
+++ b/deploy/aws-hypervisor/scripts/create.sh
@@ -42,7 +42,7 @@ echo -e "AMI ID: $RHEL_HOST_AMI"
 echo -e "Machine Type: $EC2_INSTANCE_TYPE"
 
 ec2Type="VirtualMachine"
-if [[ "$EC2_INSTANCE_TYPE" =~ c[0-9]+[gn].metal ]]; then
+if [[ "$EC2_INSTANCE_TYPE" =~ c[0-9]+[a-z]*\.metal ]]; then
     ec2Type="MetalMachine"
 fi
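The widened pattern covers plain compute metal flavors such as c5.metal and c5d.metal while still matching the g/n variants; escaping the dot keeps it from matching an arbitrary character. A minimal sketch of the match behavior, assuming bash's [[ =~ ]] (illustrative, not part of the patch):

    # Check a few representative instance types against the new pattern.
    for t in c5.metal c5d.metal c6gn.metal c5n.xlarge m5.metal; do
      if [[ "$t" =~ c[0-9]+[a-z]*\.metal ]]; then
        echo "$t -> MetalMachine"
      else
        echo "$t -> VirtualMachine"
      fi
    done
    # c5.metal, c5d.metal, c6gn.metal -> MetalMachine
    # c5n.xlarge, m5.metal           -> VirtualMachine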
From 73e97838922aca725631550092af668cfe287878 Mon Sep 17 00:00:00 2001
From: Gal Amado
Date: Sun, 15 Feb 2026 18:42:44 +0200
Subject: [PATCH 2/4] Add assisted installer deployment method for spoke TNF
 clusters

Deploy a spoke TNF cluster via ACM/MCE assisted installer on an existing
hub cluster. This adds a third deployment path alongside the dev-scripts
(IPI) and kcli methods.

New Ansible roles:
- assisted/acm-install: installs ACM operator, MultiClusterHub,
  AgentServiceConfig with auto-detected RHCOS ISO, enables TNF support,
  and configures provisioning for external BMH management
- assisted/assisted-spoke: creates spoke libvirt network and VMs, deploys
  cluster resources (ClusterDeployment, AgentClusterInstall, InfraEnv,
  BareMetalHost with fencing credentials), monitors installation, and
  extracts spoke credentials

Usage: make deploy fencing-assisted

Co-Authored-By: Claude Opus 4.6
---
 CLAUDE.md                                     |  26 +++-
 deploy/Makefile                               |   5 +
 .../openshift-clusters/assisted-install.yml   | 108 ++++++++++++++++
 .../collections/requirements.yml              |   2 +
 .../assisted/acm-install/defaults/main.yml    |  28 +++++
 .../tasks/agent-service-config.yml            | 118 ++++++++++++++++++
 .../assisted/acm-install/tasks/enable-tnf.yml |  45 +++++++
 .../tasks/enable-watch-all-namespaces.yml     |  46 +++++++
 .../acm-install/tasks/install-operator.yml    | 115 +++++++++++++++++
 .../roles/assisted/acm-install/tasks/main.yml |  23 ++++
 .../assisted/acm-install/tasks/storage.yml    |  62 +++++++++
 .../assisted/acm-install/tasks/validate.yml   |  36 ++++++
 .../templates/agentserviceconfig.yml.j2       |  22 ++++
 .../templates/multiclusterhub.yml.j2          |   7 ++
 .../templates/operator-subscription.yml.j2    |  11 ++
 .../roles/assisted/acm-install/vars/main.yml  |  23 ++++
 .../assisted/assisted-spoke/defaults/main.yml |  42 +++++++
 .../assisted/assisted-spoke/tasks/cleanup.yml |  48 +++++++
 .../assisted-spoke/tasks/create-bmh.yml       |  63 ++++++++++
 .../tasks/create-cluster-resources.yml        | 101 +++++++++++++++
 .../tasks/create-spoke-network.yml            |  48 +++++++
 .../assisted-spoke/tasks/create-spoke-vms.yml |  75 +++++++++++
 .../assisted/assisted-spoke/tasks/main.yml    |  30 +++++
 .../tasks/retrieve-credentials.yml            | 100 +++++++++++++++
 .../assisted-spoke/tasks/setup-ksushy.yml     |  34 +++++
 .../assisted-spoke/tasks/wait-for-install.yml | 103 +++++++++++++++
 .../templates/agentclusterinstall.yml.j2      |  22 ++++
 .../templates/clusterdeployment.yml.j2        |  20 +++
 .../templates/clusterimageset.yml.j2          |   6 +
 .../assisted-spoke/templates/infraenv.yml.j2  |  15 +++
 .../templates/spoke-network.xml.j2            |  29 +++++
 .../assisted/assisted-spoke/vars/main.yml     |  21 ++++
 .../scripts/deploy-fencing-assisted.sh        |  53 ++++++++
 .../vars/assisted.yml.template                |  48 +++++++
 34 files changed, 1532 insertions(+), 3 deletions(-)
 create mode 100644 deploy/openshift-clusters/assisted-install.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2
 create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2
 create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml
 create mode 100755 deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh
 create mode 100644 deploy/openshift-clusters/vars/assisted.yml.template

diff --git a/CLAUDE.md b/CLAUDE.md
index ea34093..93645e6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -13,8 +13,9 @@ Two-Node Toolbox (TNF) is a comprehensive deployment automation framework for Op
 
 # From the deploy/ directory:
 # Deploy AWS hypervisor and cluster in one command
-make deploy arbiter-ipi  # Deploy arbiter topology cluster
+make deploy arbiter-ipi       # Deploy arbiter topology cluster
 make deploy fencing-ipi       # Deploy fencing topology cluster
+make deploy fencing-assisted  # Deploy hub + spoke TNF via assisted installer
 
 # Instance lifecycle management
 make create                   # Create new EC2 instance
@@ -70,6 +71,15 @@ ansible-playbook kcli-install.yml -i inventory.ini -e "test_cluster_name=my-clus
 ansible-playbook kcli-install.yml -i inventory.ini -e "force_cleanup=true"
 ```
 
+#### Assisted Installer Method (Spoke TNF via ACM)
+```bash
+# Copy and customize the configuration template
+cp vars/assisted.yml.template vars/assisted.yml
+
+# Deploy hub + spoke TNF cluster via assisted installer
+make deploy fencing-assisted
+```
+
 ### Linting and Validation
 ```bash
 # Shell script linting (from repository root)
@@ -88,14 +98,17 @@ make shellcheck
    - Automatic inventory management for Ansible integration
 
 2. **OpenShift Cluster Deployment** (`deploy/openshift-clusters/`)
-   - Two deployment methods: dev-scripts (traditional) and kcli (modern)
+   - Three deployment methods: dev-scripts (traditional), kcli (modern), and assisted installer (spoke via ACM)
    - Ansible roles for complete cluster automation
    - Support for both arbiter and fencing topologies
+   - Assisted installer deploys spoke TNF clusters on an existing hub via ACM/MCE
    - Proxy configuration for external cluster access
 
 3. **Ansible Roles Architecture**:
    - `dev-scripts/install-dev`: Traditional deployment using openshift-metal3/dev-scripts
    - `kcli/kcli-install`: Modern deployment using kcli virtualization management
+   - `assisted/acm-install`: Install ACM/MCE + assisted service + enable TNF on hub
+   - `assisted/assisted-spoke`: Deploy spoke TNF cluster via assisted installer + BMH
    - `proxy-setup`: Squid proxy for cluster external access
    - `redfish`: Automated stonith configuration for fencing topology
    - `config`: SSH key and git configuration
@@ -119,8 +132,15 @@ make shellcheck
 - `roles/kcli/kcli-install/files/pull-secret.json`: OpenShift pull secret
 - SSH key automatically read from `~/.ssh/id_ed25519.pub` on ansible controller
 
+#### Assisted Installer Method
+- `vars/assisted.yml`: Variable override file (copy from `vars/assisted.yml.template`)
+- Hub cluster must be deployed first via dev-scripts (`make deploy fencing-ipi`)
+- Spoke credentials output to `~/<spoke_cluster_name>/auth/` on hypervisor
+- Hub proxy preserved as `hub-proxy.env`
+
 #### Generated Files
 - `proxy.env`: Generated proxy configuration (source this to access cluster)
+- `hub-proxy.env`: Hub proxy config (preserved when spoke proxy is configured)
 - `kubeconfig`: OpenShift cluster kubeconfig
 - `kubeadmin-password`: Default admin password
 
@@ -128,7 +148,7 @@ make shellcheck
 
 1. **Environment Setup**: Use `deploy/aws-hypervisor/` tools or bring your own RHEL 9 host
 2. **Configuration**: Edit inventory and config files based on chosen deployment method
-3. **Deployment**: Run appropriate Ansible playbook (setup.yml or kcli-install.yml)
+3. **Deployment**: Run appropriate Ansible playbook (setup.yml, kcli-install.yml, or assisted-install.yml)
 4. **Access**: Source `proxy.env` and use `oc` commands or WebUI through proxy
 5.
**Cleanup**: Use cleanup make targets or Ansible playbooks diff --git a/deploy/Makefile b/deploy/Makefile index a6fd992..f69ecc5 100644 --- a/deploy/Makefile +++ b/deploy/Makefile @@ -55,6 +55,10 @@ arbiter-ipi: arbiter-agent: @./openshift-clusters/scripts/deploy-arbiter-agent.sh +fencing-assisted: + @$(MAKE) fencing-ipi + @./openshift-clusters/scripts/deploy-fencing-assisted.sh + patch-nodes: @./openshift-clusters/scripts/patch-nodes.sh get-tnf-logs: @@ -82,6 +86,7 @@ help: @echo " fencing-agent - Deploy fencing Agent cluster (non-interactive) (WIP Experimental)" @echo " arbiter-ipi - Deploy arbiter IPI cluster (non-interactive)" @echo " arbiter-agent - Deploy arbiter Agent cluster (non-interactive)" + @echo " fencing-assisted - Deploy hub + spoke TNF cluster via assisted installer" @echo " redeploy-cluster - Redeploy OpenShift cluster using dev-scripts make redeploy" @echo " shutdown-cluster - Shutdown OpenShift cluster VMs in orderly fashion" @echo " startup-cluster - Start up OpenShift cluster VMs and proxy container" diff --git a/deploy/openshift-clusters/assisted-install.yml b/deploy/openshift-clusters/assisted-install.yml new file mode 100644 index 0000000..85e54dc --- /dev/null +++ b/deploy/openshift-clusters/assisted-install.yml @@ -0,0 +1,108 @@ +--- +# Deploy a spoke TNF cluster via ACM/assisted installer on an existing hub cluster. +# +# Prerequisites: +# - vars/assisted.yml exists (copy from vars/assisted.yml.template) +# +# Usage: +# make deploy fencing-assisted + +- hosts: metal_machine + gather_facts: yes + + vars: + topology: fencing + interactive_mode: false + + vars_files: + - vars/assisted.yml + + pre_tasks: + - name: Check that proxy.env exists (hub must be deployed first) + stat: + path: "{{ playbook_dir }}/proxy.env" + delegate_to: localhost + register: proxy_env_check + + - name: Fail if proxy.env is missing + fail: + msg: >- + proxy.env not found. The hub cluster must be deployed first + using 'make deploy fencing-ipi'. proxy.env is required for + cluster access. + when: not proxy_env_check.stat.exists + + - name: Check that hub kubeconfig exists + stat: + path: "{{ ansible_user_dir }}/auth/kubeconfig" + register: hub_kubeconfig_check + + - name: Fail if hub kubeconfig is missing + fail: + msg: >- + Hub kubeconfig not found at ~/auth/kubeconfig. + The hub cluster must be deployed first. 
+ when: not hub_kubeconfig_check.stat.exists + + - name: Set hub KUBECONFIG path + set_fact: + hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + + - name: Preserve hub proxy.env as hub-proxy.env + copy: + src: "{{ playbook_dir }}/proxy.env" + dest: "{{ playbook_dir }}/hub-proxy.env" + remote_src: no + backup: no + delegate_to: localhost + + - name: Display assisted installer configuration + debug: + msg: | + Assisted Installer Configuration: + Hub operator: {{ hub_operator }} + ACM/MCE channel: {{ acm_channel if hub_operator == 'acm' else mce_channel }} + Spoke cluster: {{ spoke_cluster_name }}.{{ spoke_base_domain }} + Spoke release image: {{ spoke_release_image }} + Spoke VMs: {{ spoke_ctlplanes }}x ({{ spoke_vm_vcpus }} vCPUs, {{ spoke_vm_memory }}MB RAM, {{ spoke_vm_disk_size }}GB disk) + Spoke network: {{ spoke_network_cidr }} + API VIP: {{ spoke_api_vip }} + Ingress VIP: {{ spoke_ingress_vip }} + Storage method: {{ assisted_storage_method }} + Force cleanup: {{ force_cleanup }} + + roles: + - role: assisted/acm-install + - role: assisted/assisted-spoke + + post_tasks: + - name: Setup proxy access for spoke cluster + include_role: + name: proxy-setup + vars: + kubeconfig_path: "{{ spoke_kubeconfig_path }}" + kubeadmin_password_path: "{{ spoke_kubeadmin_password_path }}" + + - name: Update cluster inventory with spoke VMs + include_role: + name: common + tasks_from: update-cluster-inventory + vars: + test_cluster_name: "{{ spoke_cluster_name }}" + + - name: Display deployment summary + debug: + msg: | + Spoke TNF cluster deployed successfully! + + Spoke credentials: + Kubeconfig: {{ spoke_kubeconfig_path }} + Admin password: {{ spoke_kubeadmin_password_path }} + + Access spoke cluster: + source proxy.env + KUBECONFIG={{ spoke_kubeconfig_path }} oc get nodes + + Access hub cluster: + source hub-proxy.env + KUBECONFIG=~/auth/kubeconfig oc get nodes diff --git a/deploy/openshift-clusters/collections/requirements.yml b/deploy/openshift-clusters/collections/requirements.yml index 291137f..4f4bdfd 100644 --- a/deploy/openshift-clusters/collections/requirements.yml +++ b/deploy/openshift-clusters/collections/requirements.yml @@ -13,3 +13,5 @@ collections: version: ">=2.0" - name: community.general version: ">=5.0.0" + - name: ansible.utils + version: ">=2.0.0" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml new file mode 100644 index 0000000..2e078f4 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml @@ -0,0 +1,28 @@ +--- +# Default variables for acm-install role + +# Hub kubeconfig path (set by playbook pre_tasks, fallback to ansible_user_dir) +hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + +# Hub operator to install: "acm" or "mce" +hub_operator: acm + +# ACM/MCE channel: "auto" detects from packagemanifest +acm_channel: "auto" +mce_channel: "auto" + +# Storage method for assisted service: "hostpath" +assisted_storage_method: "hostpath" + +# hostPath directories on hub nodes +assisted_images_path: /var/lib/assisted-images +assisted_db_path: /var/lib/assisted-db +assisted_images_size: 50Gi +assisted_db_size: 10Gi +assisted_storage_class: assisted-service + +# Timeouts (seconds) +acm_csv_timeout: 900 +multiclusterhub_timeout: 1800 +assisted_service_timeout: 600 +metal3_stabilize_timeout: 300 \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml 
b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml new file mode 100644 index 0000000..2f127f8 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml @@ -0,0 +1,118 @@ +--- +# Create AgentServiceConfig with RHCOS ISO auto-extracted from release image + +- name: Get hub release image + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.image}' + register: hub_release_image + changed_when: false + +- name: Get hub OCP version + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.version}' \ + | cut -d. -f1-2 + register: hub_ocp_version + changed_when: false + +- name: Determine spoke release image + set_fact: + effective_release_image: >- + {{ hub_release_image.stdout if spoke_release_image == 'auto' + else spoke_release_image }} + +- name: Extract RHCOS ISO URL from release image + shell: | + # Get the machine-os-images reference from the release image + RHCOS_REF=$(oc adm release info "{{ effective_release_image }}" \ + --registry-config="{{ pull_secret_path }}" \ + --image-for=machine-os-images 2>/dev/null) + if [ -z "$RHCOS_REF" ]; then + echo "FAILED: Could not extract machine-os-images from release image" + exit 1 + fi + # Extract the RHCOS ISO URL from the image labels/annotations + oc image info "$RHCOS_REF" --registry-config="{{ pull_secret_path }}" \ + -o json 2>/dev/null \ + | python3 -c " + import json, sys + data = json.load(sys.stdin) + labels = data.get('config', {}).get('config', {}).get('Labels', {}) + stream = labels.get('coreos.stream', '') + version = labels.get('version', '') + if stream and version: + url = f'https://rhcos.mirror.openshift.com/art/storage/prod/streams/{stream}/builds/{version}/x86_64/rhcos-{version}-live-iso.x86_64.iso' + print(url) + else: + print('NEEDS_FALLBACK') + " + register: rhcos_iso_extraction + changed_when: false + failed_when: "'FAILED' in rhcos_iso_extraction.stdout" + +- name: Try fallback RHCOS ISO extraction via coreos print-stream-json + shell: | + rm -rf /tmp/oc-extract && mkdir -p /tmp/oc-extract + RHCOS_URL=$(oc adm release extract "{{ effective_release_image }}" \ + --registry-config="{{ pull_secret_path }}" \ + --command=openshift-install --to=/tmp/oc-extract 2>/dev/null && \ + /tmp/oc-extract/openshift-install coreos print-stream-json 2>/dev/null \ + | python3 -c " + import json, sys + data = json.load(sys.stdin) + iso = data['architectures']['x86_64']['artifacts']['metal']['formats']['iso']['disk'] + print(iso['location']) + " 2>/dev/null) || true + rm -rf /tmp/oc-extract + if [ -n "$RHCOS_URL" ]; then + echo "$RHCOS_URL" + else + echo "FAILED" + fi + register: rhcos_iso_fallback + changed_when: false + when: "'NEEDS_FALLBACK' in rhcos_iso_extraction.stdout" + +- name: Set RHCOS ISO URL fact + set_fact: + rhcos_iso_url: >- + {{ rhcos_iso_fallback.stdout | default(rhcos_iso_extraction.stdout) | trim }} + failed_when: rhcos_iso_url == 'FAILED' or rhcos_iso_url == 'NEEDS_FALLBACK' + +- name: Display RHCOS ISO URL + debug: + msg: "RHCOS ISO URL: {{ rhcos_iso_url }}" + +- name: Get RHCOS version from ISO URL + set_fact: + rhcos_version: "{{ rhcos_iso_url | regex_search('rhcos-([\\d.]+-\\d+)-live', '\\1') | first }}" + +- name: Create AgentServiceConfig + template: + src: agentserviceconfig.yml.j2 + dest: /tmp/agentserviceconfig.yml + mode: '0644' + +- name: Apply AgentServiceConfig + shell: | + oc apply -f /tmp/agentserviceconfig.yml + register: asc_result + changed_when: "'created' in asc_result.stdout" + +- 
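name: Show resolved RHCOS inputs
+  # Illustrative debug (editor-added, not required by the flow): surfaces the
+  # RHCOS facts derived earlier in this file next to the applied config.
+  debug:
+    msg: "RHCOS {{ rhcos_version }} ({{ rhcos_iso_url }}) for OCP {{ hub_ocp_version.stdout }}"
+
+- 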
name: Wait for assisted-service pod to be Running (2/2) + shell: | + oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service \ + --no-headers 2>/dev/null | grep -q '2/2.*Running' + register: assisted_pod + until: assisted_pod.rc == 0 + retries: "{{ (assisted_service_timeout / 15) | int }}" + delay: 15 + +- name: Display assisted-service pod status + shell: | + oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service + register: pod_status + changed_when: false + +- name: Show assisted-service pod + debug: + msg: "{{ pod_status.stdout }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml new file mode 100644 index 0000000..00dbd95 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml @@ -0,0 +1,45 @@ +--- +# Enable TNF cluster support in assisted service +# Requires both a ConfigMap AND an annotation on AgentServiceConfig + +- name: Create assisted-unsupported-config ConfigMap + shell: | + oc apply -f - <<'EOF' + apiVersion: v1 + kind: ConfigMap + metadata: + name: assisted-unsupported-config + namespace: {{ assisted_service_namespace }} + data: + TNF_CLUSTERS_SUPPORT: "true" + EOF + register: cm_result + changed_when: "'created' in cm_result.stdout" + +- name: Annotate AgentServiceConfig to mount unsupported config + shell: | + oc annotate agentserviceconfig agent \ + unsupported.agent-install.openshift.io/assisted-service-configmap=assisted-unsupported-config \ + --overwrite + register: annotate_result + changed_when: "'annotated' in annotate_result.stdout" + +- name: Wait for assisted-service rollout after annotation + shell: | + oc rollout status deployment/assisted-service \ + -n {{ assisted_service_namespace }} --timeout=120s + register: rollout_result + changed_when: false + +- name: Verify TNF support is enabled + shell: | + oc exec -n {{ assisted_service_namespace }} \ + $(oc get pod -n {{ assisted_service_namespace }} -l app=assisted-service -o name | head -1) \ + -c assisted-service -- env | grep -i TNF_CLUSTERS_SUPPORT + register: tnf_verify + changed_when: false + failed_when: "'TNF_CLUSTERS_SUPPORT=true' not in tnf_verify.stdout" + +- name: Display TNF support status + debug: + msg: "{{ tnf_verify.stdout | trim }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml new file mode 100644 index 0000000..ba932f3 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml @@ -0,0 +1,46 @@ +--- +# Patch Provisioning CR to: +# 1. Enable watchAllNamespaces (BMO processes BMHs in all namespaces) +# 2. Disable provisioning network (spoke VMs not on provisioning network) +# 3. 
Remove leftover provisioning fields that cause ironic to hang + +- name: Patch Provisioning CR - enable watchAllNamespaces and disable provisioning network + shell: | + oc patch provisioning provisioning-configuration --type=merge \ + -p '{"spec":{"provisioningNetwork":"Disabled","watchAllNamespaces":true}}' + register: patch_result + changed_when: "'patched' in patch_result.stdout" + +- name: Remove leftover provisioning fields from Provisioning CR + shell: | + oc patch provisioning provisioning-configuration --type=json \ + -p '[ + {"op":"remove","path":"/spec/provisioningIP"}, + {"op":"remove","path":"/spec/provisioningDHCPRange"}, + {"op":"remove","path":"/spec/provisioningNetworkCIDR"}, + {"op":"remove","path":"/spec/provisioningInterface"} + ]' 2>&1 || echo "Some provisioning fields may not exist, continuing" + register: remove_result + changed_when: "'patched' in remove_result.stdout" + failed_when: false + +- name: Wait for metal3 pod to stabilize after provisioning change + shell: | + oc get pods -n openshift-machine-api \ + -l baremetal.openshift.io/cluster-baremetal-operator=metal3-state \ + --no-headers 2>/dev/null | grep -v Terminating | grep -q Running + register: metal3_pod + until: metal3_pod.rc == 0 + retries: "{{ (metal3_stabilize_timeout / 15) | int }}" + delay: 15 + +- name: Display metal3 pod status + shell: | + oc get pods -n openshift-machine-api \ + -l baremetal.openshift.io/cluster-baremetal-operator=metal3-state + register: metal3_status + changed_when: false + +- name: Show metal3 pod + debug: + msg: "{{ metal3_status.stdout }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml new file mode 100644 index 0000000..18362ea --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml @@ -0,0 +1,115 @@ +--- +# Install ACM or MCE operator with auto-detected channel + +- name: Set operator configuration + set_fact: + op_config: "{{ operator_config[hub_operator] }}" + +- name: Create operator namespace + shell: | + oc create namespace {{ op_config.namespace }} 2>/dev/null || echo "Namespace already exists" + register: ns_result + changed_when: "'created' in ns_result.stdout" + +- name: Create OperatorGroup + shell: | + oc apply -f - <<'EOF' + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: {{ op_config.subscription_name }} + namespace: {{ op_config.namespace }} + spec: + targetNamespaces: + - {{ op_config.namespace }} + EOF + register: og_result + changed_when: "'created' in og_result.stdout" + +- name: Determine operator channel + block: + - name: Auto-detect channel from packagemanifest + shell: | + oc get packagemanifest {{ op_config.package_name }} \ + -o jsonpath='{.status.defaultChannel}' + register: detected_channel + changed_when: false + + - name: Set operator channel fact + set_fact: + operator_channel: "{{ detected_channel.stdout }}" + when: (hub_operator == 'acm' and acm_channel == 'auto') or + (hub_operator == 'mce' and mce_channel == 'auto') + +- name: Use user-specified channel + set_fact: + operator_channel: "{{ acm_channel if hub_operator == 'acm' else mce_channel }}" + when: (hub_operator == 'acm' and acm_channel != 'auto') or + (hub_operator == 'mce' and mce_channel != 'auto') + +- name: Display operator channel + debug: + msg: "Installing {{ hub_operator | upper }} with channel: {{ operator_channel }}" + +- name: Create operator 
Subscription + template: + src: operator-subscription.yml.j2 + dest: /tmp/operator-subscription.yml + mode: '0644' + +- name: Apply operator Subscription + shell: | + oc apply -f /tmp/operator-subscription.yml + register: sub_result + changed_when: "'created' in sub_result.stdout" + +- name: Wait for operator CSV to succeed + shell: | + oc get csv -n {{ op_config.namespace }} --no-headers 2>/dev/null \ + | grep {{ op_config.package_name }} \ + | grep -q Succeeded + register: csv_result + until: csv_result.rc == 0 + retries: "{{ (acm_csv_timeout / 15) | int }}" + delay: 15 + +- name: Display operator install result + shell: | + oc get csv -n {{ op_config.namespace }} --no-headers \ + | grep {{ op_config.package_name }} + register: csv_info + changed_when: false + +- name: Show installed operator + debug: + msg: "{{ csv_info.stdout }}" + +# Create MultiClusterHub (for ACM) or MultiClusterEngine (for MCE) +- name: Create MultiClusterHub CR + template: + src: multiclusterhub.yml.j2 + dest: /tmp/multiclusterhub.yml + mode: '0644' + when: hub_operator == 'acm' + +- name: Apply MultiClusterHub CR + shell: | + oc apply -f /tmp/multiclusterhub.yml + register: mch_result + changed_when: "'created' in mch_result.stdout" + when: hub_operator == 'acm' + +- name: Wait for MultiClusterHub to reach Running phase + shell: | + oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} \ + -o jsonpath='{.status.phase}' + register: mch_phase + until: mch_phase.stdout == 'Running' + retries: "{{ (multiclusterhub_timeout / 30) | int }}" + delay: 30 + when: hub_operator == 'acm' + +- name: Display MultiClusterHub status + debug: + msg: "MultiClusterHub phase: {{ mch_phase.stdout }}" + when: hub_operator == 'acm' \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml new file mode 100644 index 0000000..9299e93 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml @@ -0,0 +1,23 @@ +--- +# Install ACM/MCE + assisted service + enable TNF support on hub cluster + +- block: + - name: Validate hub cluster health + include_tasks: validate.yml + + - name: Provision storage for assisted service + include_tasks: storage.yml + + - name: Install {{ hub_operator | upper }} operator + include_tasks: install-operator.yml + + - name: Create AgentServiceConfig + include_tasks: agent-service-config.yml + + - name: Enable TNF cluster support in assisted service + include_tasks: enable-tnf.yml + + - name: Enable BMO watch all namespaces and disable provisioning network + include_tasks: enable-watch-all-namespaces.yml + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml new file mode 100644 index 0000000..4506150 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml @@ -0,0 +1,62 @@ +--- +# Provision hostPath storage for assisted service +# Requires chmod 777 (non-root containers) + SELinux context fix on both nodes + +- name: Create StorageClass and PersistentVolumes for assisted service + shell: | + oc apply -f - <<'EOF' + apiVersion: v1 + kind: PersistentVolume + metadata: + name: assisted-pv-images + spec: + capacity: + storage: {{ assisted_images_size }} + accessModes: [ReadWriteOnce] + hostPath: + path: {{ assisted_images_path }} + storageClassName: {{ assisted_storage_class 
}} + --- + apiVersion: v1 + kind: PersistentVolume + metadata: + name: assisted-pv-db + spec: + capacity: + storage: {{ assisted_db_size }} + accessModes: [ReadWriteOnce] + hostPath: + path: {{ assisted_db_path }} + storageClassName: {{ assisted_storage_class }} + --- + apiVersion: storage.k8s.io/v1 + kind: StorageClass + metadata: + name: {{ assisted_storage_class }} + provisioner: kubernetes.io/no-provisioner + volumeBindingMode: WaitForFirstConsumer + EOF + register: storage_result + changed_when: "'created' in storage_result.stdout" + +- name: Get hub cluster node names + shell: | + oc get nodes --no-headers -o custom-columns=NAME:.metadata.name + register: hub_nodes + changed_when: false + +- name: Fix hostPath permissions and SELinux context on each hub node + shell: | + oc debug node/{{ item }} -- chroot /host bash -c " + mkdir -p {{ assisted_images_path }} {{ assisted_db_path }} + rm -rf {{ assisted_images_path }}/* {{ assisted_db_path }}/* + chmod 777 {{ assisted_images_path }} {{ assisted_db_path }} + chcon -Rt container_file_t {{ assisted_images_path }} {{ assisted_db_path }} + " + loop: "{{ hub_nodes.stdout_lines }}" + register: selinux_fix + changed_when: true + +- name: Display storage setup result + debug: + msg: "Storage provisioned: hostPath PVs with permissions and SELinux fix on {{ hub_nodes.stdout_lines | length }} nodes" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml new file mode 100644 index 0000000..57be824 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml @@ -0,0 +1,36 @@ +--- +# Validate hub cluster is healthy before proceeding + +- name: Check hub cluster nodes are Ready + shell: | + oc get nodes --no-headers | awk '{print $2}' | sort -u + register: node_statuses + changed_when: false + failed_when: "'NotReady' in node_statuses.stdout" + +- name: Check hub cluster node count + shell: | + oc get nodes --no-headers | wc -l + register: node_count + changed_when: false + failed_when: node_count.stdout | int < 2 + +- name: Check for degraded cluster operators + shell: | + oc get co -o json | python3 -c " + import json, sys + cos = json.load(sys.stdin)['items'] + degraded = [c['metadata']['name'] for c in cos + if any(cond['type'] == 'Degraded' and cond['status'] == 'True' + for cond in c['status']['conditions'])] + if degraded: + print('Degraded operators: ' + ', '.join(degraded)) + sys.exit(1) + print('All cluster operators healthy') + " + register: co_check + changed_when: false + +- name: Display hub cluster status + debug: + msg: "Hub cluster healthy: {{ node_count.stdout | trim }} nodes Ready, {{ co_check.stdout | trim }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 new file mode 100644 index 0000000..8b71e4b --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 @@ -0,0 +1,22 @@ +apiVersion: agent-install.openshift.io/v1beta1 +kind: AgentServiceConfig +metadata: + name: agent +spec: + databaseStorage: + storageClassName: {{ assisted_storage_class }} + accessModes: [ReadWriteOnce] + resources: + requests: + storage: {{ assisted_db_size }} + filesystemStorage: + storageClassName: {{ assisted_storage_class }} + accessModes: [ReadWriteOnce] + resources: + 
requests: + storage: {{ assisted_images_size }} + osImages: + - cpuArchitecture: x86_64 + openshiftVersion: "{{ hub_ocp_version.stdout }}" + url: "{{ rhcos_iso_url }}" + version: "{{ rhcos_version }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 new file mode 100644 index 0000000..2b68364 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 @@ -0,0 +1,7 @@ +apiVersion: operator.open-cluster-management.io/v1 +kind: MultiClusterHub +metadata: + name: multiclusterhub + namespace: {{ op_config.namespace }} +spec: + availabilityConfig: Basic \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 new file mode 100644 index 0000000..f6c3109 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 @@ -0,0 +1,11 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: {{ op_config.subscription_name }} + namespace: {{ op_config.namespace }} +spec: + channel: {{ operator_channel }} + installPlanApproval: Automatic + name: {{ op_config.subscription_name }} + source: {{ op_config.source }} + sourceNamespace: openshift-marketplace \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml new file mode 100644 index 0000000..a32a832 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml @@ -0,0 +1,23 @@ +--- +# Role-internal variables (not user-overridable) + +acm_namespace: open-cluster-management +mce_namespace: multicluster-engine + +operator_config: + acm: + namespace: "{{ acm_namespace }}" + package_name: advanced-cluster-management + subscription_name: advanced-cluster-management + source: redhat-operators + mce: + namespace: "{{ mce_namespace }}" + package_name: multicluster-engine + subscription_name: multicluster-engine + source: redhat-operators + +# The MCE namespace is always multicluster-engine regardless of hub_operator +assisted_service_namespace: multicluster-engine + +# Pull secret location (dev-scripts standard path) +pull_secret_path: /opt/dev-scripts/pull_secret.json \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml new file mode 100644 index 0000000..668375f --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml @@ -0,0 +1,42 @@ +--- +# Default variables for assisted-spoke role + +# Hub kubeconfig path (set by playbook pre_tasks, fallback to ansible_user_dir) +hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + +# Spoke cluster identity +spoke_cluster_name: spoke-tnf +spoke_base_domain: example.com + +# Spoke OCP version - "auto" uses hub release image +spoke_release_image: "auto" + +# Spoke VM specifications +spoke_vm_memory: 32768 +spoke_vm_vcpus: 4 +spoke_vm_disk_size: 120 +spoke_ctlplanes: 2 + +# Spoke network +spoke_network_cidr: "192.168.125.0/24" +spoke_api_vip: "192.168.125.5" +spoke_ingress_vip: "192.168.125.10" +spoke_cluster_network_cidr: "10.132.0.0/14" 
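+# NOTE: the pod and service CIDRs here are offset from the stock OpenShift
+# defaults (10.128.0.0/14 and 172.30.0.0/16), which keeps the spoke from
+# overlapping a hub deployed with default dev-scripts networking.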
+spoke_service_network_cidr: "172.31.0.0/16" + +# BMC / sushy-tools +spoke_bmc_user: admin +spoke_bmc_password: password +spoke_ksushy_ip: "192.168.111.1" +spoke_ksushy_port: 8000 + +# Deployment options +force_cleanup: false + +# Timeouts (seconds) +spoke_install_timeout: 3600 +spoke_agent_register_timeout: 900 +spoke_credentials_timeout: 1800 + +# Hub network CIDR (for cross-bridge nftables rules) +hub_network_cidr: "192.168.111.0/24" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml new file mode 100644 index 0000000..55cc58a --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml @@ -0,0 +1,48 @@ +--- +# Remove existing spoke resources for re-deployment + +- name: Delete spoke namespace (removes all cluster resources) + shell: | + oc delete namespace {{ spoke_cluster_name }} --ignore-not-found --timeout=120s + register: ns_delete + changed_when: "'deleted' in ns_delete.stdout" + failed_when: false + +- name: Delete ClusterImageSet + shell: | + OCP_VER=$(oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2) + oc delete clusterimageset "${OCP_VER}.0" --ignore-not-found + changed_when: false + failed_when: false + +- name: Destroy spoke VMs + shell: | + for i in $(seq 0 {{ spoke_ctlplanes - 1 }}); do + sudo virsh destroy {{ spoke_cluster_name }}-master-${i} 2>/dev/null || true + sudo virsh undefine {{ spoke_cluster_name }}-master-${i} --remove-all-storage 2>/dev/null || true + done + changed_when: true + failed_when: false + +- name: Remove spoke libvirt network + shell: | + sudo virsh net-destroy {{ spoke_network_name }} 2>/dev/null || true + sudo virsh net-undefine {{ spoke_network_name }} 2>/dev/null || true + changed_when: true + failed_when: false + +- name: Remove spoke credential directory + file: + path: "{{ spoke_auth_dir }}" + state: absent + +- name: Remove spoke /etc/hosts entry + lineinfile: + path: /etc/hosts + regexp: "api.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + state: absent + become: true + +- name: Display cleanup result + debug: + msg: "Spoke cluster '{{ spoke_cluster_name }}' resources cleaned up" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml new file mode 100644 index 0000000..fb3ceb1 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml @@ -0,0 +1,63 @@ +--- +# Create BareMetalHost resources with BMC secrets and fencing credentials for each spoke node + +- name: Create BMH resources for each spoke node + shell: | + oc apply -f - <<'EOF' + --- + apiVersion: v1 + data: + password: {{ spoke_bmc_password | b64encode }} + username: {{ spoke_bmc_user | b64encode }} + kind: Secret + metadata: + name: {{ item.name }}-bmc-secret + namespace: {{ spoke_cluster_name }} + type: Opaque + --- + apiVersion: metal3.io/v1alpha1 + kind: BareMetalHost + metadata: + name: {{ item.name }}-bmh + namespace: {{ spoke_cluster_name }} + annotations: + bmac.agent-install.openshift.io/hostname: "{{ item.name }}" + bmac.agent-install.openshift.io/role: "master" + bmac.agent-install.openshift.io/fencing-credentials-secret-name: "{{ item.name }}-fencing-credentials" + labels: + infraenvs.agent-install.openshift.io: "{{ spoke_cluster_name }}" + spec: + architecture: x86_64 + bmc: + 
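# The address targets the hypervisor's sushy-tools Redfish emulator; with
+      # sushy-tools, the libvirt domain UUID doubles as the Redfish System ID.
+      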
address: redfish-virtualmedia+https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/{{ item.uuid }} + credentialsName: {{ item.name }}-bmc-secret + disableCertificateVerification: true + bootMACAddress: {{ item.mac }} + automatedCleaningMode: disabled + online: true + --- + apiVersion: v1 + stringData: + address: https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/{{ item.uuid }} + certificateVerification: Disabled + username: {{ spoke_bmc_user }} + password: {{ spoke_bmc_password }} + kind: Secret + metadata: + name: {{ item.name }}-fencing-credentials + namespace: {{ spoke_cluster_name }} + type: Opaque + EOF + loop: "{{ spoke_vms }}" + register: bmh_result + changed_when: "'created' in bmh_result.stdout" + +- name: Display BMH status + shell: | + oc get bmh -n {{ spoke_cluster_name }} + register: bmh_status + changed_when: false + +- name: Show BMH resources + debug: + msg: "{{ bmh_status.stdout }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml new file mode 100644 index 0000000..f6276a4 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml @@ -0,0 +1,101 @@ +--- +# Create spoke cluster resources on hub: namespace, secrets, ClusterDeployment, +# AgentClusterInstall, InfraEnv, ClusterImageSet + +- name: Get hub release image for spoke + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.image}' + register: hub_release_image + changed_when: false + +- name: Get hub OCP version + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. 
-f1-2 + register: hub_ocp_version + changed_when: false + +- name: Set effective spoke release image + set_fact: + effective_release_image: >- + {{ hub_release_image.stdout if spoke_release_image == 'auto' + else spoke_release_image }} + effective_ocp_version: "{{ hub_ocp_version.stdout }}" + +- name: Get SSH public key + shell: | + cat ~/.ssh/id_rsa.pub + register: ssh_pub_key + changed_when: false + +- name: Create spoke namespace + shell: | + oc create namespace {{ spoke_cluster_name }} 2>/dev/null || echo "Namespace already exists" + register: ns_result + changed_when: "'created' in ns_result.stdout" + +- name: Create spoke pull secret + shell: | + oc get secret {{ spoke_cluster_name }}-pull-secret -n {{ spoke_cluster_name }} 2>/dev/null \ + && echo "Already exists" \ + || oc create secret generic {{ spoke_cluster_name }}-pull-secret \ + -n {{ spoke_cluster_name }} \ + --from-file=.dockerconfigjson={{ pull_secret_path }} \ + --type=kubernetes.io/dockerconfigjson + register: ps_result + changed_when: "'created' in ps_result.stdout" + +- name: Create ClusterImageSet + template: + src: clusterimageset.yml.j2 + dest: /tmp/clusterimageset.yml + mode: '0644' + +- name: Apply ClusterImageSet + shell: | + oc apply -f /tmp/clusterimageset.yml + register: cis_result + changed_when: "'created' in cis_result.stdout" + +- name: Create ClusterDeployment + template: + src: clusterdeployment.yml.j2 + dest: /tmp/clusterdeployment.yml + mode: '0644' + +- name: Apply ClusterDeployment + shell: | + oc apply -f /tmp/clusterdeployment.yml + register: cd_result + changed_when: "'created' in cd_result.stdout" + +- name: Create AgentClusterInstall + template: + src: agentclusterinstall.yml.j2 + dest: /tmp/agentclusterinstall.yml + mode: '0644' + +- name: Apply AgentClusterInstall + shell: | + oc apply -f /tmp/agentclusterinstall.yml + register: aci_result + changed_when: "'created' in aci_result.stdout" + +- name: Create InfraEnv + template: + src: infraenv.yml.j2 + dest: /tmp/infraenv.yml + mode: '0644' + +- name: Apply InfraEnv + shell: | + oc apply -f /tmp/infraenv.yml + register: ie_result + changed_when: "'created' in ie_result.stdout" + +- name: Display cluster resources status + debug: + msg: >- + Spoke cluster resources created: ClusterImageSet={{ effective_ocp_version }}.0, + ClusterDeployment={{ spoke_cluster_name }}, + AgentClusterInstall={{ spoke_cluster_name }}, + InfraEnv={{ spoke_cluster_name }} \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml new file mode 100644 index 0000000..4384bdf --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml @@ -0,0 +1,48 @@ +--- +# Create dedicated libvirt network for spoke cluster with DNS for api/apps VIPs +# Then add cross-bridge nftables FORWARD rules for spoke<->hub connectivity + +- name: Check if spoke network already exists + shell: | + sudo virsh net-info {{ spoke_network_name }} 2>/dev/null && echo "EXISTS" || echo "NOT_FOUND" + register: net_check + changed_when: false + +- name: Create spoke libvirt network definition + template: + src: spoke-network.xml.j2 + dest: /tmp/spoke-network.xml + mode: '0644' + when: "'NOT_FOUND' in net_check.stdout" + +- name: Define spoke libvirt network + shell: | + sudo virsh net-define /tmp/spoke-network.xml + when: "'NOT_FOUND' in net_check.stdout" + +- name: Start spoke libvirt network + shell: | + 
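# net-start errors out when the network is already active, so the
+    # failure is deliberately swallowed to keep re-runs idempotent.
+    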
sudo virsh net-start {{ spoke_network_name }} 2>/dev/null || true + changed_when: true + +- name: Set spoke libvirt network to autostart + shell: | + sudo virsh net-autostart {{ spoke_network_name }} + changed_when: true + +- name: Add cross-bridge nftables FORWARD rules for spoke<->hub connectivity + shell: | + # Check if rules already exist + if sudo nft list chain ip filter FORWARD 2>/dev/null | grep -q "{{ spoke_network_cidr }}.*{{ hub_network_cidr }}"; then + echo "Rules already exist" + else + sudo nft insert rule ip filter FORWARD ip saddr {{ spoke_network_cidr }} ip daddr {{ hub_network_cidr }} accept + sudo nft insert rule ip filter FORWARD ip saddr {{ hub_network_cidr }} ip daddr {{ spoke_network_cidr }} accept + echo "Rules added" + fi + register: nft_result + changed_when: "'added' in nft_result.stdout" + +- name: Display network setup result + debug: + msg: "Spoke network '{{ spoke_network_name }}' on {{ spoke_network_cidr }} ready, cross-bridge rules: {{ nft_result.stdout | trim }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml new file mode 100644 index 0000000..e29487e --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml @@ -0,0 +1,75 @@ +--- +# Create empty libvirt VMs for spoke cluster, capture UUID/MAC + +- name: Create spoke VM disks + shell: | + sudo qemu-img create -f qcow2 \ + {{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2 \ + {{ spoke_vm_disk_size }}G + args: + creates: "{{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2" + loop: "{{ range(spoke_ctlplanes) | list }}" + +- name: Check if spoke VMs already exist + shell: | + sudo virsh dominfo {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null && echo "EXISTS" || echo "NOT_FOUND" + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_check + changed_when: false + +- name: Create spoke VMs (defined but not started) + shell: | + sudo virt-install \ + --name {{ spoke_cluster_name }}-master-{{ item.item }} \ + --ram {{ spoke_vm_memory }} \ + --vcpus {{ spoke_vm_vcpus }} \ + --disk {{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item.item }}.qcow2,bus=virtio \ + --network network={{ spoke_network_name }},model=virtio \ + --os-variant rhel9.0 \ + --graphics none \ + --noautoconsole \ + --boot hd,network \ + --noreboot \ + --import + loop: "{{ vm_check.results }}" + when: "'NOT_FOUND' in item.stdout" + +- name: Ensure spoke VMs are shut off + shell: | + sudo virsh destroy {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null || true + loop: "{{ range(spoke_ctlplanes) | list }}" + changed_when: false + failed_when: false + +- name: Capture spoke VM UUIDs + shell: | + sudo virsh domuuid {{ spoke_cluster_name }}-master-{{ item }} + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_uuids + changed_when: false + +- name: Capture spoke VM MAC addresses + shell: | + sudo virsh domiflist {{ spoke_cluster_name }}-master-{{ item }} \ + | grep {{ spoke_network_name }} | awk '{print $5}' + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_macs + changed_when: false + +- name: Build spoke VM info list + set_fact: + spoke_vms: >- + {{ spoke_vms | default([]) + [ + { + 'index': item.item, + 'name': spoke_cluster_name ~ '-master-' ~ item.item, + 'uuid': vm_uuids.results[item.item].stdout | trim, + 'mac': vm_macs.results[item.item].stdout | 
trim + } + ] }} + loop: "{{ vm_uuids.results }}" + +- name: Display spoke VM info + debug: + msg: "VM {{ item.name }}: UUID={{ item.uuid }}, MAC={{ item.mac }}" + loop: "{{ spoke_vms }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml new file mode 100644 index 0000000..43a420c --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml @@ -0,0 +1,30 @@ +--- +# Deploy spoke TNF cluster via assisted installer + BMH + +- block: + - name: Cleanup existing spoke resources + include_tasks: cleanup.yml + when: force_cleanup | bool + + - name: Create dedicated libvirt network for spoke cluster + include_tasks: create-spoke-network.yml + + - name: Create spoke VMs + include_tasks: create-spoke-vms.yml + + - name: Verify sushy-tools is running + include_tasks: setup-ksushy.yml + + - name: Create spoke cluster resources on hub + include_tasks: create-cluster-resources.yml + + - name: Create BareMetalHost resources + include_tasks: create-bmh.yml + + - name: Wait for spoke cluster installation to complete + include_tasks: wait-for-install.yml + + - name: Retrieve spoke cluster credentials + include_tasks: retrieve-credentials.yml + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml new file mode 100644 index 0000000..15b15e4 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml @@ -0,0 +1,100 @@ +--- +# Extract spoke cluster credentials and set up DNS for hypervisor access + +- name: Create spoke credential directory + file: + path: "{{ spoke_auth_dir }}" + state: directory + mode: '0700' + +- name: Wait for admin-kubeconfig secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: kubeconfig_secret + until: kubeconfig_secret.rc == 0 + retries: 10 + delay: 15 + +- name: Extract spoke kubeconfig + shell: | + oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.data.kubeconfig}' | base64 -d + register: spoke_kubeconfig + changed_when: false + +- name: Save spoke kubeconfig + copy: + content: "{{ spoke_kubeconfig.stdout }}" + dest: "{{ spoke_auth_dir }}/kubeconfig" + mode: '0600' + +- name: Wait for admin-password secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-password \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: password_secret + until: password_secret.rc == 0 + retries: "{{ (spoke_credentials_timeout / 30) | int }}" + delay: 30 + +- name: Extract spoke admin password + shell: | + oc get secret {{ spoke_cluster_name }}-admin-password \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.data.password}' | base64 -d + register: spoke_password + changed_when: false + +- name: Save spoke admin password + copy: + content: "{{ spoke_password.stdout }}" + dest: "{{ spoke_auth_dir }}/kubeadmin-password" + mode: '0600' + +- name: Add spoke API DNS to hypervisor /etc/hosts + lineinfile: + path: /etc/hosts + regexp: "api.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + line: "{{ spoke_api_vip }} api.{{ spoke_cluster_name }}.{{ spoke_base_domain }} api-int.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + state: present + become: true + +- 
name: Ensure spoke VMs are running + shell: | + STATE=$(virsh domstate {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null) + if [ "$STATE" != "running" ]; then + virsh start {{ spoke_cluster_name }}-master-{{ item }} + echo "STARTED" + else + echo "ALREADY_RUNNING" + fi + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_start_result + changed_when: "'STARTED' in vm_start_result.stdout" + failed_when: false + become: true + +- name: Wait for spoke VMs to boot + pause: + seconds: 120 + when: vm_start_result.results | selectattr('stdout', 'search', 'STARTED') | list | length > 0 + +- name: Verify spoke cluster access + shell: | + KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes + register: spoke_nodes + changed_when: false + retries: 20 + delay: 30 + until: spoke_nodes.rc == 0 + +- name: Display spoke cluster nodes + debug: + msg: "{{ spoke_nodes.stdout }}" + +- name: Set spoke kubeconfig path as fact for post-deployment tasks + set_fact: + spoke_kubeconfig_path: "{{ spoke_auth_dir }}/kubeconfig" + spoke_kubeadmin_password_path: "{{ spoke_auth_dir }}/kubeadmin-password" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml new file mode 100644 index 0000000..88a4026 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml @@ -0,0 +1,34 @@ +--- +# Verify sushy-tools is running on the hypervisor (should already exist from dev-scripts) + +- name: Check if sushy-tools container is running + shell: | + sudo podman ps --format '{{ '{{' }}.Names{{ '}}' }}' | grep -q sushy-tools + register: sushy_check + changed_when: false + failed_when: false + +- name: Fail if sushy-tools is not running + fail: + msg: >- + sushy-tools container is not running. It should be started by dev-scripts. + Ensure the hub was deployed with 'make deploy fencing-ipi' before running + the assisted installer. + when: sushy_check.rc != 0 + +- name: Verify spoke VMs are visible via sushy-tools + shell: | + curl -sk https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/ \ + -u {{ spoke_bmc_user }}:{{ spoke_bmc_password }} \ + | python3 -c "import json,sys; d=json.load(sys.stdin); print(d['Members@odata.count'])" + register: sushy_systems + changed_when: false + +- name: Verify expected number of systems visible + assert: + that: + - sushy_systems.stdout | int >= (spoke_ctlplanes + 2) + fail_msg: >- + Expected at least {{ spoke_ctlplanes + 2 }} systems in sushy-tools + ({{ spoke_ctlplanes }} spoke + 2 hub), but found {{ sushy_systems.stdout }}. 
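+    # The "+ 2" accounts for the hub's own control-plane VMs, which the
+    # dev-scripts-managed sushy-tools instance also exposes.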
+ success_msg: "sushy-tools has {{ sushy_systems.stdout }} systems visible ({{ spoke_ctlplanes }} spoke + 2 hub)" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml new file mode 100644 index 0000000..e0dd662 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml @@ -0,0 +1,103 @@ +--- +# Monitor BMH provisioning, agent registration, and installation progress + +- name: Wait for agents to register + shell: | + oc get agents -n {{ spoke_cluster_name }} --no-headers 2>/dev/null | wc -l + register: agent_count + until: agent_count.stdout | int >= spoke_ctlplanes + retries: "{{ (spoke_agent_register_timeout / 30) | int }}" + delay: 30 + +- name: Display registered agents + shell: | + oc get agents -n {{ spoke_cluster_name }} + register: agents_info + changed_when: false + +- name: Show registered agents + debug: + msg: "{{ agents_info.stdout }}" + +- name: Wait for spoke cluster installation to complete + shell: | + ACI_STATE=$(oc get agentclusterinstall {{ spoke_cluster_name }} \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null) + echo "$ACI_STATE" + case "$ACI_STATE" in + adding-hosts|installed) + exit 0 + ;; + error|failed) + echo "INSTALL FAILED" + oc get agentclusterinstall {{ spoke_cluster_name }} \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.conditions}' 2>/dev/null | python3 -m json.tool + exit 2 + ;; + *) + exit 1 + ;; + esac + register: install_state + until: install_state.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + failed_when: install_state.rc == 2 + +- name: Display final installation state + debug: + msg: "Spoke cluster installation state: {{ install_state.stdout_lines[0] }}" + +- name: Wait for all agents to reach Done stage + shell: | + oc get agents -n {{ spoke_cluster_name }} -o json 2>/dev/null \ + | python3 -c " + import json + import sys + + data = json.load(sys.stdin) + agents = data.get('items', []) + total = len(agents) + done = 0 + stuck = [] + for a in agents: + stage = a.get('status', {}).get('progress', {}).get('currentStage', 'unknown') + if stage == 'Done': + done += 1 + else: + state = a.get('status', {}).get('debugInfo', {}).get('state', 'unknown') + hostname = a.get('spec', {}).get('hostname', 'unknown') + stuck.append(f'{hostname}: state={state}, stage={stage}') + + print(f'Agents Done: {done} / {total}') + for s in stuck: + print(f' {s}') + sys.exit(0 if done == total else 1) + " + register: agents_done + until: agents_done.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + changed_when: false + +- name: Display final agent status + shell: | + oc get agents -n {{ spoke_cluster_name }} + register: final_agents + changed_when: false + +- name: Show final agents + debug: + msg: "{{ final_agents.stdout }}" + +- name: Display final BMH status + shell: | + oc get bmh -n {{ spoke_cluster_name }} + register: final_bmh + changed_when: false + +- name: Show final BMH + debug: + msg: "{{ final_bmh.stdout }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 new file mode 100644 index 0000000..0e18b80 --- /dev/null +++ 
b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2
@@ -0,0 +1,22 @@
+apiVersion: extensions.hive.openshift.io/v1beta1
+kind: AgentClusterInstall
+metadata:
+  name: {{ spoke_cluster_name }}
+  namespace: {{ spoke_cluster_name }}
+spec:
+  clusterDeploymentRef:
+    name: {{ spoke_cluster_name }}
+  imageSetRef:
+    name: "{{ effective_ocp_version }}.0"
+  apiVIP: "{{ spoke_api_vip }}"
+  ingressVIP: "{{ spoke_ingress_vip }}"
+  platformType: BareMetal
+  networking:
+    clusterNetwork:
+      - cidr: "{{ spoke_cluster_network_cidr }}"
+        hostPrefix: 23
+    serviceNetwork:
+      - "{{ spoke_service_network_cidr }}"
+  provisionRequirements:
+    controlPlaneAgents: {{ spoke_ctlplanes }}
+  sshPublicKey: "{{ ssh_pub_key.stdout }}"
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2
new file mode 100644
index 0000000..a31289f
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2
@@ -0,0 +1,20 @@
+apiVersion: hive.openshift.io/v1
+kind: ClusterDeployment
+metadata:
+  name: {{ spoke_cluster_name }}
+  namespace: {{ spoke_cluster_name }}
+spec:
+  baseDomain: {{ spoke_base_domain }}
+  clusterName: {{ spoke_cluster_name }}
+  clusterInstallRef:
+    group: extensions.hive.openshift.io
+    kind: AgentClusterInstall
+    name: {{ spoke_cluster_name }}
+    version: v1beta1
+  platform:
+    agentBareMetal:
+      agentSelector:
+        matchLabels:
+          cluster: tnf
+  pullSecretRef:
+    name: {{ spoke_cluster_name }}-pull-secret
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2
new file mode 100644
index 0000000..82eed09
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2
@@ -0,0 +1,6 @@
+apiVersion: hive.openshift.io/v1
+kind: ClusterImageSet
+metadata:
+  name: "{{ effective_ocp_version }}.0"
+spec:
+  releaseImage: {{ effective_release_image }}
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2
new file mode 100644
index 0000000..4945f81
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2
@@ -0,0 +1,15 @@
+apiVersion: agent-install.openshift.io/v1beta1
+kind: InfraEnv
+metadata:
+  name: {{ spoke_cluster_name }}
+  namespace: {{ spoke_cluster_name }}
+spec:
+  clusterRef:
+    name: {{ spoke_cluster_name }}
+    namespace: {{ spoke_cluster_name }}
+  sshAuthorizedKey: "{{ ssh_pub_key.stdout }}"
+  agentLabels:
+    cluster: tnf
+  cpuArchitecture: x86_64
+  pullSecretRef:
+    name: {{ spoke_cluster_name }}-pull-secret
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2
new file mode 100644
index 0000000..63f2e23
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2
@@ -0,0 +1,29 @@
+<network>
+  <name>{{ spoke_network_name }}</name>
+  <forward mode='nat'>
+    <nat>
+      <port start='1024' end='65535'/>
+    </nat>
+  </forward>
+  <bridge name='{{ spoke_network_name }}' stp='on' delay='0'/>
+  <domain name='{{ spoke_cluster_name }}.{{ spoke_base_domain }}' localOnly='yes'/>
+  <dns>
+    <host ip='{{ spoke_ingress_vip }}'>
+      <hostname>apps.{{ spoke_cluster_name }}.{{ spoke_base_domain }}</hostname>
+    </host>
+    <host ip='{{ spoke_api_vip }}'>
+      <hostname>api.{{ spoke_cluster_name }}.{{ spoke_base_domain }}</hostname>
+    </host>
+  </dns>
+  <ip address='{{ spoke_network_gateway }}' prefix='{{ spoke_network_prefix }}'>
+    <dhcp>
+      <range start='{{ spoke_dhcp_start }}' end='{{ spoke_dhcp_end }}'/>
+    </dhcp>
+  </ip>
+</network>
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml
b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml new file mode 100644 index 0000000..3712692 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml @@ -0,0 +1,21 @@ +--- +# Role-internal variables + +# Derived from spoke_network_cidr +spoke_network_gateway: "{{ spoke_network_cidr | ansible.utils.ipaddr('1') | ansible.utils.ipaddr('address') }}" +spoke_dhcp_start: "{{ spoke_network_cidr | ansible.utils.ipaddr('50') | ansible.utils.ipaddr('address') }}" +spoke_dhcp_end: "{{ spoke_network_cidr | ansible.utils.ipaddr('150') | ansible.utils.ipaddr('address') }}" +spoke_network_prefix: "{{ spoke_network_cidr | ansible.utils.ipaddr('prefix') }}" + +# Libvirt network name (derived from spoke cluster name) +spoke_network_name: "{{ spoke_cluster_name }}" + +# Pull secret location (dev-scripts standard path) +pull_secret_path: /opt/dev-scripts/pull_secret.json + +# VM image path +spoke_vm_image_dir: /var/lib/libvirt/images + +# Credential output paths +spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" +hub_auth_dir: "{{ ansible_user_dir }}/auth" \ No newline at end of file diff --git a/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh new file mode 100755 index 0000000..eba97c4 --- /dev/null +++ b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +# Get the directory where this script is located +SCRIPT_DIR=$(dirname "$0") +# Get the deploy directory (two levels up from scripts) +DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +set -o nounset +set -o errexit +set -o pipefail + +# Check if instance data exists +if [[ ! -f "${DEPLOY_DIR}/aws-hypervisor/instance-data/aws-instance-id" ]]; then + echo "Error: No instance found. Please run 'make deploy' first." + exit 1 +fi + +# Check if inventory.ini exists +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then + echo "Error: inventory.ini not found in ${DEPLOY_DIR}/openshift-clusters/" + echo "Please ensure the inventory file is properly configured." + echo "You can run 'make inventory' to update it with current instance information." + exit 1 +fi + +# Check if vars/assisted.yml exists +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml" ]]; then + echo "Error: vars/assisted.yml not found." + echo "Copy the template and customize it:" + echo " cp ${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml.template ${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml" + exit 1 +fi + +echo "Deploying spoke TNF cluster via assisted installer..." + +cd "${DEPLOY_DIR}/openshift-clusters" + +if ansible-playbook assisted-install.yml -i inventory.ini; then + echo "" + echo "OpenShift spoke TNF cluster deployment via assisted installer completed successfully!" + echo "" + echo "Next steps:" + echo "1. Access spoke cluster:" + echo " source ${DEPLOY_DIR}/openshift-clusters/proxy.env" + echo " KUBECONFIG=~/spoke-tnf/auth/kubeconfig oc get nodes" + echo "2. Access hub cluster:" + echo " source ${DEPLOY_DIR}/openshift-clusters/hub-proxy.env" + echo " KUBECONFIG=~/auth/kubeconfig oc get nodes" +else + echo "Error: Spoke cluster deployment failed!" + echo "Check the Ansible logs for more details." 
+ exit 1 +fi diff --git a/deploy/openshift-clusters/vars/assisted.yml.template b/deploy/openshift-clusters/vars/assisted.yml.template new file mode 100644 index 0000000..dc4f0d0 --- /dev/null +++ b/deploy/openshift-clusters/vars/assisted.yml.template @@ -0,0 +1,48 @@ +# Assisted Installer Configuration +# Copy this file to vars/assisted.yml and customize as needed +# +# Usage: After deploying a hub cluster with 'make deploy fencing-ipi', +# run 'make fencing-assisted' to deploy a spoke TNF cluster via ACM/assisted installer. + +# Hub operator: "acm" or "mce" +hub_operator: acm + +# ACM/MCE channel: "auto" detects from packagemanifest (recommended) +# Override with specific channel like "release-2.15" if needed +acm_channel: "auto" +mce_channel: "auto" + +# Spoke cluster identity +spoke_cluster_name: spoke-tnf +spoke_base_domain: example.com + +# Spoke OCP version +# "auto" uses the same release image as the hub (recommended) +# Or specify an explicit release image URL +spoke_release_image: "auto" + +# Spoke VM specifications +spoke_vm_memory: 32768 # MB (32GB) +spoke_vm_vcpus: 4 +spoke_vm_disk_size: 120 # GB +spoke_ctlplanes: 2 # Always 2 for TNF + +# Spoke network configuration +spoke_network_cidr: "192.168.125.0/24" +spoke_api_vip: "192.168.125.5" +spoke_ingress_vip: "192.168.125.10" +spoke_cluster_network_cidr: "10.132.0.0/14" +spoke_service_network_cidr: "172.31.0.0/16" + +# BMC / sushy-tools (defaults match dev-scripts deployment) +spoke_bmc_user: admin +spoke_bmc_password: password +spoke_ksushy_ip: "192.168.111.1" +spoke_ksushy_port: 8000 + +# Storage for assisted service on hub +# Currently only "hostpath" is supported +assisted_storage_method: "hostpath" + +# Deployment options +force_cleanup: false \ No newline at end of file From 3a61c0167cd10bd3e311530890e29f0f83204624 Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Wed, 18 Feb 2026 13:09:50 +0200 Subject: [PATCH 3/4] Fix critical review issues for assisted installer roles - C1: SSH key detection now tries ed25519 first, falls back to rsa/ecdsa - C2: Deduplicate pull_secret_path and hub_kubeconfig to playbook level - C3: Move hub release image extraction to playbook pre_tasks (run once) - C4: RHCOS ISO extraction checks rc and catches empty string failures - C5: Explicit disk cleanup prevents stale qcow2 reuse on re-deploy - Remove unused hub_auth_dir variable (I11) Co-Authored-By: Claude Opus 4.6 --- .../openshift-clusters/assisted-install.yml | 31 ++++++++++++++++--- .../assisted/acm-install/defaults/main.yml | 3 -- .../tasks/agent-service-config.yml | 31 ++++++------------- .../templates/agentserviceconfig.yml.j2 | 2 +- .../roles/assisted/acm-install/vars/main.yml | 5 +-- .../assisted/assisted-spoke/defaults/main.yml | 3 -- .../assisted/assisted-spoke/tasks/cleanup.yml | 7 +++++ .../tasks/create-cluster-resources.yml | 26 ++++------------ .../assisted/assisted-spoke/vars/main.yml | 6 +--- 9 files changed, 52 insertions(+), 62 deletions(-) diff --git a/deploy/openshift-clusters/assisted-install.yml b/deploy/openshift-clusters/assisted-install.yml index 85e54dc..71a3027 100644 --- a/deploy/openshift-clusters/assisted-install.yml +++ b/deploy/openshift-clusters/assisted-install.yml @@ -13,6 +13,8 @@ vars: topology: fencing interactive_mode: false + pull_secret_path: /opt/dev-scripts/pull_secret.json + hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" vars_files: - vars/assisted.yml @@ -44,10 +46,6 @@ The hub cluster must be deployed first. 
when: not hub_kubeconfig_check.stat.exists - - name: Set hub KUBECONFIG path - set_fact: - hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" - - name: Preserve hub proxy.env as hub-proxy.env copy: src: "{{ playbook_dir }}/proxy.env" @@ -56,6 +54,31 @@ backup: no delegate_to: localhost + - name: Get hub release image + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.image}' + register: hub_release_image_raw + changed_when: false + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" + + - name: Get hub OCP version + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2 + register: hub_ocp_version_raw + changed_when: false + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" + + - name: Set hub release facts + set_fact: + hub_release_image: "{{ hub_release_image_raw.stdout }}" + hub_ocp_version: "{{ hub_ocp_version_raw.stdout }}" + effective_release_image: >- + {{ hub_release_image_raw.stdout if spoke_release_image == 'auto' + else spoke_release_image }} + effective_ocp_version: "{{ hub_ocp_version_raw.stdout }}" + - name: Display assisted installer configuration debug: msg: | diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml index 2e078f4..4e933c1 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml @@ -1,9 +1,6 @@ --- # Default variables for acm-install role -# Hub kubeconfig path (set by playbook pre_tasks, fallback to ansible_user_dir) -hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" - # Hub operator to install: "acm" or "mce" hub_operator: acm diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml index 2f127f8..6273479 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml @@ -1,24 +1,6 @@ --- # Create AgentServiceConfig with RHCOS ISO auto-extracted from release image - -- name: Get hub release image - shell: | - oc get clusterversion version -o jsonpath='{.status.desired.image}' - register: hub_release_image - changed_when: false - -- name: Get hub OCP version - shell: | - oc get clusterversion version -o jsonpath='{.status.desired.version}' \ - | cut -d. 
-f1-2 - register: hub_ocp_version - changed_when: false - -- name: Determine spoke release image - set_fact: - effective_release_image: >- - {{ hub_release_image.stdout if spoke_release_image == 'auto' - else spoke_release_image }} +# hub_release_image, hub_ocp_version, effective_release_image are set by playbook pre_tasks - name: Extract RHCOS ISO URL from release image shell: | @@ -47,7 +29,9 @@ " register: rhcos_iso_extraction changed_when: false - failed_when: "'FAILED' in rhcos_iso_extraction.stdout" + failed_when: >- + rhcos_iso_extraction.rc != 0 or + 'FAILED' in rhcos_iso_extraction.stdout - name: Try fallback RHCOS ISO extraction via coreos print-stream-json shell: | @@ -75,8 +59,11 @@ - name: Set RHCOS ISO URL fact set_fact: rhcos_iso_url: >- - {{ rhcos_iso_fallback.stdout | default(rhcos_iso_extraction.stdout) | trim }} - failed_when: rhcos_iso_url == 'FAILED' or rhcos_iso_url == 'NEEDS_FALLBACK' + {{ (rhcos_iso_fallback.stdout | default(rhcos_iso_extraction.stdout)) | trim }} + failed_when: >- + rhcos_iso_url == 'FAILED' or + rhcos_iso_url == 'NEEDS_FALLBACK' or + rhcos_iso_url == '' - name: Display RHCOS ISO URL debug: diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 index 8b71e4b..0d8527e 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 @@ -17,6 +17,6 @@ spec: storage: {{ assisted_images_size }} osImages: - cpuArchitecture: x86_64 - openshiftVersion: "{{ hub_ocp_version.stdout }}" + openshiftVersion: "{{ hub_ocp_version }}" url: "{{ rhcos_iso_url }}" version: "{{ rhcos_version }}" \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml index a32a832..8f1a561 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml @@ -17,7 +17,4 @@ operator_config: source: redhat-operators # The MCE namespace is always multicluster-engine regardless of hub_operator -assisted_service_namespace: multicluster-engine - -# Pull secret location (dev-scripts standard path) -pull_secret_path: /opt/dev-scripts/pull_secret.json \ No newline at end of file +assisted_service_namespace: multicluster-engine \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml index 668375f..86a371e 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml @@ -1,9 +1,6 @@ --- # Default variables for assisted-spoke role -# Hub kubeconfig path (set by playbook pre_tasks, fallback to ansible_user_dir) -hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" - # Spoke cluster identity spoke_cluster_name: spoke-tnf spoke_base_domain: example.com diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml index 55cc58a..2dbaa36 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml +++ 
b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml @@ -24,6 +24,13 @@ changed_when: true failed_when: false +- name: Remove spoke VM disk images + file: + path: "{{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2" + state: absent + loop: "{{ range(spoke_ctlplanes) | list }}" + become: true + - name: Remove spoke libvirt network shell: | sudo virsh net-destroy {{ spoke_network_name }} 2>/dev/null || true diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml index f6276a4..fa1962c 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml @@ -1,29 +1,15 @@ --- # Create spoke cluster resources on hub: namespace, secrets, ClusterDeployment, # AgentClusterInstall, InfraEnv, ClusterImageSet - -- name: Get hub release image for spoke - shell: | - oc get clusterversion version -o jsonpath='{.status.desired.image}' - register: hub_release_image - changed_when: false - -- name: Get hub OCP version - shell: | - oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2 - register: hub_ocp_version - changed_when: false - -- name: Set effective spoke release image - set_fact: - effective_release_image: >- - {{ hub_release_image.stdout if spoke_release_image == 'auto' - else spoke_release_image }} - effective_ocp_version: "{{ hub_ocp_version.stdout }}" +# hub_release_image, hub_ocp_version, effective_release_image, effective_ocp_version +# are set by playbook pre_tasks - name: Get SSH public key shell: | - cat ~/.ssh/id_rsa.pub + for key in ~/.ssh/id_ed25519.pub ~/.ssh/id_rsa.pub ~/.ssh/id_ecdsa.pub; do + [ -f "$key" ] && cat "$key" && exit 0 + done + echo "ERROR: No SSH public key found" >&2 && exit 1 register: ssh_pub_key changed_when: false diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml index 3712692..6434d77 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml @@ -10,12 +10,8 @@ spoke_network_prefix: "{{ spoke_network_cidr | ansible.utils.ipaddr('prefix') }} # Libvirt network name (derived from spoke cluster name) spoke_network_name: "{{ spoke_cluster_name }}" -# Pull secret location (dev-scripts standard path) -pull_secret_path: /opt/dev-scripts/pull_secret.json - # VM image path spoke_vm_image_dir: /var/lib/libvirt/images # Credential output paths -spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" -hub_auth_dir: "{{ ansible_user_dir }}/auth" \ No newline at end of file +spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" \ No newline at end of file From 38bfca3b53cbc0eda4fb8d34c01950c1549ac7dd Mon Sep 17 00:00:00 2001 From: Gal Amado Date: Wed, 18 Feb 2026 19:53:09 +0200 Subject: [PATCH 4/4] Fix important review issues and add READMEs for assisted installer roles - I6: Parse spoke_cluster_name from vars/assisted.yml in deploy script instead of hardcoding ~/spoke-tnf/auth/kubeconfig - I8: Add cluster state tracking (deploying/deployed) to assisted-install.yml using common/cluster-state.yml, consistent with dev-scripts and kcli methods - I9: Replace blanket failed_when:false with 
ignore_errors:true in cleanup, conditional error checking in enable-watch-all-namespaces, and remove failed_when:false from virsh start in retrieve-credentials - I10: Add block/rescue diagnostic handlers to all 9 wait loops across install-operator, agent-service-config, wait-for-install, and retrieve-credentials, dumping relevant status on timeout - I13: Add README.md for acm-install and assisted-spoke roles - S16: Document DHCP range constraints in assisted.yml.template - S17: Expose hub_network_cidr in assisted.yml.template - S18: Add trailing newlines to all 22 new files Co-Authored-By: Claude Opus 4.6 --- .../openshift-clusters/assisted-install.yml | 23 ++ .../roles/assisted/acm-install/README.md | 79 +++++++ .../assisted/acm-install/defaults/main.yml | 2 +- .../tasks/agent-service-config.yml | 41 +++- .../assisted/acm-install/tasks/enable-tnf.yml | 2 +- .../tasks/enable-watch-all-namespaces.yml | 9 +- .../acm-install/tasks/install-operator.yml | 79 +++++-- .../assisted/acm-install/tasks/storage.yml | 2 +- .../assisted/acm-install/tasks/validate.yml | 2 +- .../templates/agentserviceconfig.yml.j2 | 2 +- .../templates/multiclusterhub.yml.j2 | 2 +- .../templates/operator-subscription.yml.j2 | 2 +- .../roles/assisted/acm-install/vars/main.yml | 2 +- .../roles/assisted/assisted-spoke/README.md | 132 +++++++++++ .../assisted/assisted-spoke/defaults/main.yml | 2 +- .../assisted/assisted-spoke/tasks/cleanup.yml | 10 +- .../assisted-spoke/tasks/create-bmh.yml | 2 +- .../tasks/create-cluster-resources.yml | 2 +- .../tasks/create-spoke-network.yml | 2 +- .../assisted-spoke/tasks/create-spoke-vms.yml | 2 +- .../tasks/retrieve-credentials.yml | 113 +++++++--- .../assisted-spoke/tasks/setup-ksushy.yml | 2 +- .../assisted-spoke/tasks/wait-for-install.yml | 209 ++++++++++++------ .../assisted/assisted-spoke/vars/main.yml | 2 +- .../scripts/deploy-fencing-assisted.sh | 8 +- .../vars/assisted.yml.template | 8 +- 26 files changed, 600 insertions(+), 141 deletions(-) create mode 100644 deploy/openshift-clusters/roles/assisted/acm-install/README.md create mode 100644 deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md diff --git a/deploy/openshift-clusters/assisted-install.yml b/deploy/openshift-clusters/assisted-install.yml index 71a3027..ccff642 100644 --- a/deploy/openshift-clusters/assisted-install.yml +++ b/deploy/openshift-clusters/assisted-install.yml @@ -15,6 +15,9 @@ interactive_mode: false pull_secret_path: /opt/dev-scripts/pull_secret.json hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + method: assisted + cluster_state_dir: "../aws-hypervisor/instance-data" + cluster_state_filename: "cluster-vm-state.json" vars_files: - vars/assisted.yml @@ -94,6 +97,16 @@ Storage method: {{ assisted_storage_method }} Force cleanup: {{ force_cleanup }} + - name: Update cluster state to deploying + include_role: + name: common + tasks_from: cluster-state + vars: + cluster_state_phase: 'deploying' + default_playbook_name: 'assisted-install.yml' + num_masters: "{{ spoke_ctlplanes }}" + num_workers: 0 + roles: - role: assisted/acm-install - role: assisted/assisted-spoke @@ -113,6 +126,16 @@ vars: test_cluster_name: "{{ spoke_cluster_name }}" + - name: Update cluster state to deployed + include_role: + name: common + tasks_from: cluster-state + vars: + cluster_state_phase: 'deployed' + default_playbook_name: 'assisted-install.yml' + num_masters: "{{ spoke_ctlplanes }}" + num_workers: 0 + - name: Display deployment summary debug: msg: | diff --git 
a/deploy/openshift-clusters/roles/assisted/acm-install/README.md b/deploy/openshift-clusters/roles/assisted/acm-install/README.md new file mode 100644 index 0000000..8e6a125 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/README.md @@ -0,0 +1,79 @@ +# acm-install Role + +Installs ACM or MCE operator on a hub cluster and configures the assisted installer service for spoke TNF cluster deployment. + +## Description + +This role prepares an existing hub OpenShift cluster to deploy spoke TNF clusters via the assisted installer. It: + +1. Validates hub cluster health and prerequisites +2. Provisions hostPath storage for the assisted service +3. Installs the ACM or MCE operator (auto-detects channel) +4. Creates the AgentServiceConfig with RHCOS ISO auto-extracted from the hub release image +5. Enables TNF cluster support in the assisted service +6. Configures BMO to watch all namespaces and disables the provisioning network + +## Requirements + +- A running hub OpenShift cluster (deployed via `make deploy fencing-ipi` or equivalent) +- Hub kubeconfig accessible at `~/auth/kubeconfig` +- Pull secret with access to required registries +- `oc` CLI available on the hypervisor + +## Role Variables + +### Configurable Variables (defaults/main.yml) + +- `hub_operator`: Operator to install - `"acm"` or `"mce"` (default: `"acm"`) +- `acm_channel`: ACM operator channel - `"auto"` detects from packagemanifest (default: `"auto"`) +- `mce_channel`: MCE operator channel (default: `"auto"`) +- `assisted_storage_method`: Storage backend - currently only `"hostpath"` (default: `"hostpath"`) +- `assisted_images_path`: Host directory for ISO images (default: `/var/lib/assisted-images`) +- `assisted_db_path`: Host directory for database (default: `/var/lib/assisted-db`) +- `assisted_images_size`: PV size for images (default: `50Gi`) +- `assisted_db_size`: PV size for database (default: `10Gi`) +- `assisted_storage_class`: StorageClass name (default: `assisted-service`) + +### Timeout Variables + +- `acm_csv_timeout`: Operator CSV install timeout in seconds (default: `900`) +- `multiclusterhub_timeout`: MultiClusterHub readiness timeout (default: `1800`) +- `assisted_service_timeout`: Assisted service pod readiness timeout (default: `600`) +- `metal3_stabilize_timeout`: Metal3 pod stabilization timeout after provisioning changes (default: `300`) + +### Variables Set by Playbook + +These are set in `assisted-install.yml` and passed to the role: + +- `hub_kubeconfig`: Path to hub cluster kubeconfig +- `pull_secret_path`: Path to pull secret on the hypervisor +- `hub_release_image`: Hub cluster release image (extracted in playbook pre_tasks) +- `hub_ocp_version`: Hub OCP version major.minor (extracted in playbook pre_tasks) +- `effective_release_image`: Release image to use for the spoke (hub image or user override) + +## Task Flow + +1. **validate.yml** - Checks hub cluster health, node readiness, and API access +2. **storage.yml** - Creates hostPath PVs, StorageClass, and fixes permissions/SELinux on hub nodes +3. **install-operator.yml** - Installs ACM/MCE operator subscription, waits for CSV, creates MultiClusterHub +4. **agent-service-config.yml** - Extracts RHCOS ISO URL from release image, creates AgentServiceConfig +5. **enable-tnf.yml** - Enables TNF support in assisted service configuration +6. **enable-watch-all-namespaces.yml** - Patches Provisioning CR to enable BMO in all namespaces + +## Usage + +This role is not called directly. 
It is invoked via `assisted-install.yml`: + +```bash +make deploy fencing-assisted +# or +ansible-playbook assisted-install.yml -i inventory.ini +``` + +## Troubleshooting + +- Check operator CSV status: `oc get csv -n open-cluster-management` +- Check MultiClusterHub status: `oc get multiclusterhub -n open-cluster-management` +- Check assisted service pods: `oc get pods -n multicluster-engine -l app=assisted-service` +- Check AgentServiceConfig: `oc get agentserviceconfig agent -o yaml` +- Check events: `oc get events -n multicluster-engine --sort-by='.lastTimestamp'` \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml index 4e933c1..0c4d760 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml @@ -22,4 +22,4 @@ assisted_storage_class: assisted-service acm_csv_timeout: 900 multiclusterhub_timeout: 1800 assisted_service_timeout: 600 -metal3_stabilize_timeout: 300 \ No newline at end of file +metal3_stabilize_timeout: 300 diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml index 6273479..0cd9fc7 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml @@ -85,14 +85,37 @@ register: asc_result changed_when: "'created' in asc_result.stdout" -- name: Wait for assisted-service pod to be Running (2/2) - shell: | - oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service \ - --no-headers 2>/dev/null | grep -q '2/2.*Running' - register: assisted_pod - until: assisted_pod.rc == 0 - retries: "{{ (assisted_service_timeout / 15) | int }}" - delay: 15 +- block: + - name: Wait for assisted-service pod to be Running (2/2) + shell: | + oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service \ + --no-headers 2>/dev/null | grep -q '2/2.*Running' + register: assisted_pod + until: assisted_pod.rc == 0 + retries: "{{ (assisted_service_timeout / 15) | int }}" + delay: 15 + rescue: + - name: Collect assisted-service timeout diagnostics + shell: | + echo "=== Assisted Service Pods ===" + oc get pods -n {{ assisted_service_namespace }} 2>/dev/null + echo "" + echo "=== Pod Details ===" + oc describe pods -n {{ assisted_service_namespace }} -l app=assisted-service 2>/dev/null | tail -40 + echo "" + echo "=== Recent Events ===" + oc get events -n {{ assisted_service_namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: assisted_diag + changed_when: false + failed_when: false + + - name: Display assisted-service timeout diagnostics + debug: + msg: "{{ assisted_diag.stdout }}" + + - name: Fail after assisted-service timeout + fail: + msg: "assisted-service pod did not reach Running (2/2) state within timeout" - name: Display assisted-service pod status shell: | @@ -102,4 +125,4 @@ - name: Show assisted-service pod debug: - msg: "{{ pod_status.stdout }}" \ No newline at end of file + msg: "{{ pod_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml index 00dbd95..3a00c4c 100644 --- 
a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml @@ -42,4 +42,4 @@ - name: Display TNF support status debug: - msg: "{{ tnf_verify.stdout | trim }}" \ No newline at end of file + msg: "{{ tnf_verify.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml index ba932f3..cb7dd4f 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml @@ -19,10 +19,13 @@ {"op":"remove","path":"/spec/provisioningDHCPRange"}, {"op":"remove","path":"/spec/provisioningNetworkCIDR"}, {"op":"remove","path":"/spec/provisioningInterface"} - ]' 2>&1 || echo "Some provisioning fields may not exist, continuing" + ]' 2>&1 register: remove_result changed_when: "'patched' in remove_result.stdout" - failed_when: false + failed_when: >- + remove_result.rc != 0 and + 'does not exist' not in remove_result.stderr and + 'does not exist' not in remove_result.stdout - name: Wait for metal3 pod to stabilize after provisioning change shell: | @@ -43,4 +46,4 @@ - name: Show metal3 pod debug: - msg: "{{ metal3_status.stdout }}" \ No newline at end of file + msg: "{{ metal3_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml index 18362ea..caeafae 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml @@ -63,15 +63,35 @@ register: sub_result changed_when: "'created' in sub_result.stdout" -- name: Wait for operator CSV to succeed - shell: | - oc get csv -n {{ op_config.namespace }} --no-headers 2>/dev/null \ - | grep {{ op_config.package_name }} \ - | grep -q Succeeded - register: csv_result - until: csv_result.rc == 0 - retries: "{{ (acm_csv_timeout / 15) | int }}" - delay: 15 +- block: + - name: Wait for operator CSV to succeed + shell: | + oc get csv -n {{ op_config.namespace }} --no-headers 2>/dev/null \ + | grep {{ op_config.package_name }} \ + | grep -q Succeeded + register: csv_result + until: csv_result.rc == 0 + retries: "{{ (acm_csv_timeout / 15) | int }}" + delay: 15 + rescue: + - name: Collect CSV timeout diagnostics + shell: | + echo "=== CSV Status ===" + oc get csv -n {{ op_config.namespace }} 2>/dev/null + echo "" + echo "=== Recent Events ===" + oc get events -n {{ op_config.namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: csv_diag + changed_when: false + failed_when: false + + - name: Display CSV timeout diagnostics + debug: + msg: "{{ csv_diag.stdout }}" + + - name: Fail after CSV timeout + fail: + msg: "Operator CSV did not reach Succeeded state within timeout" - name: Display operator install result shell: | @@ -99,17 +119,40 @@ changed_when: "'created' in mch_result.stdout" when: hub_operator == 'acm' -- name: Wait for MultiClusterHub to reach Running phase - shell: | - oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} \ - -o jsonpath='{.status.phase}' - register: mch_phase - until: mch_phase.stdout == 'Running' - retries: "{{ (multiclusterhub_timeout / 30) | int }}" - delay: 30 +- block: + - name: Wait for 
MultiClusterHub to reach Running phase + shell: | + oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} \ + -o jsonpath='{.status.phase}' + register: mch_phase + until: mch_phase.stdout == 'Running' + retries: "{{ (multiclusterhub_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect MCH timeout diagnostics + shell: | + echo "=== MultiClusterHub Status ===" + oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} -o yaml 2>/dev/null | grep -A 50 'status:' + echo "" + echo "=== Non-Running Pods ===" + oc get pods -n {{ op_config.namespace }} --no-headers 2>/dev/null | grep -v Running + echo "" + echo "=== Recent Events ===" + oc get events -n {{ op_config.namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: mch_diag + changed_when: false + failed_when: false + + - name: Display MCH timeout diagnostics + debug: + msg: "{{ mch_diag.stdout }}" + + - name: Fail after MCH timeout + fail: + msg: "MultiClusterHub did not reach Running phase within timeout" when: hub_operator == 'acm' - name: Display MultiClusterHub status debug: msg: "MultiClusterHub phase: {{ mch_phase.stdout }}" - when: hub_operator == 'acm' \ No newline at end of file + when: hub_operator == 'acm' diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml index 4506150..47c715d 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml @@ -59,4 +59,4 @@ - name: Display storage setup result debug: - msg: "Storage provisioned: hostPath PVs with permissions and SELinux fix on {{ hub_nodes.stdout_lines | length }} nodes" \ No newline at end of file + msg: "Storage provisioned: hostPath PVs with permissions and SELinux fix on {{ hub_nodes.stdout_lines | length }} nodes" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml index 57be824..a5c103a 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml @@ -33,4 +33,4 @@ - name: Display hub cluster status debug: - msg: "Hub cluster healthy: {{ node_count.stdout | trim }} nodes Ready, {{ co_check.stdout | trim }}" \ No newline at end of file + msg: "Hub cluster healthy: {{ node_count.stdout | trim }} nodes Ready, {{ co_check.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 index 0d8527e..dc97a08 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 @@ -19,4 +19,4 @@ spec: - cpuArchitecture: x86_64 openshiftVersion: "{{ hub_ocp_version }}" url: "{{ rhcos_iso_url }}" - version: "{{ rhcos_version }}" \ No newline at end of file + version: "{{ rhcos_version }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 index 2b68364..fce239b 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 +++ 
b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 @@ -4,4 +4,4 @@ metadata: name: multiclusterhub namespace: {{ op_config.namespace }} spec: - availabilityConfig: Basic \ No newline at end of file + availabilityConfig: Basic diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 index f6c3109..6bec2ad 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 @@ -8,4 +8,4 @@ spec: installPlanApproval: Automatic name: {{ op_config.subscription_name }} source: {{ op_config.source }} - sourceNamespace: openshift-marketplace \ No newline at end of file + sourceNamespace: openshift-marketplace diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml index 8f1a561..b4679a9 100644 --- a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml +++ b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml @@ -17,4 +17,4 @@ operator_config: source: redhat-operators # The MCE namespace is always multicluster-engine regardless of hub_operator -assisted_service_namespace: multicluster-engine \ No newline at end of file +assisted_service_namespace: multicluster-engine diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md b/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md new file mode 100644 index 0000000..067b3d8 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md @@ -0,0 +1,132 @@ +# assisted-spoke Role + +Deploys a spoke TNF (Two-Node with Fencing) cluster on a hub via the assisted installer and BareMetalHost resources. + +## Description + +This role creates and installs a spoke TNF cluster on an existing hub that has ACM/MCE and the assisted service configured (via the `acm-install` role). It: + +1. Optionally cleans up existing spoke resources (when `force_cleanup=true`) +2. Creates a dedicated libvirt network for the spoke cluster +3. Creates spoke VMs with the specified resources +4. Verifies sushy-tools (Redfish BMC simulator) is running +5. Creates cluster resources on the hub (ClusterDeployment, AgentClusterInstall, InfraEnv, ClusterImageSet) +6. Creates BareMetalHost resources to trigger agent-based installation +7. Monitors agent registration, cluster installation, and agent completion +8. 
Retrieves spoke cluster credentials (kubeconfig, admin password)
+
+## Requirements
+
+- Hub cluster with ACM/MCE and assisted service configured (run `acm-install` role first)
+- Hub kubeconfig accessible at `~/auth/kubeconfig`
+- libvirt/KVM available on the hypervisor
+- sushy-tools installed for Redfish BMC simulation
+- `oc` and `virsh` CLIs available on the hypervisor
+
+## Role Variables
+
+### Spoke Cluster Identity
+
+- `spoke_cluster_name`: Cluster name, must be DNS-safe (default: `"spoke-tnf"`)
+- `spoke_base_domain`: Base domain for the spoke cluster (default: `"example.com"`)
+- `spoke_release_image`: Release image - `"auto"` uses the hub release image (default: `"auto"`)
+
+### VM Specifications
+
+- `spoke_vm_memory`: Memory per node in MB (default: `32768`)
+- `spoke_vm_vcpus`: CPU cores per node (default: `4`)
+- `spoke_vm_disk_size`: Disk size per node in GB (default: `120`)
+- `spoke_ctlplanes`: Number of control plane nodes, must be 2 for TNF (default: `2`)
+
+### Network Configuration
+
+- `spoke_network_cidr`: Spoke cluster network CIDR (default: `"192.168.125.0/24"`)
+- `spoke_api_vip`: API VIP address (default: `"192.168.125.5"`)
+- `spoke_ingress_vip`: Ingress VIP address (default: `"192.168.125.10"`)
+- `spoke_cluster_network_cidr`: Pod network CIDR (default: `"10.132.0.0/14"`)
+- `spoke_service_network_cidr`: Service network CIDR (default: `"172.31.0.0/16"`)
+- `hub_network_cidr`: Hub network CIDR for cross-bridge nftables rules (default: `"192.168.111.0/24"`)
+
+### BMC / sushy-tools
+
+- `spoke_bmc_user`: BMC username (default: `"admin"`)
+- `spoke_bmc_password`: BMC password (default: `"password"`)
+- `spoke_ksushy_ip`: sushy-tools listen IP (default: `"192.168.111.1"`)
+- `spoke_ksushy_port`: sushy-tools port (default: `8000`)
+
+### Deployment Options
+
+- `force_cleanup`: Remove existing spoke resources before deployment (default: `false`)
+
+### Timeout Variables
+
+- `spoke_install_timeout`: Cluster installation timeout in seconds (default: `3600`)
+- `spoke_agent_register_timeout`: Agent registration timeout (default: `900`)
+- `spoke_credentials_timeout`: Credential retrieval timeout (default: `1800`)
+
+### Computed Variables (vars/main.yml)
+
+These are derived automatically and should not be overridden:
+
+- `spoke_network_gateway`: First IP in spoke CIDR
+- `spoke_dhcp_start` / `spoke_dhcp_end`: DHCP range within spoke CIDR
+- `spoke_network_name`: Libvirt network name (matches `spoke_cluster_name`)
+- `spoke_vm_image_dir`: VM disk image directory (`/var/lib/libvirt/images`)
+- `spoke_auth_dir`: Credential output directory (`~/<spoke_cluster_name>/auth`)
+
+## Task Flow
+
+1. **cleanup.yml** - Removes existing spoke namespace, VMs, network, credentials (when `force_cleanup=true`)
+2. **create-spoke-network.yml** - Creates dedicated libvirt network with DHCP for spoke VMs
+3. **create-spoke-vms.yml** - Creates spoke VM disk images and defines libvirt domains
+4. **setup-ksushy.yml** - Verifies sushy-tools is running for Redfish BMC
+5. **create-cluster-resources.yml** - Creates ClusterDeployment, AgentClusterInstall, InfraEnv, ClusterImageSet on hub
+6. **create-bmh.yml** - Creates BareMetalHost resources that trigger spoke installation
+7. **wait-for-install.yml** - Monitors agent registration, installation progress, and agent completion
+8. **retrieve-credentials.yml** - Extracts kubeconfig and admin password, configures DNS, verifies access
+
+## Usage
+
+This role is not called directly.
It is invoked via `assisted-install.yml`:
+
+```bash
+make deploy fencing-assisted
+# or
+ansible-playbook assisted-install.yml -i inventory.ini
+```
+
+### Configuration
+
+Copy and customize the variables template:
+
+```bash
+cp vars/assisted.yml.template vars/assisted.yml
+# Edit vars/assisted.yml with desired spoke configuration
+```
+
+### Accessing the Spoke Cluster
+
+After deployment:
+
+```bash
+source proxy.env
+KUBECONFIG=~/spoke-tnf/auth/kubeconfig oc get nodes
+```
+
+### Redeployment
+
+To redeploy with cleanup of existing resources:
+
+```bash
+ansible-playbook assisted-install.yml -i inventory.ini -e "force_cleanup=true"
+```
+
+## Troubleshooting
+
+- Check spoke VMs: `sudo virsh list --all | grep spoke`
+- Check agents: `oc get agents -n <spoke_cluster_name>`
+- Check BMH status: `oc get bmh -n <spoke_cluster_name>`
+- Check installation progress: `oc get agentclusterinstall -n <spoke_cluster_name> -o yaml`
+- Check spoke events: `oc get events -n <spoke_cluster_name> --sort-by='.lastTimestamp'`
+- Check sushy-tools: `sudo podman ps | grep sushy-tools`
+- Check spoke network: `sudo virsh net-list | grep <spoke_cluster_name>`
\ No newline at end of file
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml
index 86a371e..bdcc277 100644
--- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml
@@ -36,4 +36,4 @@ spoke_agent_register_timeout: 900
 spoke_credentials_timeout: 1800
 
 # Hub network CIDR (for cross-bridge nftables rules)
-hub_network_cidr: "192.168.111.0/24"
\ No newline at end of file
+hub_network_cidr: "192.168.111.0/24"
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml
index 2dbaa36..9ff463d 100644
--- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml
@@ -6,14 +6,14 @@
     oc delete namespace {{ spoke_cluster_name }} --ignore-not-found --timeout=120s
   register: ns_delete
   changed_when: "'deleted' in ns_delete.stdout"
-  failed_when: false
+  ignore_errors: true
 
 - name: Delete ClusterImageSet
   shell: |
     OCP_VER=$(oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d.
-f1-2) oc delete clusterimageset "${OCP_VER}.0" --ignore-not-found changed_when: false - failed_when: false + ignore_errors: true - name: Destroy spoke VMs shell: | @@ -22,7 +22,7 @@ sudo virsh undefine {{ spoke_cluster_name }}-master-${i} --remove-all-storage 2>/dev/null || true done changed_when: true - failed_when: false + ignore_errors: true - name: Remove spoke VM disk images file: @@ -36,7 +36,7 @@ sudo virsh net-destroy {{ spoke_network_name }} 2>/dev/null || true sudo virsh net-undefine {{ spoke_network_name }} 2>/dev/null || true changed_when: true - failed_when: false + ignore_errors: true - name: Remove spoke credential directory file: @@ -52,4 +52,4 @@ - name: Display cleanup result debug: - msg: "Spoke cluster '{{ spoke_cluster_name }}' resources cleaned up" \ No newline at end of file + msg: "Spoke cluster '{{ spoke_cluster_name }}' resources cleaned up" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml index fb3ceb1..42cfffc 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml @@ -60,4 +60,4 @@ - name: Show BMH resources debug: - msg: "{{ bmh_status.stdout }}" \ No newline at end of file + msg: "{{ bmh_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml index fa1962c..285b20c 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml @@ -84,4 +84,4 @@ Spoke cluster resources created: ClusterImageSet={{ effective_ocp_version }}.0, ClusterDeployment={{ spoke_cluster_name }}, AgentClusterInstall={{ spoke_cluster_name }}, - InfraEnv={{ spoke_cluster_name }} \ No newline at end of file + InfraEnv={{ spoke_cluster_name }} diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml index 4384bdf..15347a6 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml @@ -45,4 +45,4 @@ - name: Display network setup result debug: - msg: "Spoke network '{{ spoke_network_name }}' on {{ spoke_network_cidr }} ready, cross-bridge rules: {{ nft_result.stdout | trim }}" \ No newline at end of file + msg: "Spoke network '{{ spoke_network_name }}' on {{ spoke_network_cidr }} ready, cross-bridge rules: {{ nft_result.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml index e29487e..e59a541 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml @@ -72,4 +72,4 @@ - name: Display spoke VM info debug: msg: "VM {{ item.name }}: UUID={{ item.uuid }}, MAC={{ item.mac }}" - loop: "{{ spoke_vms }}" \ No newline at end of file + loop: "{{ spoke_vms }}" diff --git 
a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml index 15b15e4..9b72882 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml @@ -7,14 +7,35 @@ state: directory mode: '0700' -- name: Wait for admin-kubeconfig secret - shell: | - oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ - -n {{ spoke_cluster_name }} -o name 2>/dev/null - register: kubeconfig_secret - until: kubeconfig_secret.rc == 0 - retries: 10 - delay: 15 +- block: + - name: Wait for admin-kubeconfig secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: kubeconfig_secret + until: kubeconfig_secret.rc == 0 + retries: 10 + delay: 15 + rescue: + - name: Collect kubeconfig secret timeout diagnostics + shell: | + echo "=== Secrets in namespace ===" + oc get secrets -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== AgentClusterInstall State ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null + register: kube_secret_diag + changed_when: false + failed_when: false + + - name: Display kubeconfig secret timeout diagnostics + debug: + msg: "{{ kube_secret_diag.stdout }}" + + - name: Fail after kubeconfig secret timeout + fail: + msg: "admin-kubeconfig secret not found within timeout" - name: Extract spoke kubeconfig shell: | @@ -30,14 +51,35 @@ dest: "{{ spoke_auth_dir }}/kubeconfig" mode: '0600' -- name: Wait for admin-password secret - shell: | - oc get secret {{ spoke_cluster_name }}-admin-password \ - -n {{ spoke_cluster_name }} -o name 2>/dev/null - register: password_secret - until: password_secret.rc == 0 - retries: "{{ (spoke_credentials_timeout / 30) | int }}" - delay: 30 +- block: + - name: Wait for admin-password secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-password \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: password_secret + until: password_secret.rc == 0 + retries: "{{ (spoke_credentials_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect password secret timeout diagnostics + shell: | + echo "=== Secrets in namespace ===" + oc get secrets -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== AgentClusterInstall State ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null + register: pwd_secret_diag + changed_when: false + failed_when: false + + - name: Display password secret timeout diagnostics + debug: + msg: "{{ pwd_secret_diag.stdout }}" + + - name: Fail after password secret timeout + fail: + msg: "admin-password secret not found within timeout" - name: Extract spoke admin password shell: | @@ -73,7 +115,6 @@ loop: "{{ range(spoke_ctlplanes) | list }}" register: vm_start_result changed_when: "'STARTED' in vm_start_result.stdout" - failed_when: false become: true - name: Wait for spoke VMs to boot @@ -81,14 +122,34 @@ seconds: 120 when: vm_start_result.results | selectattr('stdout', 'search', 'STARTED') | list | length > 0 -- name: Verify spoke cluster access - shell: | - KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes - register: spoke_nodes - changed_when: false - retries: 20 - delay: 30 - until: spoke_nodes.rc 
== 0 +- block: + - name: Verify spoke cluster access + shell: | + KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes + register: spoke_nodes + changed_when: false + retries: 20 + delay: 30 + until: spoke_nodes.rc == 0 + rescue: + - name: Collect spoke access timeout diagnostics + shell: | + echo "=== VM Status ===" + sudo virsh list --all 2>/dev/null | grep {{ spoke_cluster_name }} || echo "No spoke VMs found" + echo "" + echo "=== Last oc error ===" + KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes 2>&1 || true + register: spoke_access_diag + changed_when: false + failed_when: false + + - name: Display spoke access timeout diagnostics + debug: + msg: "{{ spoke_access_diag.stdout }}" + + - name: Fail after spoke access timeout + fail: + msg: "Could not access spoke cluster within timeout" - name: Display spoke cluster nodes debug: @@ -97,4 +158,4 @@ - name: Set spoke kubeconfig path as fact for post-deployment tasks set_fact: spoke_kubeconfig_path: "{{ spoke_auth_dir }}/kubeconfig" - spoke_kubeadmin_password_path: "{{ spoke_auth_dir }}/kubeadmin-password" \ No newline at end of file + spoke_kubeadmin_password_path: "{{ spoke_auth_dir }}/kubeadmin-password" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml index 88a4026..49ce70a 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml @@ -31,4 +31,4 @@ fail_msg: >- Expected at least {{ spoke_ctlplanes + 2 }} systems in sushy-tools ({{ spoke_ctlplanes }} spoke + 2 hub), but found {{ sushy_systems.stdout }}. - success_msg: "sushy-tools has {{ sushy_systems.stdout }} systems visible ({{ spoke_ctlplanes }} spoke + 2 hub)" \ No newline at end of file + success_msg: "sushy-tools has {{ sushy_systems.stdout }} systems visible ({{ spoke_ctlplanes }} spoke + 2 hub)" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml index e0dd662..a5bae64 100644 --- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml @@ -1,13 +1,39 @@ --- # Monitor BMH provisioning, agent registration, and installation progress -- name: Wait for agents to register - shell: | - oc get agents -n {{ spoke_cluster_name }} --no-headers 2>/dev/null | wc -l - register: agent_count - until: agent_count.stdout | int >= spoke_ctlplanes - retries: "{{ (spoke_agent_register_timeout / 30) | int }}" - delay: 30 +- block: + - name: Wait for agents to register + shell: | + oc get agents -n {{ spoke_cluster_name }} --no-headers 2>/dev/null | wc -l + register: agent_count + until: agent_count.stdout | int >= spoke_ctlplanes + retries: "{{ (spoke_agent_register_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect agent registration timeout diagnostics + shell: | + echo "=== Agents ({{ agent_count.stdout | default('0') }} / {{ spoke_ctlplanes }} registered) ===" + oc get agents -n {{ spoke_cluster_name }} 2>/dev/null || echo "No agents found" + echo "" + echo "=== BareMetalHosts ===" + oc get bmh -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== InfraEnv Status ===" + oc get infraenv -n {{ spoke_cluster_name }} -o yaml 2>/dev/null | grep -A 20 'status:' + echo "" + echo "=== Recent 
Events ===" + oc get events -n {{ spoke_cluster_name }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: agent_reg_diag + changed_when: false + failed_when: false + + - name: Display agent registration timeout diagnostics + debug: + msg: "{{ agent_reg_diag.stdout }}" + + - name: Fail after agent registration timeout + fail: + msg: "Expected {{ spoke_ctlplanes }} agents but only {{ agent_count.stdout | default('0') }} registered within timeout" - name: Display registered agents shell: | @@ -19,68 +45,125 @@ debug: msg: "{{ agents_info.stdout }}" -- name: Wait for spoke cluster installation to complete - shell: | - ACI_STATE=$(oc get agentclusterinstall {{ spoke_cluster_name }} \ - -n {{ spoke_cluster_name }} \ - -o jsonpath='{.status.debugInfo.state}' 2>/dev/null) - echo "$ACI_STATE" - case "$ACI_STATE" in - adding-hosts|installed) - exit 0 - ;; - error|failed) - echo "INSTALL FAILED" - oc get agentclusterinstall {{ spoke_cluster_name }} \ +- block: + - name: Wait for spoke cluster installation to complete + shell: | + ACI_STATE=$(oc get agentclusterinstall {{ spoke_cluster_name }} \ -n {{ spoke_cluster_name }} \ - -o jsonpath='{.status.conditions}' 2>/dev/null | python3 -m json.tool - exit 2 - ;; - *) - exit 1 - ;; - esac - register: install_state - until: install_state.rc == 0 - retries: "{{ (spoke_install_timeout / 30) | int }}" - delay: 30 - failed_when: install_state.rc == 2 + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null) + echo "$ACI_STATE" + case "$ACI_STATE" in + adding-hosts|installed) + exit 0 + ;; + error|failed) + echo "INSTALL FAILED" + oc get agentclusterinstall {{ spoke_cluster_name }} \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.conditions}' 2>/dev/null | python3 -m json.tool + exit 2 + ;; + *) + exit 1 + ;; + esac + register: install_state + until: install_state.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + failed_when: install_state.rc == 2 + rescue: + - name: Collect installation timeout diagnostics + shell: | + echo "=== AgentClusterInstall Status ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o yaml 2>/dev/null | grep -A 30 'status:' + echo "" + echo "=== Agents ===" + oc get agents -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== ClusterDeployment ===" + oc get clusterdeployment {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o yaml 2>/dev/null | grep -A 20 'status:' + echo "" + echo "=== Recent Events ===" + oc get events -n {{ spoke_cluster_name }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: install_diag + changed_when: false + failed_when: false + + - name: Display installation timeout diagnostics + debug: + msg: "{{ install_diag.stdout }}" + + - name: Fail after installation timeout + fail: + msg: "Spoke cluster installation did not complete within timeout. 
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml
index 6434d77..ce4bd5d 100644
--- a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml
@@ -14,4 +14,4 @@ spoke_network_name: "{{ spoke_cluster_name }}"
 spoke_vm_image_dir: /var/lib/libvirt/images
 
 # Credential output paths
-spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth"
\ No newline at end of file
+spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth"
diff --git a/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh
index eba97c4..980511b 100755
--- a/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh
+++ b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh
@@ -35,6 +35,12 @@ echo "Deploying spoke TNF cluster via assisted installer..."
 
 cd "${DEPLOY_DIR}/openshift-clusters"
 
+# Parse spoke_cluster_name from vars/assisted.yml
+SPOKE_CLUSTER_NAME=$(grep '^spoke_cluster_name:' vars/assisted.yml | awk '{print $2}' | tr -d '"' | tr -d "'")
+if [[ -z "${SPOKE_CLUSTER_NAME}" ]]; then
+  SPOKE_CLUSTER_NAME="spoke-tnf"
+fi
+
 if ansible-playbook assisted-install.yml -i inventory.ini; then
   echo ""
   echo "OpenShift spoke TNF cluster deployment via assisted installer completed successfully!"
@@ -42,7 +48,7 @@ if ansible-playbook assisted-install.yml -i inventory.ini; then
   echo "Next steps:"
   echo "1. Access spoke cluster:"
   echo "   source ${DEPLOY_DIR}/openshift-clusters/proxy.env"
-  echo "   KUBECONFIG=~/spoke-tnf/auth/kubeconfig oc get nodes"
+  echo "   KUBECONFIG=~/${SPOKE_CLUSTER_NAME}/auth/kubeconfig oc get nodes"
   echo "2. Access hub cluster:"
   echo "   source ${DEPLOY_DIR}/openshift-clusters/hub-proxy.env"
   echo "   KUBECONFIG=~/auth/kubeconfig oc get nodes"
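The grep/awk parse above only understands a plain, single-line `spoke_cluster_name: value` entry at the top level of the file. Since the host already runs Ansible, PyYAML should be available, and a YAML-aware variant is sturdier. A hedged alternative sketch, not what the patch ships:

    # Read the key with a real YAML parser; fall back to the same default.
    SPOKE_CLUSTER_NAME=$(python3 -c 'import yaml; print(yaml.safe_load(open("vars/assisted.yml")).get("spoke_cluster_name", ""))' 2>/dev/null)
    : "${SPOKE_CLUSTER_NAME:=spoke-tnf}"

Either way the value only feeds the printed next-step hints, so a wrong guess is cosmetic rather than fatal.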
diff --git a/deploy/openshift-clusters/vars/assisted.yml.template b/deploy/openshift-clusters/vars/assisted.yml.template
index dc4f0d0..6c33ce5 100644
--- a/deploy/openshift-clusters/vars/assisted.yml.template
+++ b/deploy/openshift-clusters/vars/assisted.yml.template
@@ -28,12 +28,18 @@ spoke_vm_disk_size: 120  # GB
 spoke_ctlplanes: 2  # Always 2 for TNF
 
 # Spoke network configuration
+# DHCP range is auto-computed as .50-.150 of the CIDR.
+# VIPs must be outside that range to avoid conflicts.
 spoke_network_cidr: "192.168.125.0/24"
 spoke_api_vip: "192.168.125.5"
 spoke_ingress_vip: "192.168.125.10"
 spoke_cluster_network_cidr: "10.132.0.0/14"
 spoke_service_network_cidr: "172.31.0.0/16"
 
+# Hub network CIDR (for cross-bridge nftables rules between hub and spoke)
+# Must match the hub cluster's libvirt network. Default matches dev-scripts.
+hub_network_cidr: "192.168.111.0/24"
+
 # BMC / sushy-tools (defaults match dev-scripts deployment)
 spoke_bmc_user: admin
 spoke_bmc_password: password
@@ -45,4 +51,4 @@ spoke_ksushy_port: 8000
 
 assisted_storage_method: "hostpath"
 
 # Deployment options
-force_cleanup: false
\ No newline at end of file
+force_cleanup: false
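Because the DHCP pool is derived as .50-.150 of spoke_network_cidr, edited VIPs are worth sanity-checking before a deploy. A minimal sketch, assuming a /24 network as in these defaults:

    # Hypothetical pre-flight check; the .50-.150 convention comes from the
    # template comment above, and the addresses are the template defaults.
    CIDR="192.168.125.0/24"
    PREFIX="${CIDR%.*}"            # strips ".0/24", leaving 192.168.125
    echo "DHCP pool: ${PREFIX}.50 - ${PREFIX}.150"
    for vip in 192.168.125.5 192.168.125.10; do
      last="${vip##*.}"
      if (( last >= 50 && last <= 150 )); then
        echo "WARNING: ${vip} collides with the DHCP pool"
      else
        echo "OK: ${vip} is outside the DHCP pool"
      fi
    done

The shipped defaults (.5 for the API VIP, .10 for ingress) already sit below the pool.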