diff --git a/CLAUDE.md b/CLAUDE.md index ea34093..93645e6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,8 +13,9 @@ Two-Node Toolbox (TNF) is a comprehensive deployment automation framework for Op # From the deploy/ directory: # Deploy AWS hypervisor and cluster in one command -make deploy arbiter-ipi # Deploy arbiter topology cluster +make deploy arbiter-ipi # Deploy arbiter topology cluster make deploy fencing-ipi # Deploy fencing topology cluster +make deploy fencing-assisted # Deploy hub + spoke TNF via assisted installer # Instance lifecycle management make create # Create new EC2 instance @@ -70,6 +71,15 @@ ansible-playbook kcli-install.yml -i inventory.ini -e "test_cluster_name=my-clus ansible-playbook kcli-install.yml -i inventory.ini -e "force_cleanup=true" ``` +#### Assisted Installer Method (Spoke TNF via ACM) +```bash +# Copy and customize the configuration template +cp vars/assisted.yml.template vars/assisted.yml + +# Deploy hub + spoke TNF cluster via assisted installer +make deploy fencing-assisted +``` + ### Linting and Validation ```bash # Shell script linting (from repository root) @@ -88,14 +98,17 @@ make shellcheck - Automatic inventory management for Ansible integration 2. **OpenShift Cluster Deployment** (`deploy/openshift-clusters/`) - - Two deployment methods: dev-scripts (traditional) and kcli (modern) + - Three deployment methods: dev-scripts (traditional), kcli (modern), and assisted installer (spoke via ACM) - Ansible roles for complete cluster automation - Support for both arbiter and fencing topologies + - Assisted installer deploys spoke TNF clusters on an existing hub via ACM/MCE - Proxy configuration for external cluster access 3. **Ansible Roles Architecture**: - `dev-scripts/install-dev`: Traditional deployment using openshift-metal3/dev-scripts - `kcli/kcli-install`: Modern deployment using kcli virtualization management + - `assisted/acm-install`: Install ACM/MCE + assisted service + enable TNF on hub + - `assisted/assisted-spoke`: Deploy spoke TNF cluster via assisted installer + BMH - `proxy-setup`: Squid proxy for cluster external access - `redfish`: Automated stonith configuration for fencing topology - `config`: SSH key and git configuration @@ -119,8 +132,15 @@ make shellcheck - `roles/kcli/kcli-install/files/pull-secret.json`: OpenShift pull secret - SSH key automatically read from `~/.ssh/id_ed25519.pub` on ansible controller +#### Assisted Installer Method +- `vars/assisted.yml`: Variable override file (copy from `vars/assisted.yml.template`) +- Hub cluster must be deployed first via dev-scripts (`make deploy fencing-ipi`) +- Spoke credentials output to `~//auth/` on hypervisor +- Hub proxy preserved as `hub-proxy.env` + #### Generated Files - `proxy.env`: Generated proxy configuration (source this to access cluster) +- `hub-proxy.env`: Hub proxy config (preserved when spoke proxy is configured) - `kubeconfig`: OpenShift cluster kubeconfig - `kubeadmin-password`: Default admin password @@ -128,7 +148,7 @@ make shellcheck 1. **Environment Setup**: Use `deploy/aws-hypervisor/` tools or bring your own RHEL 9 host 2. **Configuration**: Edit inventory and config files based on chosen deployment method -3. **Deployment**: Run appropriate Ansible playbook (setup.yml or kcli-install.yml) +3. **Deployment**: Run appropriate Ansible playbook (setup.yml, kcli-install.yml, or assisted-install.yml) 4. **Access**: Source `proxy.env` and use `oc` commands or WebUI through proxy 5. 
**Cleanup**: Use cleanup make targets or Ansible playbooks diff --git a/deploy/Makefile b/deploy/Makefile index a6fd992..f69ecc5 100644 --- a/deploy/Makefile +++ b/deploy/Makefile @@ -55,6 +55,10 @@ arbiter-ipi: arbiter-agent: @./openshift-clusters/scripts/deploy-arbiter-agent.sh +fencing-assisted: + @$(MAKE) fencing-ipi + @./openshift-clusters/scripts/deploy-fencing-assisted.sh + patch-nodes: @./openshift-clusters/scripts/patch-nodes.sh get-tnf-logs: @@ -82,6 +86,7 @@ help: @echo " fencing-agent - Deploy fencing Agent cluster (non-interactive) (WIP Experimental)" @echo " arbiter-ipi - Deploy arbiter IPI cluster (non-interactive)" @echo " arbiter-agent - Deploy arbiter Agent cluster (non-interactive)" + @echo " fencing-assisted - Deploy hub + spoke TNF cluster via assisted installer" @echo " redeploy-cluster - Redeploy OpenShift cluster using dev-scripts make redeploy" @echo " shutdown-cluster - Shutdown OpenShift cluster VMs in orderly fashion" @echo " startup-cluster - Start up OpenShift cluster VMs and proxy container" diff --git a/deploy/aws-hypervisor/scripts/create.sh b/deploy/aws-hypervisor/scripts/create.sh index 99d5737..1288d9c 100755 --- a/deploy/aws-hypervisor/scripts/create.sh +++ b/deploy/aws-hypervisor/scripts/create.sh @@ -42,7 +42,7 @@ echo -e "AMI ID: $RHEL_HOST_AMI" echo -e "Machine Type: $EC2_INSTANCE_TYPE" ec2Type="VirtualMachine" -if [[ "$EC2_INSTANCE_TYPE" =~ c[0-9]+[gn].metal ]]; then +if [[ "$EC2_INSTANCE_TYPE" =~ c[0-9]+[a-z]*.metal ]]; then ec2Type="MetalMachine" fi diff --git a/deploy/openshift-clusters/assisted-install.yml b/deploy/openshift-clusters/assisted-install.yml new file mode 100644 index 0000000..ccff642 --- /dev/null +++ b/deploy/openshift-clusters/assisted-install.yml @@ -0,0 +1,154 @@ +--- +# Deploy a spoke TNF cluster via ACM/assisted installer on an existing hub cluster. +# +# Prerequisites: +# - vars/assisted.yml exists (copy from vars/assisted.yml.template) +# +# Usage: +# make deploy fencing-assisted + +- hosts: metal_machine + gather_facts: yes + + vars: + topology: fencing + interactive_mode: false + pull_secret_path: /opt/dev-scripts/pull_secret.json + hub_kubeconfig: "{{ ansible_user_dir }}/auth/kubeconfig" + method: assisted + cluster_state_dir: "../aws-hypervisor/instance-data" + cluster_state_filename: "cluster-vm-state.json" + + vars_files: + - vars/assisted.yml + + pre_tasks: + - name: Check that proxy.env exists (hub must be deployed first) + stat: + path: "{{ playbook_dir }}/proxy.env" + delegate_to: localhost + register: proxy_env_check + + - name: Fail if proxy.env is missing + fail: + msg: >- + proxy.env not found. The hub cluster must be deployed first + using 'make deploy fencing-ipi'. proxy.env is required for + cluster access. + when: not proxy_env_check.stat.exists + + - name: Check that hub kubeconfig exists + stat: + path: "{{ ansible_user_dir }}/auth/kubeconfig" + register: hub_kubeconfig_check + + - name: Fail if hub kubeconfig is missing + fail: + msg: >- + Hub kubeconfig not found at ~/auth/kubeconfig. + The hub cluster must be deployed first. 
+ when: not hub_kubeconfig_check.stat.exists + + - name: Preserve hub proxy.env as hub-proxy.env + copy: + src: "{{ playbook_dir }}/proxy.env" + dest: "{{ playbook_dir }}/hub-proxy.env" + remote_src: no + backup: no + delegate_to: localhost + + - name: Get hub release image + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.image}' + register: hub_release_image_raw + changed_when: false + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" + + - name: Get hub OCP version + shell: | + oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2 + register: hub_ocp_version_raw + changed_when: false + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" + + - name: Set hub release facts + set_fact: + hub_release_image: "{{ hub_release_image_raw.stdout }}" + hub_ocp_version: "{{ hub_ocp_version_raw.stdout }}" + effective_release_image: >- + {{ hub_release_image_raw.stdout if spoke_release_image == 'auto' + else spoke_release_image }} + effective_ocp_version: "{{ hub_ocp_version_raw.stdout }}" + + - name: Display assisted installer configuration + debug: + msg: | + Assisted Installer Configuration: + Hub operator: {{ hub_operator }} + ACM/MCE channel: {{ acm_channel if hub_operator == 'acm' else mce_channel }} + Spoke cluster: {{ spoke_cluster_name }}.{{ spoke_base_domain }} + Spoke release image: {{ spoke_release_image }} + Spoke VMs: {{ spoke_ctlplanes }}x ({{ spoke_vm_vcpus }} vCPUs, {{ spoke_vm_memory }}MB RAM, {{ spoke_vm_disk_size }}GB disk) + Spoke network: {{ spoke_network_cidr }} + API VIP: {{ spoke_api_vip }} + Ingress VIP: {{ spoke_ingress_vip }} + Storage method: {{ assisted_storage_method }} + Force cleanup: {{ force_cleanup }} + + - name: Update cluster state to deploying + include_role: + name: common + tasks_from: cluster-state + vars: + cluster_state_phase: 'deploying' + default_playbook_name: 'assisted-install.yml' + num_masters: "{{ spoke_ctlplanes }}" + num_workers: 0 + + roles: + - role: assisted/acm-install + - role: assisted/assisted-spoke + + post_tasks: + - name: Setup proxy access for spoke cluster + include_role: + name: proxy-setup + vars: + kubeconfig_path: "{{ spoke_kubeconfig_path }}" + kubeadmin_password_path: "{{ spoke_kubeadmin_password_path }}" + + - name: Update cluster inventory with spoke VMs + include_role: + name: common + tasks_from: update-cluster-inventory + vars: + test_cluster_name: "{{ spoke_cluster_name }}" + + - name: Update cluster state to deployed + include_role: + name: common + tasks_from: cluster-state + vars: + cluster_state_phase: 'deployed' + default_playbook_name: 'assisted-install.yml' + num_masters: "{{ spoke_ctlplanes }}" + num_workers: 0 + + - name: Display deployment summary + debug: + msg: | + Spoke TNF cluster deployed successfully! 
+ + Spoke credentials: + Kubeconfig: {{ spoke_kubeconfig_path }} + Admin password: {{ spoke_kubeadmin_password_path }} + + Access spoke cluster: + source proxy.env + KUBECONFIG={{ spoke_kubeconfig_path }} oc get nodes + + Access hub cluster: + source hub-proxy.env + KUBECONFIG=~/auth/kubeconfig oc get nodes diff --git a/deploy/openshift-clusters/collections/requirements.yml b/deploy/openshift-clusters/collections/requirements.yml index 291137f..4f4bdfd 100644 --- a/deploy/openshift-clusters/collections/requirements.yml +++ b/deploy/openshift-clusters/collections/requirements.yml @@ -13,3 +13,5 @@ collections: version: ">=2.0" - name: community.general version: ">=5.0.0" + - name: ansible.utils + version: ">=2.0.0" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/README.md b/deploy/openshift-clusters/roles/assisted/acm-install/README.md new file mode 100644 index 0000000..8e6a125 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/README.md @@ -0,0 +1,79 @@ +# acm-install Role + +Installs ACM or MCE operator on a hub cluster and configures the assisted installer service for spoke TNF cluster deployment. + +## Description + +This role prepares an existing hub OpenShift cluster to deploy spoke TNF clusters via the assisted installer. It: + +1. Validates hub cluster health and prerequisites +2. Provisions hostPath storage for the assisted service +3. Installs the ACM or MCE operator (auto-detects channel) +4. Creates the AgentServiceConfig with RHCOS ISO auto-extracted from the hub release image +5. Enables TNF cluster support in the assisted service +6. Configures BMO to watch all namespaces and disables the provisioning network + +## Requirements + +- A running hub OpenShift cluster (deployed via `make deploy fencing-ipi` or equivalent) +- Hub kubeconfig accessible at `~/auth/kubeconfig` +- Pull secret with access to required registries +- `oc` CLI available on the hypervisor + +## Role Variables + +### Configurable Variables (defaults/main.yml) + +- `hub_operator`: Operator to install - `"acm"` or `"mce"` (default: `"acm"`) +- `acm_channel`: ACM operator channel - `"auto"` detects from packagemanifest (default: `"auto"`) +- `mce_channel`: MCE operator channel (default: `"auto"`) +- `assisted_storage_method`: Storage backend - currently only `"hostpath"` (default: `"hostpath"`) +- `assisted_images_path`: Host directory for ISO images (default: `/var/lib/assisted-images`) +- `assisted_db_path`: Host directory for database (default: `/var/lib/assisted-db`) +- `assisted_images_size`: PV size for images (default: `50Gi`) +- `assisted_db_size`: PV size for database (default: `10Gi`) +- `assisted_storage_class`: StorageClass name (default: `assisted-service`) + +### Timeout Variables + +- `acm_csv_timeout`: Operator CSV install timeout in seconds (default: `900`) +- `multiclusterhub_timeout`: MultiClusterHub readiness timeout (default: `1800`) +- `assisted_service_timeout`: Assisted service pod readiness timeout (default: `600`) +- `metal3_stabilize_timeout`: Metal3 pod stabilization timeout after provisioning changes (default: `300`) + +### Variables Set by Playbook + +These are set in `assisted-install.yml` and passed to the role: + +- `hub_kubeconfig`: Path to hub cluster kubeconfig +- `pull_secret_path`: Path to pull secret on the hypervisor +- `hub_release_image`: Hub cluster release image (extracted in playbook pre_tasks) +- `hub_ocp_version`: Hub OCP version major.minor (extracted in playbook pre_tasks) +- `effective_release_image`: Release image to use 
for the spoke (hub image or user override) + +## Task Flow + +1. **validate.yml** - Checks hub cluster health, node readiness, and API access +2. **storage.yml** - Creates hostPath PVs, StorageClass, and fixes permissions/SELinux on hub nodes +3. **install-operator.yml** - Installs ACM/MCE operator subscription, waits for CSV, creates MultiClusterHub +4. **agent-service-config.yml** - Extracts RHCOS ISO URL from release image, creates AgentServiceConfig +5. **enable-tnf.yml** - Enables TNF support in assisted service configuration +6. **enable-watch-all-namespaces.yml** - Patches Provisioning CR to enable BMO in all namespaces + +## Usage + +This role is not called directly. It is invoked via `assisted-install.yml`: + +```bash +make deploy fencing-assisted +# or +ansible-playbook assisted-install.yml -i inventory.ini +``` + +## Troubleshooting + +- Check operator CSV status: `oc get csv -n open-cluster-management` +- Check MultiClusterHub status: `oc get multiclusterhub -n open-cluster-management` +- Check assisted service pods: `oc get pods -n multicluster-engine -l app=assisted-service` +- Check AgentServiceConfig: `oc get agentserviceconfig agent -o yaml` +- Check events: `oc get events -n multicluster-engine --sort-by='.lastTimestamp'` \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml new file mode 100644 index 0000000..0c4d760 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/defaults/main.yml @@ -0,0 +1,25 @@ +--- +# Default variables for acm-install role + +# Hub operator to install: "acm" or "mce" +hub_operator: acm + +# ACM/MCE channel: "auto" detects from packagemanifest +acm_channel: "auto" +mce_channel: "auto" + +# Storage method for assisted service: "hostpath" +assisted_storage_method: "hostpath" + +# hostPath directories on hub nodes +assisted_images_path: /var/lib/assisted-images +assisted_db_path: /var/lib/assisted-db +assisted_images_size: 50Gi +assisted_db_size: 10Gi +assisted_storage_class: assisted-service + +# Timeouts (seconds) +acm_csv_timeout: 900 +multiclusterhub_timeout: 1800 +assisted_service_timeout: 600 +metal3_stabilize_timeout: 300 diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml new file mode 100644 index 0000000..0cd9fc7 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/agent-service-config.yml @@ -0,0 +1,128 @@ +--- +# Create AgentServiceConfig with RHCOS ISO auto-extracted from release image +# hub_release_image, hub_ocp_version, effective_release_image are set by playbook pre_tasks + +- name: Extract RHCOS ISO URL from release image + shell: | + # Get the machine-os-images reference from the release image + RHCOS_REF=$(oc adm release info "{{ effective_release_image }}" \ + --registry-config="{{ pull_secret_path }}" \ + --image-for=machine-os-images 2>/dev/null) + if [ -z "$RHCOS_REF" ]; then + echo "FAILED: Could not extract machine-os-images from release image" + exit 1 + fi + # Extract the RHCOS ISO URL from the image labels/annotations + oc image info "$RHCOS_REF" --registry-config="{{ pull_secret_path }}" \ + -o json 2>/dev/null \ + | python3 -c " + import json, sys + data = json.load(sys.stdin) + labels = data.get('config', {}).get('config', {}).get('Labels', {}) + stream = labels.get('coreos.stream', '') + version = 
labels.get('version', '') + if stream and version: + url = f'https://rhcos.mirror.openshift.com/art/storage/prod/streams/{stream}/builds/{version}/x86_64/rhcos-{version}-live-iso.x86_64.iso' + print(url) + else: + print('NEEDS_FALLBACK') + " + register: rhcos_iso_extraction + changed_when: false + failed_when: >- + rhcos_iso_extraction.rc != 0 or + 'FAILED' in rhcos_iso_extraction.stdout + +- name: Try fallback RHCOS ISO extraction via coreos print-stream-json + shell: | + rm -rf /tmp/oc-extract && mkdir -p /tmp/oc-extract + RHCOS_URL=$(oc adm release extract "{{ effective_release_image }}" \ + --registry-config="{{ pull_secret_path }}" \ + --command=openshift-install --to=/tmp/oc-extract 2>/dev/null && \ + /tmp/oc-extract/openshift-install coreos print-stream-json 2>/dev/null \ + | python3 -c " + import json, sys + data = json.load(sys.stdin) + iso = data['architectures']['x86_64']['artifacts']['metal']['formats']['iso']['disk'] + print(iso['location']) + " 2>/dev/null) || true + rm -rf /tmp/oc-extract + if [ -n "$RHCOS_URL" ]; then + echo "$RHCOS_URL" + else + echo "FAILED" + fi + register: rhcos_iso_fallback + changed_when: false + when: "'NEEDS_FALLBACK' in rhcos_iso_extraction.stdout" + +- name: Set RHCOS ISO URL fact + set_fact: + rhcos_iso_url: >- + {{ (rhcos_iso_fallback.stdout | default(rhcos_iso_extraction.stdout)) | trim }} + failed_when: >- + rhcos_iso_url == 'FAILED' or + rhcos_iso_url == 'NEEDS_FALLBACK' or + rhcos_iso_url == '' + +- name: Display RHCOS ISO URL + debug: + msg: "RHCOS ISO URL: {{ rhcos_iso_url }}" + +- name: Get RHCOS version from ISO URL + set_fact: + rhcos_version: "{{ rhcos_iso_url | regex_search('rhcos-([\\d.]+-\\d+)-live', '\\1') | first }}" + +- name: Create AgentServiceConfig + template: + src: agentserviceconfig.yml.j2 + dest: /tmp/agentserviceconfig.yml + mode: '0644' + +- name: Apply AgentServiceConfig + shell: | + oc apply -f /tmp/agentserviceconfig.yml + register: asc_result + changed_when: "'created' in asc_result.stdout" + +- block: + - name: Wait for assisted-service pod to be Running (2/2) + shell: | + oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service \ + --no-headers 2>/dev/null | grep -q '2/2.*Running' + register: assisted_pod + until: assisted_pod.rc == 0 + retries: "{{ (assisted_service_timeout / 15) | int }}" + delay: 15 + rescue: + - name: Collect assisted-service timeout diagnostics + shell: | + echo "=== Assisted Service Pods ===" + oc get pods -n {{ assisted_service_namespace }} 2>/dev/null + echo "" + echo "=== Pod Details ===" + oc describe pods -n {{ assisted_service_namespace }} -l app=assisted-service 2>/dev/null | tail -40 + echo "" + echo "=== Recent Events ===" + oc get events -n {{ assisted_service_namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: assisted_diag + changed_when: false + failed_when: false + + - name: Display assisted-service timeout diagnostics + debug: + msg: "{{ assisted_diag.stdout }}" + + - name: Fail after assisted-service timeout + fail: + msg: "assisted-service pod did not reach Running (2/2) state within timeout" + +- name: Display assisted-service pod status + shell: | + oc get pods -n {{ assisted_service_namespace }} -l app=assisted-service + register: pod_status + changed_when: false + +- name: Show assisted-service pod + debug: + msg: "{{ pod_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml new file mode 100644 index 
0000000..3a00c4c --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-tnf.yml @@ -0,0 +1,45 @@ +--- +# Enable TNF cluster support in assisted service +# Requires both a ConfigMap AND an annotation on AgentServiceConfig + +- name: Create assisted-unsupported-config ConfigMap + shell: | + oc apply -f - <<'EOF' + apiVersion: v1 + kind: ConfigMap + metadata: + name: assisted-unsupported-config + namespace: {{ assisted_service_namespace }} + data: + TNF_CLUSTERS_SUPPORT: "true" + EOF + register: cm_result + changed_when: "'created' in cm_result.stdout" + +- name: Annotate AgentServiceConfig to mount unsupported config + shell: | + oc annotate agentserviceconfig agent \ + unsupported.agent-install.openshift.io/assisted-service-configmap=assisted-unsupported-config \ + --overwrite + register: annotate_result + changed_when: "'annotated' in annotate_result.stdout" + +- name: Wait for assisted-service rollout after annotation + shell: | + oc rollout status deployment/assisted-service \ + -n {{ assisted_service_namespace }} --timeout=120s + register: rollout_result + changed_when: false + +- name: Verify TNF support is enabled + shell: | + oc exec -n {{ assisted_service_namespace }} \ + $(oc get pod -n {{ assisted_service_namespace }} -l app=assisted-service -o name | head -1) \ + -c assisted-service -- env | grep -i TNF_CLUSTERS_SUPPORT + register: tnf_verify + changed_when: false + failed_when: "'TNF_CLUSTERS_SUPPORT=true' not in tnf_verify.stdout" + +- name: Display TNF support status + debug: + msg: "{{ tnf_verify.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml new file mode 100644 index 0000000..cb7dd4f --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/enable-watch-all-namespaces.yml @@ -0,0 +1,49 @@ +--- +# Patch Provisioning CR to: +# 1. Enable watchAllNamespaces (BMO processes BMHs in all namespaces) +# 2. Disable provisioning network (spoke VMs not on provisioning network) +# 3. 
Remove leftover provisioning fields that cause ironic to hang + +- name: Patch Provisioning CR - enable watchAllNamespaces and disable provisioning network + shell: | + oc patch provisioning provisioning-configuration --type=merge \ + -p '{"spec":{"provisioningNetwork":"Disabled","watchAllNamespaces":true}}' + register: patch_result + changed_when: "'patched' in patch_result.stdout" + +- name: Remove leftover provisioning fields from Provisioning CR + shell: | + oc patch provisioning provisioning-configuration --type=json \ + -p '[ + {"op":"remove","path":"/spec/provisioningIP"}, + {"op":"remove","path":"/spec/provisioningDHCPRange"}, + {"op":"remove","path":"/spec/provisioningNetworkCIDR"}, + {"op":"remove","path":"/spec/provisioningInterface"} + ]' 2>&1 + register: remove_result + changed_when: "'patched' in remove_result.stdout" + failed_when: >- + remove_result.rc != 0 and + 'does not exist' not in remove_result.stderr and + 'does not exist' not in remove_result.stdout + +- name: Wait for metal3 pod to stabilize after provisioning change + shell: | + oc get pods -n openshift-machine-api \ + -l baremetal.openshift.io/cluster-baremetal-operator=metal3-state \ + --no-headers 2>/dev/null | grep -v Terminating | grep -q Running + register: metal3_pod + until: metal3_pod.rc == 0 + retries: "{{ (metal3_stabilize_timeout / 15) | int }}" + delay: 15 + +- name: Display metal3 pod status + shell: | + oc get pods -n openshift-machine-api \ + -l baremetal.openshift.io/cluster-baremetal-operator=metal3-state + register: metal3_status + changed_when: false + +- name: Show metal3 pod + debug: + msg: "{{ metal3_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml new file mode 100644 index 0000000..caeafae --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/install-operator.yml @@ -0,0 +1,158 @@ +--- +# Install ACM or MCE operator with auto-detected channel + +- name: Set operator configuration + set_fact: + op_config: "{{ operator_config[hub_operator] }}" + +- name: Create operator namespace + shell: | + oc create namespace {{ op_config.namespace }} 2>/dev/null || echo "Namespace already exists" + register: ns_result + changed_when: "'created' in ns_result.stdout" + +- name: Create OperatorGroup + shell: | + oc apply -f - <<'EOF' + apiVersion: operators.coreos.com/v1 + kind: OperatorGroup + metadata: + name: {{ op_config.subscription_name }} + namespace: {{ op_config.namespace }} + spec: + targetNamespaces: + - {{ op_config.namespace }} + EOF + register: og_result + changed_when: "'created' in og_result.stdout" + +- name: Determine operator channel + block: + - name: Auto-detect channel from packagemanifest + shell: | + oc get packagemanifest {{ op_config.package_name }} \ + -o jsonpath='{.status.defaultChannel}' + register: detected_channel + changed_when: false + + - name: Set operator channel fact + set_fact: + operator_channel: "{{ detected_channel.stdout }}" + when: (hub_operator == 'acm' and acm_channel == 'auto') or + (hub_operator == 'mce' and mce_channel == 'auto') + +- name: Use user-specified channel + set_fact: + operator_channel: "{{ acm_channel if hub_operator == 'acm' else mce_channel }}" + when: (hub_operator == 'acm' and acm_channel != 'auto') or + (hub_operator == 'mce' and mce_channel != 'auto') + +- name: Display operator channel + debug: + msg: "Installing {{ hub_operator | upper }} with channel: {{ operator_channel 
}}" + +- name: Create operator Subscription + template: + src: operator-subscription.yml.j2 + dest: /tmp/operator-subscription.yml + mode: '0644' + +- name: Apply operator Subscription + shell: | + oc apply -f /tmp/operator-subscription.yml + register: sub_result + changed_when: "'created' in sub_result.stdout" + +- block: + - name: Wait for operator CSV to succeed + shell: | + oc get csv -n {{ op_config.namespace }} --no-headers 2>/dev/null \ + | grep {{ op_config.package_name }} \ + | grep -q Succeeded + register: csv_result + until: csv_result.rc == 0 + retries: "{{ (acm_csv_timeout / 15) | int }}" + delay: 15 + rescue: + - name: Collect CSV timeout diagnostics + shell: | + echo "=== CSV Status ===" + oc get csv -n {{ op_config.namespace }} 2>/dev/null + echo "" + echo "=== Recent Events ===" + oc get events -n {{ op_config.namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: csv_diag + changed_when: false + failed_when: false + + - name: Display CSV timeout diagnostics + debug: + msg: "{{ csv_diag.stdout }}" + + - name: Fail after CSV timeout + fail: + msg: "Operator CSV did not reach Succeeded state within timeout" + +- name: Display operator install result + shell: | + oc get csv -n {{ op_config.namespace }} --no-headers \ + | grep {{ op_config.package_name }} + register: csv_info + changed_when: false + +- name: Show installed operator + debug: + msg: "{{ csv_info.stdout }}" + +# Create MultiClusterHub (for ACM) or MultiClusterEngine (for MCE) +- name: Create MultiClusterHub CR + template: + src: multiclusterhub.yml.j2 + dest: /tmp/multiclusterhub.yml + mode: '0644' + when: hub_operator == 'acm' + +- name: Apply MultiClusterHub CR + shell: | + oc apply -f /tmp/multiclusterhub.yml + register: mch_result + changed_when: "'created' in mch_result.stdout" + when: hub_operator == 'acm' + +- block: + - name: Wait for MultiClusterHub to reach Running phase + shell: | + oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} \ + -o jsonpath='{.status.phase}' + register: mch_phase + until: mch_phase.stdout == 'Running' + retries: "{{ (multiclusterhub_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect MCH timeout diagnostics + shell: | + echo "=== MultiClusterHub Status ===" + oc get multiclusterhub multiclusterhub -n {{ op_config.namespace }} -o yaml 2>/dev/null | grep -A 50 'status:' + echo "" + echo "=== Non-Running Pods ===" + oc get pods -n {{ op_config.namespace }} --no-headers 2>/dev/null | grep -v Running + echo "" + echo "=== Recent Events ===" + oc get events -n {{ op_config.namespace }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: mch_diag + changed_when: false + failed_when: false + + - name: Display MCH timeout diagnostics + debug: + msg: "{{ mch_diag.stdout }}" + + - name: Fail after MCH timeout + fail: + msg: "MultiClusterHub did not reach Running phase within timeout" + when: hub_operator == 'acm' + +- name: Display MultiClusterHub status + debug: + msg: "MultiClusterHub phase: {{ mch_phase.stdout }}" + when: hub_operator == 'acm' diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml new file mode 100644 index 0000000..9299e93 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/main.yml @@ -0,0 +1,23 @@ +--- +# Install ACM/MCE + assisted service + enable TNF support on hub cluster + +- block: + - name: Validate hub cluster health + include_tasks: validate.yml + + - name: Provision storage for 
assisted service + include_tasks: storage.yml + + - name: Install {{ hub_operator | upper }} operator + include_tasks: install-operator.yml + + - name: Create AgentServiceConfig + include_tasks: agent-service-config.yml + + - name: Enable TNF cluster support in assisted service + include_tasks: enable-tnf.yml + + - name: Enable BMO watch all namespaces and disable provisioning network + include_tasks: enable-watch-all-namespaces.yml + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml new file mode 100644 index 0000000..47c715d --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/storage.yml @@ -0,0 +1,62 @@ +--- +# Provision hostPath storage for assisted service +# Requires chmod 777 (non-root containers) + SELinux context fix on both nodes + +- name: Create StorageClass and PersistentVolumes for assisted service + shell: | + oc apply -f - <<'EOF' + apiVersion: v1 + kind: PersistentVolume + metadata: + name: assisted-pv-images + spec: + capacity: + storage: {{ assisted_images_size }} + accessModes: [ReadWriteOnce] + hostPath: + path: {{ assisted_images_path }} + storageClassName: {{ assisted_storage_class }} + --- + apiVersion: v1 + kind: PersistentVolume + metadata: + name: assisted-pv-db + spec: + capacity: + storage: {{ assisted_db_size }} + accessModes: [ReadWriteOnce] + hostPath: + path: {{ assisted_db_path }} + storageClassName: {{ assisted_storage_class }} + --- + apiVersion: storage.k8s.io/v1 + kind: StorageClass + metadata: + name: {{ assisted_storage_class }} + provisioner: kubernetes.io/no-provisioner + volumeBindingMode: WaitForFirstConsumer + EOF + register: storage_result + changed_when: "'created' in storage_result.stdout" + +- name: Get hub cluster node names + shell: | + oc get nodes --no-headers -o custom-columns=NAME:.metadata.name + register: hub_nodes + changed_when: false + +- name: Fix hostPath permissions and SELinux context on each hub node + shell: | + oc debug node/{{ item }} -- chroot /host bash -c " + mkdir -p {{ assisted_images_path }} {{ assisted_db_path }} + rm -rf {{ assisted_images_path }}/* {{ assisted_db_path }}/* + chmod 777 {{ assisted_images_path }} {{ assisted_db_path }} + chcon -Rt container_file_t {{ assisted_images_path }} {{ assisted_db_path }} + " + loop: "{{ hub_nodes.stdout_lines }}" + register: selinux_fix + changed_when: true + +- name: Display storage setup result + debug: + msg: "Storage provisioned: hostPath PVs with permissions and SELinux fix on {{ hub_nodes.stdout_lines | length }} nodes" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml new file mode 100644 index 0000000..a5c103a --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/tasks/validate.yml @@ -0,0 +1,36 @@ +--- +# Validate hub cluster is healthy before proceeding + +- name: Check hub cluster nodes are Ready + shell: | + oc get nodes --no-headers | awk '{print $2}' | sort -u + register: node_statuses + changed_when: false + failed_when: "'NotReady' in node_statuses.stdout" + +- name: Check hub cluster node count + shell: | + oc get nodes --no-headers | wc -l + register: node_count + changed_when: false + failed_when: node_count.stdout | int < 2 + +- name: Check for degraded cluster operators + shell: | + oc get co -o json | python3 -c " + import json, sys + cos = 
json.load(sys.stdin)['items'] + degraded = [c['metadata']['name'] for c in cos + if any(cond['type'] == 'Degraded' and cond['status'] == 'True' + for cond in c['status']['conditions'])] + if degraded: + print('Degraded operators: ' + ', '.join(degraded)) + sys.exit(1) + print('All cluster operators healthy') + " + register: co_check + changed_when: false + +- name: Display hub cluster status + debug: + msg: "Hub cluster healthy: {{ node_count.stdout | trim }} nodes Ready, {{ co_check.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 new file mode 100644 index 0000000..dc97a08 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/agentserviceconfig.yml.j2 @@ -0,0 +1,22 @@ +apiVersion: agent-install.openshift.io/v1beta1 +kind: AgentServiceConfig +metadata: + name: agent +spec: + databaseStorage: + storageClassName: {{ assisted_storage_class }} + accessModes: [ReadWriteOnce] + resources: + requests: + storage: {{ assisted_db_size }} + filesystemStorage: + storageClassName: {{ assisted_storage_class }} + accessModes: [ReadWriteOnce] + resources: + requests: + storage: {{ assisted_images_size }} + osImages: + - cpuArchitecture: x86_64 + openshiftVersion: "{{ hub_ocp_version }}" + url: "{{ rhcos_iso_url }}" + version: "{{ rhcos_version }}" diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 new file mode 100644 index 0000000..fce239b --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/multiclusterhub.yml.j2 @@ -0,0 +1,7 @@ +apiVersion: operator.open-cluster-management.io/v1 +kind: MultiClusterHub +metadata: + name: multiclusterhub + namespace: {{ op_config.namespace }} +spec: + availabilityConfig: Basic diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 new file mode 100644 index 0000000..6bec2ad --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/templates/operator-subscription.yml.j2 @@ -0,0 +1,11 @@ +apiVersion: operators.coreos.com/v1alpha1 +kind: Subscription +metadata: + name: {{ op_config.subscription_name }} + namespace: {{ op_config.namespace }} +spec: + channel: {{ operator_channel }} + installPlanApproval: Automatic + name: {{ op_config.subscription_name }} + source: {{ op_config.source }} + sourceNamespace: openshift-marketplace diff --git a/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml new file mode 100644 index 0000000..b4679a9 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/acm-install/vars/main.yml @@ -0,0 +1,20 @@ +--- +# Role-internal variables (not user-overridable) + +acm_namespace: open-cluster-management +mce_namespace: multicluster-engine + +operator_config: + acm: + namespace: "{{ acm_namespace }}" + package_name: advanced-cluster-management + subscription_name: advanced-cluster-management + source: redhat-operators + mce: + namespace: "{{ mce_namespace }}" + package_name: multicluster-engine + subscription_name: multicluster-engine + source: redhat-operators + +# The MCE namespace is always multicluster-engine regardless of hub_operator 
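# (installing ACM pulls in MCE as a dependency, so the assisted-service
# workloads run in the multicluster-engine namespace in both cases)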
+assisted_service_namespace: multicluster-engine diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md b/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md new file mode 100644 index 0000000..067b3d8 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/README.md @@ -0,0 +1,132 @@ +# assisted-spoke Role + +Deploys a spoke TNF (Two-Node with Fencing) cluster on a hub via the assisted installer and BareMetalHost resources. + +## Description + +This role creates and installs a spoke TNF cluster on an existing hub that has ACM/MCE and the assisted service configured (via the `acm-install` role). It: + +1. Optionally cleans up existing spoke resources (when `force_cleanup=true`) +2. Creates a dedicated libvirt network for the spoke cluster +3. Creates spoke VMs with the specified resources +4. Verifies sushy-tools (Redfish BMC simulator) is running +5. Creates cluster resources on the hub (ClusterDeployment, AgentClusterInstall, InfraEnv, ClusterImageSet) +6. Creates BareMetalHost resources to trigger agent-based installation +7. Monitors agent registration, cluster installation, and agent completion +8. Retrieves spoke cluster credentials (kubeconfig, admin password) + +## Requirements + +- Hub cluster with ACM/MCE and assisted service configured (run `acm-install` role first) +- Hub kubeconfig accessible at `~/auth/kubeconfig` +- libvirt/KVM available on the hypervisor +- sushy-tools installed for Redfish BMC simulation +- `oc` and `virsh` CLIs available on the hypervisor + +## Role Variables + +### Spoke Cluster Identity + +- `spoke_cluster_name`: Cluster name, must be DNS-safe (default: `"spoke-tnf"`) +- `spoke_base_domain`: Base domain for the spoke cluster (default: `"example.com"`) +- `spoke_release_image`: Release image - `"auto"` uses the hub release image (default: `"auto"`) + +### VM Specifications + +- `spoke_vm_memory`: Memory per node in MB (default: `32768`) +- `spoke_vm_vcpus`: CPU cores per node (default: `4`) +- `spoke_vm_disk_size`: Disk size per node in GB (default: `120`) +- `spoke_ctlplanes`: Number of control plane nodes, must be 2 for TNF (default: `2`) + +### Network Configuration + +- `spoke_network_cidr`: Spoke cluster network CIDR (default: `"192.168.125.0/24"`) +- `spoke_api_vip`: API VIP address (default: `"192.168.125.5"`) +- `spoke_ingress_vip`: Ingress VIP address (default: `"192.168.125.10"`) +- `spoke_cluster_network_cidr`: Pod network CIDR (default: `"10.132.0.0/14"`) +- `spoke_service_network_cidr`: Service network CIDR (default: `"172.31.0.0/16"`) +- `hub_network_cidr`: Hub network CIDR for cross-bridge nftables rules (default: `"192.168.111.0/24"`) + +### BMC / sushy-tools + +- `spoke_bmc_user`: BMC username (default: `"admin"`) +- `spoke_bmc_password`: BMC password (default: `"password"`) +- `spoke_ksushy_ip`: sushy-tools listen IP (default: `"192.168.111.1"`) +- `spoke_ksushy_port`: sushy-tools port (default: `8000`) + +### Deployment Options + +- `force_cleanup`: Remove existing spoke resources before deployment (default: `false`) + +### Timeout Variables + +- `spoke_install_timeout`: Cluster installation timeout in seconds (default: `3600`) +- `spoke_agent_register_timeout`: Agent registration timeout (default: `900`) +- `spoke_credentials_timeout`: Credential retrieval timeout (default: `1800`) + +### Computed Variables (vars/main.yml) + +These are derived automatically and should not be overridden: + +- `spoke_network_gateway`: First IP in spoke CIDR +- `spoke_dhcp_start` / 
`spoke_dhcp_end`: DHCP range within spoke CIDR +- `spoke_network_name`: Libvirt network name (matches `spoke_cluster_name`) +- `spoke_vm_image_dir`: VM disk image directory (`/var/lib/libvirt/images`) +- `spoke_auth_dir`: Credential output directory (`~//auth`) + +## Task Flow + +1. **cleanup.yml** - Removes existing spoke namespace, VMs, network, credentials (when `force_cleanup=true`) +2. **create-spoke-network.yml** - Creates dedicated libvirt network with DHCP for spoke VMs +3. **create-spoke-vms.yml** - Creates spoke VM disk images and defines libvirt domains +4. **setup-ksushy.yml** - Verifies sushy-tools is running for Redfish BMC +5. **create-cluster-resources.yml** - Creates ClusterDeployment, AgentClusterInstall, InfraEnv, ClusterImageSet on hub +6. **create-bmh.yml** - Creates BareMetalHost resources that trigger spoke installation +7. **wait-for-install.yml** - Monitors agent registration, installation progress, and agent completion +8. **retrieve-credentials.yml** - Extracts kubeconfig and admin password, configures DNS, verifies access + +## Usage + +This role is not called directly. It is invoked via `assisted-install.yml`: + +```bash +make deploy fencing-assisted +# or +ansible-playbook assisted-install.yml -i inventory.ini +``` + +### Configuration + +Copy and customize the variables template: + +```bash +cp vars/assisted.yml.template vars/assisted.yml +# Edit vars/assisted.yml with desired spoke configuration +``` + +### Accessing the Spoke Cluster + +After deployment: + +```bash +source proxy.env +KUBECONFIG=~/spoke-tnf/auth/kubeconfig oc get nodes +``` + +### Redeployment + +To redeploy with cleanup of existing resources: + +```bash +ansible-playbook assisted-install.yml -i inventory.ini -e "force_cleanup=true" +``` + +## Troubleshooting + +- Check spoke VMs: `sudo virsh list --all | grep spoke` +- Check agents: `oc get agents -n ` +- Check BMH status: `oc get bmh -n ` +- Check installation progress: `oc get agentclusterinstall -n -o yaml` +- Check spoke events: `oc get events -n --sort-by='.lastTimestamp'` +- Check sushy-tools: `sudo systemctl status ksushy` +- Check spoke network: `sudo virsh net-list | grep ` \ No newline at end of file diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml new file mode 100644 index 0000000..bdcc277 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/defaults/main.yml @@ -0,0 +1,39 @@ +--- +# Default variables for assisted-spoke role + +# Spoke cluster identity +spoke_cluster_name: spoke-tnf +spoke_base_domain: example.com + +# Spoke OCP version - "auto" uses hub release image +spoke_release_image: "auto" + +# Spoke VM specifications +spoke_vm_memory: 32768 +spoke_vm_vcpus: 4 +spoke_vm_disk_size: 120 +spoke_ctlplanes: 2 + +# Spoke network +spoke_network_cidr: "192.168.125.0/24" +spoke_api_vip: "192.168.125.5" +spoke_ingress_vip: "192.168.125.10" +spoke_cluster_network_cidr: "10.132.0.0/14" +spoke_service_network_cidr: "172.31.0.0/16" + +# BMC / sushy-tools +spoke_bmc_user: admin +spoke_bmc_password: password +spoke_ksushy_ip: "192.168.111.1" +spoke_ksushy_port: 8000 + +# Deployment options +force_cleanup: false + +# Timeouts (seconds) +spoke_install_timeout: 3600 +spoke_agent_register_timeout: 900 +spoke_credentials_timeout: 1800 + +# Hub network CIDR (for cross-bridge nftables rules) +hub_network_cidr: "192.168.111.0/24" diff --git 
a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml new file mode 100644 index 0000000..9ff463d --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/cleanup.yml @@ -0,0 +1,55 @@ +--- +# Remove existing spoke resources for re-deployment + +- name: Delete spoke namespace (removes all cluster resources) + shell: | + oc delete namespace {{ spoke_cluster_name }} --ignore-not-found --timeout=120s + register: ns_delete + changed_when: "'deleted' in ns_delete.stdout" + ignore_errors: true + +- name: Delete ClusterImageSet + shell: | + OCP_VER=$(oc get clusterversion version -o jsonpath='{.status.desired.version}' | cut -d. -f1-2) + oc delete clusterimageset "${OCP_VER}.0" --ignore-not-found + changed_when: false + ignore_errors: true + +- name: Destroy spoke VMs + shell: | + for i in $(seq 0 {{ spoke_ctlplanes - 1 }}); do + sudo virsh destroy {{ spoke_cluster_name }}-master-${i} 2>/dev/null || true + sudo virsh undefine {{ spoke_cluster_name }}-master-${i} --remove-all-storage 2>/dev/null || true + done + changed_when: true + ignore_errors: true + +- name: Remove spoke VM disk images + file: + path: "{{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2" + state: absent + loop: "{{ range(spoke_ctlplanes) | list }}" + become: true + +- name: Remove spoke libvirt network + shell: | + sudo virsh net-destroy {{ spoke_network_name }} 2>/dev/null || true + sudo virsh net-undefine {{ spoke_network_name }} 2>/dev/null || true + changed_when: true + ignore_errors: true + +- name: Remove spoke credential directory + file: + path: "{{ spoke_auth_dir }}" + state: absent + +- name: Remove spoke /etc/hosts entry + lineinfile: + path: /etc/hosts + regexp: "api.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + state: absent + become: true + +- name: Display cleanup result + debug: + msg: "Spoke cluster '{{ spoke_cluster_name }}' resources cleaned up" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml new file mode 100644 index 0000000..42cfffc --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-bmh.yml @@ -0,0 +1,63 @@ +--- +# Create BareMetalHost resources with BMC secrets and fencing credentials for each spoke node + +- name: Create BMH resources for each spoke node + shell: | + oc apply -f - <<'EOF' + --- + apiVersion: v1 + data: + password: {{ spoke_bmc_password | b64encode }} + username: {{ spoke_bmc_user | b64encode }} + kind: Secret + metadata: + name: {{ item.name }}-bmc-secret + namespace: {{ spoke_cluster_name }} + type: Opaque + --- + apiVersion: metal3.io/v1alpha1 + kind: BareMetalHost + metadata: + name: {{ item.name }}-bmh + namespace: {{ spoke_cluster_name }} + annotations: + bmac.agent-install.openshift.io/hostname: "{{ item.name }}" + bmac.agent-install.openshift.io/role: "master" + bmac.agent-install.openshift.io/fencing-credentials-secret-name: "{{ item.name }}-fencing-credentials" + labels: + infraenvs.agent-install.openshift.io: "{{ spoke_cluster_name }}" + spec: + architecture: x86_64 + bmc: + address: redfish-virtualmedia+https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/{{ item.uuid }} + credentialsName: {{ item.name }}-bmc-secret + disableCertificateVerification: true + bootMACAddress: {{ item.mac }} + automatedCleaningMode: disabled + online: true + --- + 
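  # Fencing credentials for this node: referenced by the
  # bmac.agent-install.openshift.io/fencing-credentials-secret-name annotation
  # on the BareMetalHost above and used for Redfish-based fencing on the spoke
  # TNF control-plane nodes.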
apiVersion: v1 + stringData: + address: https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/{{ item.uuid }} + certificateVerification: Disabled + username: {{ spoke_bmc_user }} + password: {{ spoke_bmc_password }} + kind: Secret + metadata: + name: {{ item.name }}-fencing-credentials + namespace: {{ spoke_cluster_name }} + type: Opaque + EOF + loop: "{{ spoke_vms }}" + register: bmh_result + changed_when: "'created' in bmh_result.stdout" + +- name: Display BMH status + shell: | + oc get bmh -n {{ spoke_cluster_name }} + register: bmh_status + changed_when: false + +- name: Show BMH resources + debug: + msg: "{{ bmh_status.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml new file mode 100644 index 0000000..285b20c --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-cluster-resources.yml @@ -0,0 +1,87 @@ +--- +# Create spoke cluster resources on hub: namespace, secrets, ClusterDeployment, +# AgentClusterInstall, InfraEnv, ClusterImageSet +# hub_release_image, hub_ocp_version, effective_release_image, effective_ocp_version +# are set by playbook pre_tasks + +- name: Get SSH public key + shell: | + for key in ~/.ssh/id_ed25519.pub ~/.ssh/id_rsa.pub ~/.ssh/id_ecdsa.pub; do + [ -f "$key" ] && cat "$key" && exit 0 + done + echo "ERROR: No SSH public key found" >&2 && exit 1 + register: ssh_pub_key + changed_when: false + +- name: Create spoke namespace + shell: | + oc create namespace {{ spoke_cluster_name }} 2>/dev/null || echo "Namespace already exists" + register: ns_result + changed_when: "'created' in ns_result.stdout" + +- name: Create spoke pull secret + shell: | + oc get secret {{ spoke_cluster_name }}-pull-secret -n {{ spoke_cluster_name }} 2>/dev/null \ + && echo "Already exists" \ + || oc create secret generic {{ spoke_cluster_name }}-pull-secret \ + -n {{ spoke_cluster_name }} \ + --from-file=.dockerconfigjson={{ pull_secret_path }} \ + --type=kubernetes.io/dockerconfigjson + register: ps_result + changed_when: "'created' in ps_result.stdout" + +- name: Create ClusterImageSet + template: + src: clusterimageset.yml.j2 + dest: /tmp/clusterimageset.yml + mode: '0644' + +- name: Apply ClusterImageSet + shell: | + oc apply -f /tmp/clusterimageset.yml + register: cis_result + changed_when: "'created' in cis_result.stdout" + +- name: Create ClusterDeployment + template: + src: clusterdeployment.yml.j2 + dest: /tmp/clusterdeployment.yml + mode: '0644' + +- name: Apply ClusterDeployment + shell: | + oc apply -f /tmp/clusterdeployment.yml + register: cd_result + changed_when: "'created' in cd_result.stdout" + +- name: Create AgentClusterInstall + template: + src: agentclusterinstall.yml.j2 + dest: /tmp/agentclusterinstall.yml + mode: '0644' + +- name: Apply AgentClusterInstall + shell: | + oc apply -f /tmp/agentclusterinstall.yml + register: aci_result + changed_when: "'created' in aci_result.stdout" + +- name: Create InfraEnv + template: + src: infraenv.yml.j2 + dest: /tmp/infraenv.yml + mode: '0644' + +- name: Apply InfraEnv + shell: | + oc apply -f /tmp/infraenv.yml + register: ie_result + changed_when: "'created' in ie_result.stdout" + +- name: Display cluster resources status + debug: + msg: >- + Spoke cluster resources created: ClusterImageSet={{ effective_ocp_version }}.0, + ClusterDeployment={{ spoke_cluster_name }}, + AgentClusterInstall={{ spoke_cluster_name }}, + 
InfraEnv={{ spoke_cluster_name }} diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml new file mode 100644 index 0000000..15347a6 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-network.yml @@ -0,0 +1,48 @@ +--- +# Create dedicated libvirt network for spoke cluster with DNS for api/apps VIPs +# Then add cross-bridge nftables FORWARD rules for spoke<->hub connectivity + +- name: Check if spoke network already exists + shell: | + sudo virsh net-info {{ spoke_network_name }} 2>/dev/null && echo "EXISTS" || echo "NOT_FOUND" + register: net_check + changed_when: false + +- name: Create spoke libvirt network definition + template: + src: spoke-network.xml.j2 + dest: /tmp/spoke-network.xml + mode: '0644' + when: "'NOT_FOUND' in net_check.stdout" + +- name: Define spoke libvirt network + shell: | + sudo virsh net-define /tmp/spoke-network.xml + when: "'NOT_FOUND' in net_check.stdout" + +- name: Start spoke libvirt network + shell: | + sudo virsh net-start {{ spoke_network_name }} 2>/dev/null || true + changed_when: true + +- name: Set spoke libvirt network to autostart + shell: | + sudo virsh net-autostart {{ spoke_network_name }} + changed_when: true + +- name: Add cross-bridge nftables FORWARD rules for spoke<->hub connectivity + shell: | + # Check if rules already exist + if sudo nft list chain ip filter FORWARD 2>/dev/null | grep -q "{{ spoke_network_cidr }}.*{{ hub_network_cidr }}"; then + echo "Rules already exist" + else + sudo nft insert rule ip filter FORWARD ip saddr {{ spoke_network_cidr }} ip daddr {{ hub_network_cidr }} accept + sudo nft insert rule ip filter FORWARD ip saddr {{ hub_network_cidr }} ip daddr {{ spoke_network_cidr }} accept + echo "Rules added" + fi + register: nft_result + changed_when: "'added' in nft_result.stdout" + +- name: Display network setup result + debug: + msg: "Spoke network '{{ spoke_network_name }}' on {{ spoke_network_cidr }} ready, cross-bridge rules: {{ nft_result.stdout | trim }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml new file mode 100644 index 0000000..e59a541 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/create-spoke-vms.yml @@ -0,0 +1,75 @@ +--- +# Create empty libvirt VMs for spoke cluster, capture UUID/MAC + +- name: Create spoke VM disks + shell: | + sudo qemu-img create -f qcow2 \ + {{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2 \ + {{ spoke_vm_disk_size }}G + args: + creates: "{{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item }}.qcow2" + loop: "{{ range(spoke_ctlplanes) | list }}" + +- name: Check if spoke VMs already exist + shell: | + sudo virsh dominfo {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null && echo "EXISTS" || echo "NOT_FOUND" + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_check + changed_when: false + +- name: Create spoke VMs (defined but not started) + shell: | + sudo virt-install \ + --name {{ spoke_cluster_name }}-master-{{ item.item }} \ + --ram {{ spoke_vm_memory }} \ + --vcpus {{ spoke_vm_vcpus }} \ + --disk {{ spoke_vm_image_dir }}/{{ spoke_cluster_name }}-master-{{ item.item }}.qcow2,bus=virtio \ + --network network={{ spoke_network_name }},model=virtio \ + --os-variant rhel9.0 \ + --graphics none \ + 
--noautoconsole \ + --boot hd,network \ + --noreboot \ + --import + loop: "{{ vm_check.results }}" + when: "'NOT_FOUND' in item.stdout" + +- name: Ensure spoke VMs are shut off + shell: | + sudo virsh destroy {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null || true + loop: "{{ range(spoke_ctlplanes) | list }}" + changed_when: false + failed_when: false + +- name: Capture spoke VM UUIDs + shell: | + sudo virsh domuuid {{ spoke_cluster_name }}-master-{{ item }} + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_uuids + changed_when: false + +- name: Capture spoke VM MAC addresses + shell: | + sudo virsh domiflist {{ spoke_cluster_name }}-master-{{ item }} \ + | grep {{ spoke_network_name }} | awk '{print $5}' + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_macs + changed_when: false + +- name: Build spoke VM info list + set_fact: + spoke_vms: >- + {{ spoke_vms | default([]) + [ + { + 'index': item.item, + 'name': spoke_cluster_name ~ '-master-' ~ item.item, + 'uuid': vm_uuids.results[item.item].stdout | trim, + 'mac': vm_macs.results[item.item].stdout | trim + } + ] }} + loop: "{{ vm_uuids.results }}" + +- name: Display spoke VM info + debug: + msg: "VM {{ item.name }}: UUID={{ item.uuid }}, MAC={{ item.mac }}" + loop: "{{ spoke_vms }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml new file mode 100644 index 0000000..43a420c --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/main.yml @@ -0,0 +1,30 @@ +--- +# Deploy spoke TNF cluster via assisted installer + BMH + +- block: + - name: Cleanup existing spoke resources + include_tasks: cleanup.yml + when: force_cleanup | bool + + - name: Create dedicated libvirt network for spoke cluster + include_tasks: create-spoke-network.yml + + - name: Create spoke VMs + include_tasks: create-spoke-vms.yml + + - name: Verify sushy-tools is running + include_tasks: setup-ksushy.yml + + - name: Create spoke cluster resources on hub + include_tasks: create-cluster-resources.yml + + - name: Create BareMetalHost resources + include_tasks: create-bmh.yml + + - name: Wait for spoke cluster installation to complete + include_tasks: wait-for-install.yml + + - name: Retrieve spoke cluster credentials + include_tasks: retrieve-credentials.yml + environment: + KUBECONFIG: "{{ hub_kubeconfig }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml new file mode 100644 index 0000000..9b72882 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/retrieve-credentials.yml @@ -0,0 +1,161 @@ +--- +# Extract spoke cluster credentials and set up DNS for hypervisor access + +- name: Create spoke credential directory + file: + path: "{{ spoke_auth_dir }}" + state: directory + mode: '0700' + +- block: + - name: Wait for admin-kubeconfig secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: kubeconfig_secret + until: kubeconfig_secret.rc == 0 + retries: 10 + delay: 15 + rescue: + - name: Collect kubeconfig secret timeout diagnostics + shell: | + echo "=== Secrets in namespace ===" + oc get secrets -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== AgentClusterInstall State ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} 
\ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null + register: kube_secret_diag + changed_when: false + failed_when: false + + - name: Display kubeconfig secret timeout diagnostics + debug: + msg: "{{ kube_secret_diag.stdout }}" + + - name: Fail after kubeconfig secret timeout + fail: + msg: "admin-kubeconfig secret not found within timeout" + +- name: Extract spoke kubeconfig + shell: | + oc get secret {{ spoke_cluster_name }}-admin-kubeconfig \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.data.kubeconfig}' | base64 -d + register: spoke_kubeconfig + changed_when: false + +- name: Save spoke kubeconfig + copy: + content: "{{ spoke_kubeconfig.stdout }}" + dest: "{{ spoke_auth_dir }}/kubeconfig" + mode: '0600' + +- block: + - name: Wait for admin-password secret + shell: | + oc get secret {{ spoke_cluster_name }}-admin-password \ + -n {{ spoke_cluster_name }} -o name 2>/dev/null + register: password_secret + until: password_secret.rc == 0 + retries: "{{ (spoke_credentials_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect password secret timeout diagnostics + shell: | + echo "=== Secrets in namespace ===" + oc get secrets -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== AgentClusterInstall State ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null + register: pwd_secret_diag + changed_when: false + failed_when: false + + - name: Display password secret timeout diagnostics + debug: + msg: "{{ pwd_secret_diag.stdout }}" + + - name: Fail after password secret timeout + fail: + msg: "admin-password secret not found within timeout" + +- name: Extract spoke admin password + shell: | + oc get secret {{ spoke_cluster_name }}-admin-password \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.data.password}' | base64 -d + register: spoke_password + changed_when: false + +- name: Save spoke admin password + copy: + content: "{{ spoke_password.stdout }}" + dest: "{{ spoke_auth_dir }}/kubeadmin-password" + mode: '0600' + +- name: Add spoke API DNS to hypervisor /etc/hosts + lineinfile: + path: /etc/hosts + regexp: "api.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + line: "{{ spoke_api_vip }} api.{{ spoke_cluster_name }}.{{ spoke_base_domain }} api-int.{{ spoke_cluster_name }}.{{ spoke_base_domain }}" + state: present + become: true + +- name: Ensure spoke VMs are running + shell: | + STATE=$(virsh domstate {{ spoke_cluster_name }}-master-{{ item }} 2>/dev/null) + if [ "$STATE" != "running" ]; then + virsh start {{ spoke_cluster_name }}-master-{{ item }} + echo "STARTED" + else + echo "ALREADY_RUNNING" + fi + loop: "{{ range(spoke_ctlplanes) | list }}" + register: vm_start_result + changed_when: "'STARTED' in vm_start_result.stdout" + become: true + +- name: Wait for spoke VMs to boot + pause: + seconds: 120 + when: vm_start_result.results | selectattr('stdout', 'search', 'STARTED') | list | length > 0 + +- block: + - name: Verify spoke cluster access + shell: | + KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes + register: spoke_nodes + changed_when: false + retries: 20 + delay: 30 + until: spoke_nodes.rc == 0 + rescue: + - name: Collect spoke access timeout diagnostics + shell: | + echo "=== VM Status ===" + sudo virsh list --all 2>/dev/null | grep {{ spoke_cluster_name }} || echo "No spoke VMs found" + echo "" + echo "=== Last oc error ===" + KUBECONFIG={{ spoke_auth_dir }}/kubeconfig oc get nodes 2>&1 || true + register: spoke_access_diag + changed_when: false + 
failed_when: false + + - name: Display spoke access timeout diagnostics + debug: + msg: "{{ spoke_access_diag.stdout }}" + + - name: Fail after spoke access timeout + fail: + msg: "Could not access spoke cluster within timeout" + +- name: Display spoke cluster nodes + debug: + msg: "{{ spoke_nodes.stdout }}" + +- name: Set spoke kubeconfig path as fact for post-deployment tasks + set_fact: + spoke_kubeconfig_path: "{{ spoke_auth_dir }}/kubeconfig" + spoke_kubeadmin_password_path: "{{ spoke_auth_dir }}/kubeadmin-password" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml new file mode 100644 index 0000000..49ce70a --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/setup-ksushy.yml @@ -0,0 +1,34 @@ +--- +# Verify sushy-tools is running on the hypervisor (should already exist from dev-scripts) + +- name: Check if sushy-tools container is running + shell: | + sudo podman ps --format '{{ '{{' }}.Names{{ '}}' }}' | grep -q sushy-tools + register: sushy_check + changed_when: false + failed_when: false + +- name: Fail if sushy-tools is not running + fail: + msg: >- + sushy-tools container is not running. It should be started by dev-scripts. + Ensure the hub was deployed with 'make deploy fencing-ipi' before running + the assisted installer. + when: sushy_check.rc != 0 + +- name: Verify spoke VMs are visible via sushy-tools + shell: | + curl -sk https://{{ spoke_ksushy_ip }}:{{ spoke_ksushy_port }}/redfish/v1/Systems/ \ + -u {{ spoke_bmc_user }}:{{ spoke_bmc_password }} \ + | python3 -c "import json,sys; d=json.load(sys.stdin); print(d['Members@odata.count'])" + register: sushy_systems + changed_when: false + +- name: Verify expected number of systems visible + assert: + that: + - sushy_systems.stdout | int >= (spoke_ctlplanes + 2) + fail_msg: >- + Expected at least {{ spoke_ctlplanes + 2 }} systems in sushy-tools + ({{ spoke_ctlplanes }} spoke + 2 hub), but found {{ sushy_systems.stdout }}. 
+ success_msg: "sushy-tools has {{ sushy_systems.stdout }} systems visible ({{ spoke_ctlplanes }} spoke + 2 hub)" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml new file mode 100644 index 0000000..a5bae64 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/tasks/wait-for-install.yml @@ -0,0 +1,186 @@ +--- +# Monitor BMH provisioning, agent registration, and installation progress + +- block: + - name: Wait for agents to register + shell: | + oc get agents -n {{ spoke_cluster_name }} --no-headers 2>/dev/null | wc -l + register: agent_count + until: agent_count.stdout | int >= spoke_ctlplanes + retries: "{{ (spoke_agent_register_timeout / 30) | int }}" + delay: 30 + rescue: + - name: Collect agent registration timeout diagnostics + shell: | + echo "=== Agents ({{ agent_count.stdout | default('0') }} / {{ spoke_ctlplanes }} registered) ===" + oc get agents -n {{ spoke_cluster_name }} 2>/dev/null || echo "No agents found" + echo "" + echo "=== BareMetalHosts ===" + oc get bmh -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== InfraEnv Status ===" + oc get infraenv -n {{ spoke_cluster_name }} -o yaml 2>/dev/null | grep -A 20 'status:' + echo "" + echo "=== Recent Events ===" + oc get events -n {{ spoke_cluster_name }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: agent_reg_diag + changed_when: false + failed_when: false + + - name: Display agent registration timeout diagnostics + debug: + msg: "{{ agent_reg_diag.stdout }}" + + - name: Fail after agent registration timeout + fail: + msg: "Expected {{ spoke_ctlplanes }} agents but only {{ agent_count.stdout | default('0') }} registered within timeout" + +- name: Display registered agents + shell: | + oc get agents -n {{ spoke_cluster_name }} + register: agents_info + changed_when: false + +- name: Show registered agents + debug: + msg: "{{ agents_info.stdout }}" + +- block: + - name: Wait for spoke cluster installation to complete + shell: | + ACI_STATE=$(oc get agentclusterinstall {{ spoke_cluster_name }} \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.debugInfo.state}' 2>/dev/null) + echo "$ACI_STATE" + case "$ACI_STATE" in + adding-hosts|installed) + exit 0 + ;; + error|failed) + echo "INSTALL FAILED" + oc get agentclusterinstall {{ spoke_cluster_name }} \ + -n {{ spoke_cluster_name }} \ + -o jsonpath='{.status.conditions}' 2>/dev/null | python3 -m json.tool + exit 2 + ;; + *) + exit 1 + ;; + esac + register: install_state + until: install_state.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + failed_when: install_state.rc == 2 + rescue: + - name: Collect installation timeout diagnostics + shell: | + echo "=== AgentClusterInstall Status ===" + oc get agentclusterinstall {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o yaml 2>/dev/null | grep -A 30 'status:' + echo "" + echo "=== Agents ===" + oc get agents -n {{ spoke_cluster_name }} 2>/dev/null + echo "" + echo "=== ClusterDeployment ===" + oc get clusterdeployment {{ spoke_cluster_name }} -n {{ spoke_cluster_name }} \ + -o yaml 2>/dev/null | grep -A 20 'status:' + echo "" + echo "=== Recent Events ===" + oc get events -n {{ spoke_cluster_name }} --sort-by='.lastTimestamp' 2>/dev/null | tail -20 + register: install_diag + changed_when: false + failed_when: false + + - name: Display installation timeout diagnostics + debug: + msg: "{{ install_diag.stdout }}" + + - 
name: Fail after installation timeout + fail: + msg: "Spoke cluster installation did not complete within timeout. Last state: {{ install_state.stdout_lines[0] | default('unknown') }}" + +- name: Display final installation state + debug: + msg: "Spoke cluster installation state: {{ install_state.stdout_lines[0] }}" + +- block: + - name: Wait for all agents to reach Done stage + shell: | + oc get agents -n {{ spoke_cluster_name }} -o json 2>/dev/null \ + | python3 -c " + import json + import sys + + data = json.load(sys.stdin) + agents = data.get('items', []) + total = len(agents) + done = 0 + stuck = [] + for a in agents: + stage = a.get('status', {}).get('progress', {}).get('currentStage', 'unknown') + if stage == 'Done': + done += 1 + else: + state = a.get('status', {}).get('debugInfo', {}).get('state', 'unknown') + hostname = a.get('spec', {}).get('hostname', 'unknown') + stuck.append(f'{hostname}: state={state}, stage={stage}') + + print(f'Agents Done: {done} / {total}') + for s in stuck: + print(f' {s}') + sys.exit(0 if done == total else 1) + " + register: agents_done + until: agents_done.rc == 0 + retries: "{{ (spoke_install_timeout / 30) | int }}" + delay: 30 + changed_when: false + rescue: + - name: Collect agent completion diagnostics + shell: | + echo "=== Agent Details ===" + oc get agents -n {{ spoke_cluster_name }} -o wide 2>/dev/null + echo "" + echo "=== Agent Progress ===" + oc get agents -n {{ spoke_cluster_name }} -o json 2>/dev/null \ + | python3 -c " + import json, sys + data = json.load(sys.stdin) + for a in data.get('items', []): + name = a.get('spec', {}).get('hostname', 'unknown') + stage = a.get('status', {}).get('progress', {}).get('currentStage', 'unknown') + state = a.get('status', {}).get('debugInfo', {}).get('state', 'unknown') + print(f'{name}: state={state}, stage={stage}') + " 2>/dev/null || echo "Failed to parse agent details" + register: agents_done_diag + changed_when: false + failed_when: false + + - name: Display agent completion diagnostics + debug: + msg: "{{ agents_done_diag.stdout }}" + + - name: Fail after agent completion timeout + fail: + msg: "Not all agents reached Done stage within timeout" + +- name: Display final agent status + shell: | + oc get agents -n {{ spoke_cluster_name }} + register: final_agents + changed_when: false + +- name: Show final agents + debug: + msg: "{{ final_agents.stdout }}" + +- name: Display final BMH status + shell: | + oc get bmh -n {{ spoke_cluster_name }} + register: final_bmh + changed_when: false + +- name: Show final BMH + debug: + msg: "{{ final_bmh.stdout }}" diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 new file mode 100644 index 0000000..0e18b80 --- /dev/null +++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/agentclusterinstall.yml.j2 @@ -0,0 +1,22 @@ +apiVersion: extensions.hive.openshift.io/v1beta1 +kind: AgentClusterInstall +metadata: + name: {{ spoke_cluster_name }} + namespace: {{ spoke_cluster_name }} +spec: + clusterDeploymentRef: + name: {{ spoke_cluster_name }} + imageSetRef: + name: "{{ effective_ocp_version }}.0" + apiVIP: "{{ spoke_api_vip }}" + ingressVIP: "{{ spoke_ingress_vip }}" + platformType: BareMetal + networking: + clusterNetwork: + - cidr: "{{ spoke_cluster_network_cidr }}" + hostPrefix: 23 + serviceNetwork: + - "{{ spoke_service_network_cidr }}" + provisionRequirements: + controlPlaneAgents: {{ 
spoke_ctlplanes }}
+  sshPublicKey: "{{ ssh_pub_key.stdout }}"
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2
new file mode 100644
index 0000000..a31289f
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterdeployment.yml.j2
@@ -0,0 +1,20 @@
+apiVersion: hive.openshift.io/v1
+kind: ClusterDeployment
+metadata:
+  name: {{ spoke_cluster_name }}
+  namespace: {{ spoke_cluster_name }}
+spec:
+  baseDomain: {{ spoke_base_domain }}
+  clusterName: {{ spoke_cluster_name }}
+  clusterInstallRef:
+    group: extensions.hive.openshift.io
+    kind: AgentClusterInstall
+    name: {{ spoke_cluster_name }}
+    version: v1beta1
+  platform:
+    agentBareMetal:
+      agentSelector:
+        matchLabels:
+          cluster: tnf
+  pullSecretRef:
+    name: {{ spoke_cluster_name }}-pull-secret
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2
new file mode 100644
index 0000000..82eed09
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/clusterimageset.yml.j2
@@ -0,0 +1,6 @@
+apiVersion: hive.openshift.io/v1
+kind: ClusterImageSet
+metadata:
+  name: "{{ effective_ocp_version }}.0"
+spec:
+  releaseImage: {{ effective_release_image }}
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2
new file mode 100644
index 0000000..4945f81
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/infraenv.yml.j2
@@ -0,0 +1,15 @@
+apiVersion: agent-install.openshift.io/v1beta1
+kind: InfraEnv
+metadata:
+  name: {{ spoke_cluster_name }}
+  namespace: {{ spoke_cluster_name }}
+spec:
+  clusterRef:
+    name: {{ spoke_cluster_name }}
+    namespace: {{ spoke_cluster_name }}
+  sshAuthorizedKey: "{{ ssh_pub_key.stdout }}"
+  agentLabels:
+    cluster: tnf
+  cpuArchitecture: x86_64
+  pullSecretRef:
+    name: {{ spoke_cluster_name }}-pull-secret
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2 b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2
new file mode 100644
index 0000000..63f2e23
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/templates/spoke-network.xml.j2
@@ -0,0 +1,29 @@
+<network xmlns:dnsmasq="http://libvirt.org/schemas/network/dnsmasq/1.0">
+  <name>{{ spoke_network_name }}</name>
+  <!-- Assumed layout: dedicated NAT network with a DHCP range for the spoke
+       control-plane VMs and DNS entries for the spoke API/ingress names -->
+  <forward mode="nat"/>
+  <bridge name="{{ spoke_network_name }}" stp="on" delay="0"/>
+  <dns>
+    <host ip="{{ spoke_ingress_vip }}">
+      <hostname>apps.{{ spoke_cluster_name }}.{{ spoke_base_domain }}</hostname>
+    </host>
+    <host ip="{{ spoke_api_vip }}">
+      <hostname>api.{{ spoke_cluster_name }}.{{ spoke_base_domain }}</hostname>
+    </host>
+  </dns>
+  <ip address="{{ spoke_network_gateway }}" prefix="{{ spoke_network_prefix }}">
+    <dhcp>
+      <range start="{{ spoke_dhcp_start }}" end="{{ spoke_dhcp_end }}"/>
+    </dhcp>
+  </ip>
+  <dnsmasq:options>
+    <!-- Assumed: wildcard resolution so *.apps names resolve to the ingress VIP inside the spoke network -->
+    <dnsmasq:option value="address=/.apps.{{ spoke_cluster_name }}.{{ spoke_base_domain }}/{{ spoke_ingress_vip }}"/>
+  </dnsmasq:options>
+</network>
diff --git a/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml
new file mode 100644
index 0000000..ce4bd5d
--- /dev/null
+++ b/deploy/openshift-clusters/roles/assisted/assisted-spoke/vars/main.yml
@@ -0,0 +1,17 @@
+---
+# Role-internal variables
+
+# Derived from spoke_network_cidr
+spoke_network_gateway: "{{ spoke_network_cidr | ansible.utils.ipaddr('1') | ansible.utils.ipaddr('address') }}"
+spoke_dhcp_start: "{{ spoke_network_cidr | ansible.utils.ipaddr('50') | ansible.utils.ipaddr('address') }}"
+spoke_dhcp_end: "{{ spoke_network_cidr | ansible.utils.ipaddr('150') | ansible.utils.ipaddr('address') }}"
+spoke_network_prefix: "{{ spoke_network_cidr | ansible.utils.ipaddr('prefix') }}"
+
+# Libvirt 
network name (derived from spoke cluster name) +spoke_network_name: "{{ spoke_cluster_name }}" + +# VM image path +spoke_vm_image_dir: /var/lib/libvirt/images + +# Credential output paths +spoke_auth_dir: "{{ ansible_user_dir }}/{{ spoke_cluster_name }}/auth" diff --git a/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh new file mode 100755 index 0000000..980511b --- /dev/null +++ b/deploy/openshift-clusters/scripts/deploy-fencing-assisted.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +# Get the directory where this script is located +SCRIPT_DIR=$(dirname "$0") +# Get the deploy directory (two levels up from scripts) +DEPLOY_DIR="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +set -o nounset +set -o errexit +set -o pipefail + +# Check if instance data exists +if [[ ! -f "${DEPLOY_DIR}/aws-hypervisor/instance-data/aws-instance-id" ]]; then + echo "Error: No instance found. Please run 'make deploy' first." + exit 1 +fi + +# Check if inventory.ini exists +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/inventory.ini" ]]; then + echo "Error: inventory.ini not found in ${DEPLOY_DIR}/openshift-clusters/" + echo "Please ensure the inventory file is properly configured." + echo "You can run 'make inventory' to update it with current instance information." + exit 1 +fi + +# Check if vars/assisted.yml exists +if [[ ! -f "${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml" ]]; then + echo "Error: vars/assisted.yml not found." + echo "Copy the template and customize it:" + echo " cp ${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml.template ${DEPLOY_DIR}/openshift-clusters/vars/assisted.yml" + exit 1 +fi + +echo "Deploying spoke TNF cluster via assisted installer..." + +cd "${DEPLOY_DIR}/openshift-clusters" + +# Parse spoke_cluster_name from vars/assisted.yml +SPOKE_CLUSTER_NAME=$(grep '^spoke_cluster_name:' vars/assisted.yml | awk '{print $2}' | tr -d '"' | tr -d "'") +if [[ -z "${SPOKE_CLUSTER_NAME}" ]]; then + SPOKE_CLUSTER_NAME="spoke-tnf" +fi + +if ansible-playbook assisted-install.yml -i inventory.ini; then + echo "" + echo "OpenShift spoke TNF cluster deployment via assisted installer completed successfully!" + echo "" + echo "Next steps:" + echo "1. Access spoke cluster:" + echo " source ${DEPLOY_DIR}/openshift-clusters/proxy.env" + echo " KUBECONFIG=~/${SPOKE_CLUSTER_NAME}/auth/kubeconfig oc get nodes" + echo "2. Access hub cluster:" + echo " source ${DEPLOY_DIR}/openshift-clusters/hub-proxy.env" + echo " KUBECONFIG=~/auth/kubeconfig oc get nodes" +else + echo "Error: Spoke cluster deployment failed!" + echo "Check the Ansible logs for more details." + exit 1 +fi diff --git a/deploy/openshift-clusters/vars/assisted.yml.template b/deploy/openshift-clusters/vars/assisted.yml.template new file mode 100644 index 0000000..6c33ce5 --- /dev/null +++ b/deploy/openshift-clusters/vars/assisted.yml.template @@ -0,0 +1,54 @@ +# Assisted Installer Configuration +# Copy this file to vars/assisted.yml and customize as needed +# +# Usage: After deploying a hub cluster with 'make deploy fencing-ipi', +# run 'make fencing-assisted' to deploy a spoke TNF cluster via ACM/assisted installer. 
+ +# Hub operator: "acm" or "mce" +hub_operator: acm + +# ACM/MCE channel: "auto" detects from packagemanifest (recommended) +# Override with specific channel like "release-2.15" if needed +acm_channel: "auto" +mce_channel: "auto" + +# Spoke cluster identity +spoke_cluster_name: spoke-tnf +spoke_base_domain: example.com + +# Spoke OCP version +# "auto" uses the same release image as the hub (recommended) +# Or specify an explicit release image URL +spoke_release_image: "auto" + +# Spoke VM specifications +spoke_vm_memory: 32768 # MB (32GB) +spoke_vm_vcpus: 4 +spoke_vm_disk_size: 120 # GB +spoke_ctlplanes: 2 # Always 2 for TNF + +# Spoke network configuration +# DHCP range is auto-computed as .50-.150 of the CIDR. +# VIPs must be outside that range to avoid conflicts. +spoke_network_cidr: "192.168.125.0/24" +spoke_api_vip: "192.168.125.5" +spoke_ingress_vip: "192.168.125.10" +spoke_cluster_network_cidr: "10.132.0.0/14" +spoke_service_network_cidr: "172.31.0.0/16" + +# Hub network CIDR (for cross-bridge nftables rules between hub and spoke) +# Must match the hub cluster's libvirt network. Default matches dev-scripts. +hub_network_cidr: "192.168.111.0/24" + +# BMC / sushy-tools (defaults match dev-scripts deployment) +spoke_bmc_user: admin +spoke_bmc_password: password +spoke_ksushy_ip: "192.168.111.1" +spoke_ksushy_port: 8000 + +# Storage for assisted service on hub +# Currently only "hostpath" is supported +assisted_storage_method: "hostpath" + +# Deployment options +force_cleanup: false
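
For reference, a minimal sketch of how a deployment driven by this configuration can be monitored and verified from the hypervisor, assuming the default `spoke_cluster_name` of `spoke-tnf` and the hub kubeconfig at `~/auth/kubeconfig` (both defaults come from the files above):

```bash
# Watch the spoke installation from the hub cluster
export KUBECONFIG=~/auth/kubeconfig
oc get agents -n spoke-tnf                    # host discovery and install progress
oc get agentclusterinstall spoke-tnf -n spoke-tnf \
  -o jsonpath='{.status.debugInfo.state}'     # overall installation state
oc get bmh -n spoke-tnf                       # BareMetalHost provisioning status

# After retrieve-credentials.yml has run, access the spoke cluster directly
export KUBECONFIG=~/spoke-tnf/auth/kubeconfig
oc get nodes
```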