From a6bee58a6c5e4c052540d029218ad322cdeaf015 Mon Sep 17 00:00:00 2001
From: Ben Knutson
Date: Wed, 18 Feb 2026 02:52:25 +0000
Subject: [PATCH] Refactor Github Action per b/485167538

---
Notes (this area sits after the "---" separator, outside the commit message):

What the patch does: every `${{ ... }}` expression that used to be
interpolated directly into a `run:` script is moved into a per-step `env:`
entry, and the script reads the resulting shell variable instead.

Review fixes folded into this revision:
  * action.yml: the `tp use` arguments are now double-quoted. The original
    revision kept single quotes around the new shell variables, and bash
    does not expand variables inside single quotes, so `tp` would have
    received the literal text `${INPUTS_GCP_PROJECT}` and so on.
  * reusable_e2e_check.yml: `step_time_lower_bound` / `step_time_upper_bound`
    were still interpolated into the script; they now go through `env:` like
    the other inputs.
  * reusable_e2e_check.yml: variable expansions and `$GITHUB_OUTPUT` are
    quoted, matching the quoting already used in e2e_test.yml.
  * e2e_test.yml: `${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE}` is deliberately
    left unquoted. It is either empty or `--base-docker-url <url>`, so it
    has to word-split into two arguments (or vanish when empty).

NOTE(review): the whitespace of this patch was reconstructed; confirm the
context-line indentation against the tree before applying. The post-image
blob ids on the `index` lines were carried over from the original revision
and no longer describe the resulting files; regenerate with
`git format-patch` after applying.

 .github/actions/e2e-setup/action.yml     | 16 +++++++---
 .github/workflows/e2e_test.yml           | 24 ++++++++++-----
 .github/workflows/reusable_e2e_check.yml | 45 ++++++++++++++++++++++------
 3 files changed, 62 insertions(+), 23 deletions(-)

diff --git a/.github/actions/e2e-setup/action.yml b/.github/actions/e2e-setup/action.yml
index 71eda916..15b7be50 100644
--- a/.github/actions/e2e-setup/action.yml
+++ b/.github/actions/e2e-setup/action.yml
@@ -62,12 +62,18 @@ runs:
     - name: tp use
       run: >
         tp use
-        --project '${{ inputs.gcp_project }}'
-        --zone '${{ inputs.gcp_zone }}'
-        --cluster '${{ inputs.xpk_cluster_name }}'
+        --project "${INPUTS_GCP_PROJECT}"
+        --zone "${INPUTS_GCP_ZONE}"
+        --cluster "${INPUTS_XPK_CLUSTER_NAME}"
         --num-slices 1
-        --artifact-dir '${{ inputs.artifact_dir }}'
-        --tpu-type '${{ inputs.tpu_type }}'
+        --artifact-dir "${INPUTS_ARTIFACT_DIR}"
+        --tpu-type "${INPUTS_TPU_TYPE}"
         --bq-table 'torchprime-e2e-tests'
         --upload-metrics
       shell: bash
+      env:
+        INPUTS_GCP_PROJECT: ${{ inputs.gcp_project }}
+        INPUTS_GCP_ZONE: ${{ inputs.gcp_zone }}
+        INPUTS_XPK_CLUSTER_NAME: ${{ inputs.xpk_cluster_name }}
+        INPUTS_ARTIFACT_DIR: ${{ inputs.artifact_dir }}
+        INPUTS_TPU_TYPE: ${{ inputs.tpu_type }}
diff --git a/.github/workflows/e2e_test.yml b/.github/workflows/e2e_test.yml
index 01d11974..4c4b9b6d 100644
--- a/.github/workflows/e2e_test.yml
+++ b/.github/workflows/e2e_test.yml
@@ -53,11 +53,13 @@ jobs:
       - name: Setup Docker URL option
         id: docker-url-option
         run: |
-          if [ -n "${{ github.event.inputs.docker_url }}" ]; then
-            echo "value=--base-docker-url ${{ github.event.inputs.docker_url }}" >> "$GITHUB_OUTPUT"
+          if [ -n "${GITHUB_EVENT_INPUTS_DOCKER_URL}" ]; then
+            echo "value=--base-docker-url ${GITHUB_EVENT_INPUTS_DOCKER_URL}" >> "$GITHUB_OUTPUT"
           else
             echo "value=" >> "$GITHUB_OUTPUT"
           fi
+        env:
+          GITHUB_EVENT_INPUTS_DOCKER_URL: ${{ github.event.inputs.docker_url }}
 
       # Launch training workloads.
 
@@ -67,10 +69,11 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           XLA_IR_DEBUG: 1
           XLA_HLO_DEBUG: 1
+          STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
         run: |
           name=$(e2e_testing/gen_name.py llama-3-8b)
           echo "name=$name" >> "$GITHUB_OUTPUT"
-          tp run ${{ steps.docker-url-option.outputs.value }} \
+          tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
             --name $name \
             torchprime/torch_xla_models/train.py \
             model=llama-3-8b \
@@ -86,10 +89,11 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           XLA_IR_DEBUG: 1
           XLA_HLO_DEBUG: 1
+          STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
         run: |
           name=$(e2e_testing/gen_name.py llama-3dot1-8b-sa)
           echo "name=$name" >> "$GITHUB_OUTPUT"
-          tp run ${{ steps.docker-url-option.outputs.value }} \
+          tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
             --name $name \
             torchprime/torch_xla_models/train.py \
             model=llama-3.1-8b \
@@ -106,10 +110,11 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           XLA_IR_DEBUG: 1
           XLA_HLO_DEBUG: 1
+          STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
         run: |
           name=$(e2e_testing/gen_name.py llama-3dot1-8b-sa)
           echo "name=$name" >> "$GITHUB_OUTPUT"
-          tp run ${{ steps.docker-url-option.outputs.value }} \
+          tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
             --name $name \
             torchprime/torch_xla_models/train.py \
             model=llama-3.1-8b \
@@ -126,10 +131,11 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           XLA_IR_DEBUG: 1
           XLA_HLO_DEBUG: 1
+          STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
         run: |
           name=$(e2e_testing/gen_name.py llama-3-8b-2d)
           echo "name=$name" >> "$GITHUB_OUTPUT"
-          tp run ${{ steps.docker-url-option.outputs.value }} \
+          tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
             --name $name \
             torchprime/torch_xla_models/train.py \
             model=llama-3-8b \
@@ -147,10 +153,11 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           XLA_IR_DEBUG: 1
           XLA_HLO_DEBUG: 1
+          STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
         run: |
           name=$(e2e_testing/gen_name.py mixtral-8x7b)
           echo "name=$name" >> "$GITHUB_OUTPUT"
-          tp run ${{ steps.docker-url-option.outputs.value }} \
+          tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
             --name $name \
             torchprime/torch_xla_models/train.py \
             model=mixtral-8x7b \
@@ -167,10 +174,11 @@ jobs:
           HF_TOKEN: ${{ secrets.HF_TOKEN }}
           XLA_IR_DEBUG: 1
           XLA_HLO_DEBUG: 1
+          STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
         run: |
           name=$(e2e_testing/gen_name.py llama-3-8b-2-slice)
           echo "name=$name" >> "$GITHUB_OUTPUT"
-          tp run ${{ steps.docker-url-option.outputs.value }} \
+          tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
             --name $name \
             --num-slices 2 \
             torchprime/torch_xla_models/train.py \
diff --git a/.github/workflows/reusable_e2e_check.yml b/.github/workflows/reusable_e2e_check.yml
index 67b386d5..010497fc 100644
--- a/.github/workflows/reusable_e2e_check.yml
+++ b/.github/workflows/reusable_e2e_check.yml
@@ -41,34 +41,59 @@ jobs:
           gcp_sa_key: ${{ secrets.GCP_SA_KEY }}
       - name: Get GKE credentials
         run: |
-          gcloud container clusters get-credentials ${{ vars.XPK_CLUSTER_NAME }} --region=${{ vars.GCP_ZONE }} --project=${{ vars.GCP_PROJECT }}
+          gcloud container clusters get-credentials "${VARS_XPK_CLUSTER_NAME}" --region="${VARS_GCP_ZONE}" --project="${VARS_GCP_PROJECT}"
           kubectl config view
           kubectl config set-context --current --namespace=default
+        env:
+          VARS_XPK_CLUSTER_NAME: ${{ vars.XPK_CLUSTER_NAME }}
+          VARS_GCP_ZONE: ${{ vars.GCP_ZONE }}
+          VARS_GCP_PROJECT: ${{ vars.GCP_PROJECT }}
       - name: Get pod name
         id: get_pod_name
         run: |
-          pod_name=$(kubectl get pods -l jobset.sigs.k8s.io/jobset-name=${{ inputs.jobset_name }} -o json | jq --raw-output '.items[0].metadata.name')
-          echo "pod_name=$pod_name" >> $GITHUB_OUTPUT
+          pod_name=$(kubectl get pods -l "jobset.sigs.k8s.io/jobset-name=${INPUTS_JOBSET_NAME}" -o json | jq --raw-output '.items[0].metadata.name')
+          echo "pod_name=$pod_name" >> "$GITHUB_OUTPUT"
+        env:
+          INPUTS_JOBSET_NAME: ${{ inputs.jobset_name }}
       - name: Wait for workload to start
         run: |
-          kubectl wait "pod/${{ steps.get_pod_name.outputs.pod_name }}" \
+          kubectl wait "pod/${STEPS_GET_POD_NAME_OUTPUTS_POD_NAME}" \
             --for='jsonpath={.status.containerStatuses[?(@.name=="jax-tpu")].state.running}' \
             --timeout="60m"
+        env:
+          STEPS_GET_POD_NAME_OUTPUTS_POD_NAME: ${{ steps.get_pod_name.outputs.pod_name }}
       - name: Stream logs
         run: |
           # Save logs to a file for later checks
-          kubectl logs -c jax-tpu -f ${{ steps.get_pod_name.outputs.pod_name }} | tee /tmp/pod-${{ steps.get_pod_name.outputs.pod_name }}.log
+          kubectl logs -c jax-tpu -f "${STEPS_GET_POD_NAME_OUTPUTS_POD_NAME}" | tee "/tmp/pod-${STEPS_GET_POD_NAME_OUTPUTS_POD_NAME}.log"
+        env:
+          STEPS_GET_POD_NAME_OUTPUTS_POD_NAME: ${{ steps.get_pod_name.outputs.pod_name }}
       - name: Wait for workload to complete
         run: |
-          xpk workload list --cluster ${{ vars.XPK_CLUSTER_NAME }} --wait-for-job-completion=${{ inputs.jobset_name }} --project ${{ vars.GCP_PROJECT }} --zone ${{ vars.GCP_ZONE }}
+          xpk workload list --cluster "${VARS_XPK_CLUSTER_NAME}" --wait-for-job-completion="${INPUTS_JOBSET_NAME}" --project "${VARS_GCP_PROJECT}" --zone "${VARS_GCP_ZONE}"
+        env:
+          VARS_XPK_CLUSTER_NAME: ${{ vars.XPK_CLUSTER_NAME }}
+          INPUTS_JOBSET_NAME: ${{ inputs.jobset_name }}
+          VARS_GCP_PROJECT: ${{ vars.GCP_PROJECT }}
+          VARS_GCP_ZONE: ${{ vars.GCP_ZONE }}
       - name: Validate logs
         run: |
-          e2e_testing/check_logs.py /tmp/pod-${{ steps.get_pod_name.outputs.pod_name }}.log
+          e2e_testing/check_logs.py "/tmp/pod-${STEPS_GET_POD_NAME_OUTPUTS_POD_NAME}.log"
+        env:
+          STEPS_GET_POD_NAME_OUTPUTS_POD_NAME: ${{ steps.get_pod_name.outputs.pod_name }}
       - name: Validate profile
         run: |
-          profile_dir="${{ inputs.artifact_dir }}/${{ inputs.jobset_name }}/profile/0-0"
+          profile_dir="${INPUTS_ARTIFACT_DIR}/${INPUTS_JOBSET_NAME}/profile/0-0"
           e2e_testing/check_profile.py "$profile_dir"
+        env:
+          INPUTS_ARTIFACT_DIR: ${{ inputs.artifact_dir }}
+          INPUTS_JOBSET_NAME: ${{ inputs.jobset_name }}
       - name: Validate metrics
         run: |
-          output_dir="${{ inputs.artifact_dir }}/${{ inputs.jobset_name }}/outputs/0-0"
-          e2e_testing/check_step_time.py "$output_dir" "${{ inputs.step_time_lower_bound }}" "${{ inputs.step_time_upper_bound }}"
+          output_dir="${INPUTS_ARTIFACT_DIR}/${INPUTS_JOBSET_NAME}/outputs/0-0"
+          e2e_testing/check_step_time.py "$output_dir" "${INPUTS_STEP_TIME_LOWER_BOUND}" "${INPUTS_STEP_TIME_UPPER_BOUND}"
+        env:
+          INPUTS_ARTIFACT_DIR: ${{ inputs.artifact_dir }}
+          INPUTS_JOBSET_NAME: ${{ inputs.jobset_name }}
+          INPUTS_STEP_TIME_LOWER_BOUND: ${{ inputs.step_time_lower_bound }}
+          INPUTS_STEP_TIME_UPPER_BOUND: ${{ inputs.step_time_upper_bound }}