Skip to content
This repository was archived by the owner on Mar 3, 2026. It is now read-only.
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions .github/actions/e2e-setup/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,18 @@ runs:
# Configures torchprime (`tp use`) from the action inputs.
# The inputs are handed to the script through `env` instead of being interpolated
# into the shell text. The environment variables must be wrapped in DOUBLE quotes:
# bash does not expand ${VAR} inside single quotes, so the single-quoted form would
# pass the literal text "${INPUTS_GCP_PROJECT}" etc. to `tp use`.
- name: tp use
run: >
tp use
--project '${{ inputs.gcp_project }}'
--zone '${{ inputs.gcp_zone }}'
--cluster '${{ inputs.xpk_cluster_name }}'
--project "${INPUTS_GCP_PROJECT}"
--zone "${INPUTS_GCP_ZONE}"
--cluster "${INPUTS_XPK_CLUSTER_NAME}"
--num-slices 1
--artifact-dir '${{ inputs.artifact_dir }}'
--tpu-type '${{ inputs.tpu_type }}'
--artifact-dir "${INPUTS_ARTIFACT_DIR}"
--tpu-type "${INPUTS_TPU_TYPE}"
--bq-table 'torchprime-e2e-tests'
--upload-metrics
shell: bash
env:
# Action inputs exposed to the script as environment variables.
INPUTS_GCP_PROJECT: ${{ inputs.gcp_project }}
INPUTS_GCP_ZONE: ${{ inputs.gcp_zone }}
INPUTS_XPK_CLUSTER_NAME: ${{ inputs.xpk_cluster_name }}
INPUTS_ARTIFACT_DIR: ${{ inputs.artifact_dir }}
INPUTS_TPU_TYPE: ${{ inputs.tpu_type }}
24 changes: 16 additions & 8 deletions .github/workflows/e2e_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,11 +53,13 @@ jobs:
# Emits the step output `value`: "--base-docker-url <url>" when the workflow was
# dispatched with a `docker_url` input, otherwise an empty string.
# The input is read from an environment variable rather than interpolated into the
# script. The variable deliberately avoids the `GITHUB_` prefix, which GitHub
# reserves for its own variables, and follows the `INPUTS_*` naming used elsewhere.
- name: Setup Docker URL option
id: docker-url-option
run: |
if [ -n "${{ github.event.inputs.docker_url }}" ]; then
echo "value=--base-docker-url ${{ github.event.inputs.docker_url }}" >> "$GITHUB_OUTPUT"
if [ -n "${INPUTS_DOCKER_URL}" ]; then
echo "value=--base-docker-url ${INPUTS_DOCKER_URL}" >> "$GITHUB_OUTPUT"
else
echo "value=" >> "$GITHUB_OUTPUT"
fi
env:
INPUTS_DOCKER_URL: ${{ github.event.inputs.docker_url }}

# Launch training workloads.

Expand All @@ -67,10 +69,11 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
XLA_IR_DEBUG: 1
XLA_HLO_DEBUG: 1
STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
run: |
name=$(e2e_testing/gen_name.py llama-3-8b)
echo "name=$name" >> "$GITHUB_OUTPUT"
tp run ${{ steps.docker-url-option.outputs.value }} \
tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
--name $name \
torchprime/torch_xla_models/train.py \
model=llama-3-8b \
Expand All @@ -86,10 +89,11 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
XLA_IR_DEBUG: 1
XLA_HLO_DEBUG: 1
STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
run: |
name=$(e2e_testing/gen_name.py llama-3dot1-8b-sa)
echo "name=$name" >> "$GITHUB_OUTPUT"
tp run ${{ steps.docker-url-option.outputs.value }} \
tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
--name $name \
torchprime/torch_xla_models/train.py \
model=llama-3.1-8b \
Expand All @@ -106,10 +110,11 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
XLA_IR_DEBUG: 1
XLA_HLO_DEBUG: 1
STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
run: |
name=$(e2e_testing/gen_name.py llama-3dot1-8b-sa)
echo "name=$name" >> "$GITHUB_OUTPUT"
tp run ${{ steps.docker-url-option.outputs.value }} \
tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
--name $name \
torchprime/torch_xla_models/train.py \
model=llama-3.1-8b \
Expand All @@ -126,10 +131,11 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
XLA_IR_DEBUG: 1
XLA_HLO_DEBUG: 1
STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
run: |
name=$(e2e_testing/gen_name.py llama-3-8b-2d)
echo "name=$name" >> "$GITHUB_OUTPUT"
tp run ${{ steps.docker-url-option.outputs.value }} \
tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
--name $name \
torchprime/torch_xla_models/train.py \
model=llama-3-8b \
Expand All @@ -147,10 +153,11 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
XLA_IR_DEBUG: 1
XLA_HLO_DEBUG: 1
STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
run: |
name=$(e2e_testing/gen_name.py mixtral-8x7b)
echo "name=$name" >> "$GITHUB_OUTPUT"
tp run ${{ steps.docker-url-option.outputs.value }} \
tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
--name $name \
torchprime/torch_xla_models/train.py \
model=mixtral-8x7b \
Expand All @@ -167,10 +174,11 @@ jobs:
HF_TOKEN: ${{ secrets.HF_TOKEN }}
XLA_IR_DEBUG: 1
XLA_HLO_DEBUG: 1
STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE: ${{ steps.docker-url-option.outputs.value }}
run: |
name=$(e2e_testing/gen_name.py llama-3-8b-2-slice)
echo "name=$name" >> "$GITHUB_OUTPUT"
tp run ${{ steps.docker-url-option.outputs.value }} \
tp run ${STEPS_DOCKER_URL_OPTION_OUTPUTS_VALUE} \
--name $name \
--num-slices 2 \
torchprime/torch_xla_models/train.py \
Expand Down
39 changes: 31 additions & 8 deletions .github/workflows/reusable_e2e_check.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,34 +41,57 @@ jobs:
gcp_sa_key: ${{ secrets.GCP_SA_KEY }}
# Fetches credentials for the cluster named by the repository variables, prints the
# resulting kubectl config, and makes `default` the namespace of the current context.
# The variables are double-quoted so their values are passed as single arguments
# (no word splitting or glob expansion).
- name: Get GKE credentials
run: |
gcloud container clusters get-credentials ${{ vars.XPK_CLUSTER_NAME }} --region=${{ vars.GCP_ZONE }} --project=${{ vars.GCP_PROJECT }}
gcloud container clusters get-credentials "${VARS_XPK_CLUSTER_NAME}" --region="${VARS_GCP_ZONE}" --project="${VARS_GCP_PROJECT}"
kubectl config view
kubectl config set-context --current --namespace=default
env:
VARS_XPK_CLUSTER_NAME: ${{ vars.XPK_CLUSTER_NAME }}
VARS_GCP_ZONE: ${{ vars.GCP_ZONE }}
VARS_GCP_PROJECT: ${{ vars.GCP_PROJECT }}
# Looks up the pods labelled with the jobset name and exposes the name of the first
# one as the step output `pod_name`.
# The label selector and the $GITHUB_OUTPUT path are double-quoted so that neither is
# subject to word splitting or glob expansion.
- name: Get pod name
id: get_pod_name
run: |
pod_name=$(kubectl get pods -l jobset.sigs.k8s.io/jobset-name=${{ inputs.jobset_name }} -o json | jq --raw-output '.items[0].metadata.name')
pod_name=$(kubectl get pods -l "jobset.sigs.k8s.io/jobset-name=${INPUTS_JOBSET_NAME}" -o json | jq --raw-output '.items[0].metadata.name')
echo "pod_name=$pod_name" >> "$GITHUB_OUTPUT"
env:
INPUTS_JOBSET_NAME: ${{ inputs.jobset_name }}
# Blocks until the pod found by the `get_pod_name` step reports a `running` state for
# its `jax-tpu` container, giving up after 60 minutes.
- name: Wait for workload to start
run: |
kubectl wait "pod/${{ steps.get_pod_name.outputs.pod_name }}" \
kubectl wait "pod/${STEPS_GET_POD_NAME_OUTPUTS_POD_NAME}" \
--for='jsonpath={.status.containerStatuses[?(@.name=="jax-tpu")].state.running}' \
--timeout="60m"
env:
# Pod name produced by the `get_pod_name` step, passed in via the environment.
STEPS_GET_POD_NAME_OUTPUTS_POD_NAME: ${{ steps.get_pod_name.outputs.pod_name }}
# Follows the `jax-tpu` container logs of the pod and copies them to
# /tmp/pod-<pod name>.log while streaming them to the job output.
# The pod name and the log path are double-quoted so they are each passed as a single
# argument.
- name: Stream logs
run: |
# Save logs to a file for later checks
kubectl logs -c jax-tpu -f ${{ steps.get_pod_name.outputs.pod_name }} | tee /tmp/pod-${{ steps.get_pod_name.outputs.pod_name }}.log
kubectl logs -c jax-tpu -f "${STEPS_GET_POD_NAME_OUTPUTS_POD_NAME}" | tee "/tmp/pod-${STEPS_GET_POD_NAME_OUTPUTS_POD_NAME}.log"
env:
STEPS_GET_POD_NAME_OUTPUTS_POD_NAME: ${{ steps.get_pod_name.outputs.pod_name }}
# Runs `xpk workload list` with --wait-for-job-completion for the jobset.
# All values come from environment variables and are double-quoted so each one is
# passed as a single argument.
- name: Wait for workload to complete
run: |
xpk workload list --cluster ${{ vars.XPK_CLUSTER_NAME }} --wait-for-job-completion=${{ inputs.jobset_name }} --project ${{ vars.GCP_PROJECT }} --zone ${{ vars.GCP_ZONE }}
xpk workload list --cluster "${VARS_XPK_CLUSTER_NAME}" --wait-for-job-completion="${INPUTS_JOBSET_NAME}" --project "${VARS_GCP_PROJECT}" --zone "${VARS_GCP_ZONE}"
env:
VARS_XPK_CLUSTER_NAME: ${{ vars.XPK_CLUSTER_NAME }}
INPUTS_JOBSET_NAME: ${{ inputs.jobset_name }}
VARS_GCP_PROJECT: ${{ vars.GCP_PROJECT }}
VARS_GCP_ZONE: ${{ vars.GCP_ZONE }}
# Runs e2e_testing/check_logs.py on the log file at /tmp/pod-<pod name>.log.
# The path is double-quoted so it is passed as a single argument.
- name: Validate logs
run: |
e2e_testing/check_logs.py /tmp/pod-${{ steps.get_pod_name.outputs.pod_name }}.log
e2e_testing/check_logs.py "/tmp/pod-${STEPS_GET_POD_NAME_OUTPUTS_POD_NAME}.log"
env:
STEPS_GET_POD_NAME_OUTPUTS_POD_NAME: ${{ steps.get_pod_name.outputs.pod_name }}
# Runs e2e_testing/check_profile.py on <artifact_dir>/<jobset_name>/profile/0-0.
- name: Validate profile
run: |
profile_dir="${{ inputs.artifact_dir }}/${{ inputs.jobset_name }}/profile/0-0"
profile_dir="${INPUTS_ARTIFACT_DIR}/${INPUTS_JOBSET_NAME}/profile/0-0"
e2e_testing/check_profile.py "$profile_dir"
env:
# Workflow inputs passed to the script through the environment.
INPUTS_ARTIFACT_DIR: ${{ inputs.artifact_dir }}
INPUTS_JOBSET_NAME: ${{ inputs.jobset_name }}
# Runs e2e_testing/check_step_time.py on <artifact_dir>/<jobset_name>/outputs/0-0 with
# the lower and upper step-time bounds as arguments.
# Every workflow input, including the two bounds, is passed through `env` so that no
# `${{ ... }}` expression is expanded directly inside the shell script.
- name: Validate metrics
run: |
output_dir="${{ inputs.artifact_dir }}/${{ inputs.jobset_name }}/outputs/0-0"
output_dir="${INPUTS_ARTIFACT_DIR}/${INPUTS_JOBSET_NAME}/outputs/0-0"
e2e_testing/check_step_time.py "$output_dir" "${INPUTS_STEP_TIME_LOWER_BOUND}" "${INPUTS_STEP_TIME_UPPER_BOUND}"
env:
INPUTS_ARTIFACT_DIR: ${{ inputs.artifact_dir }}
INPUTS_JOBSET_NAME: ${{ inputs.jobset_name }}
INPUTS_STEP_TIME_LOWER_BOUND: ${{ inputs.step_time_lower_bound }}
INPUTS_STEP_TIME_UPPER_BOUND: ${{ inputs.step_time_upper_bound }}
Loading