Hello, I am trying to train StreamVLN. My GPU machine configuration is:
(streamvln) root@autodl-container-c41f44af49-5ab1d942:~/autodl-tmp/StreamVLN# nvidia-smi
Tue Nov 25 10:57:32 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 565.57.01 Driver Version: 565.57.01 CUDA Version: 12.7 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA H20 On | 00000000:67:01.0 Off | Off |
| N/A 36C P0 73W / 500W | 1MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA H20 On | 00000000:69:01.0 Off | Off |
| N/A 36C P0 72W / 500W | 1MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA H20 On | 00000000:71:01.0 Off | Off |
| N/A 44C P0 75W / 500W | 1MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA H20 On | 00000000:73:01.0 Off | Off |
| N/A 37C P0 74W / 500W | 1MiB / 97871MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
My training script is:
#!/bin/bash
export HF_HUB_OFFLINE=1
export HF_HOME=$PWD/checkpoints/hf_home/
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128,garbage_collection_threshold:0.6"
export NCCL_IB_DISABLE=1
export NCCL_P2P_DISABLE=1
export NCCL_DEBUG=INFO
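# Note: NCCL_IB_DISABLE / NCCL_P2P_DISABLE turn off the InfiniBand and GPU
# peer-to-peer (NVLink/PCIe) transports, so NCCL falls back to shared-memory /
# socket communication; NCCL_DEBUG=INFO produces the "NCCL INFO ..." lines
# seen in the log below.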
# Distributed training parameters
NNODES=${NNODES:-1} # Number of nodes, default 1 (single node)
NPROC_PER_NODE=${NPROC_PER_NODE:-8} # GPUs per node, default 8
MASTER_ADDR=${MASTER_ADDR:-127.0.0.1} # Master node address, default localhost for single node
MASTER_PORT=${MASTER_PORT:-12000} # Default port
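# Example (hypothetical invocation, not from the run above): the defaults can be
# overridden from the environment, e.g.
#   NNODES=1 NPROC_PER_NODE=4 MASTER_PORT=29501 bash <this_script>.sh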
# Auto-select a large free port if unset or occupied
if command -v python3 >/dev/null 2>&1; then
    SEL_PORT=$(python3 - <<'PY'
import os, socket, random, sys

def is_free(port: int) -> bool:
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        try:
            s.bind(("", port))
            return True
        except OSError:
            return False

env_port = os.environ.get("MASTER_PORT")
if env_port is not None:
    try:
        p = int(env_port)
        if 1024 <= p <= 65535 and is_free(p):
            print(p)
            sys.exit(0)
    except Exception:
        pass

start = int(os.environ.get("PORT_START", 29500))
end = int(os.environ.get("PORT_END", 65535))
for _ in range(2048):
    p = random.randint(start, end)
    if is_free(p):
        print(p)
        sys.exit(0)
print(0)
PY
)
    if [ -n "$SEL_PORT" ] && [ "$SEL_PORT" != "0" ]; then
        MASTER_PORT=$SEL_PORT
    fi
fi
echo "=== Distributed training config ==="
echo "Num nodes: $NNODES"
echo "NPROC_PER_NODE: $NPROC_PER_NODE"
echo "Master endpoint: $MASTER_ADDR:$MASTER_PORT"
echo "HF cache dir: $HF_HOME"
VIDEO_FOLDER="data/trajectory_data/R2R","data/trajectory_data/RxR","data/trajectory_data/EnvDrop"
#VIDEO_FOLDER="data/trajectory_data/R2R","data/trajectory_data/RxR"
# VIDEO_FOLDER="data/trajectory_data/R2R_small_1118"
LLM_VERSION="Qwen/Qwen2-7B-Instruct"
# POSIX-compatible replacement: replace '/' with '_' using tr
LLM_VERSION_CLEAN="$(printf '%s' "$LLM_VERSION" | tr '/' '_')"
VISION_MODEL_VERSION="google/siglip-so400m-patch14-384"
VISION_MODEL_VERSION_CLEAN="$(printf '%s' "$VISION_MODEL_VERSION" | tr '/' '_')"
############### Pretrain ################
BASE_RUN_NAME="llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-mlp2x_gelu-pretrain_blip558k_plain"
echo "BASE_RUN_NAME: ${BASE_RUN_NAME}"
############### Finetune ################
PROMPT_VERSION="qwen_1_5"
MID_RUN_NAME="StreamVLN_Video_${PROMPT_VERSION}_1epoch_196token_8history_32frame"
PREV_STAGE_CHECKPOINT="checkpoints/lmms-lab/LLaVA-Video-7B-Qwen2"
echo "PREV_STAGE_CHECKPOINT: ${PREV_STAGE_CHECKPOINT}"
echo "MID_RUN_NAME: ${MID_RUN_NAME}"
# Detect GPU count: prefer CUDA_VISIBLE_DEVICES, then nvidia-smi, then Python torch
GPU_COUNT=""
if [ -n "${CUDA_VISIBLE_DEVICES:-}" ]; then
_CVD_CLEAN=$(echo "$CUDA_VISIBLE_DEVICES" | tr -d '[:space:]' | sed 's/,,*/,/g;s/^,//;s/,$//')
if [ -n "$_CVD_CLEAN" ]; then
GPU_COUNT=$(echo "$_CVD_CLEAN" | awk -F, '{print NF}')
else
GPU_COUNT=0
fi
elif command -v nvidia-smi >/dev/null 2>&1; then
GPU_COUNT=$(nvidia-smi -L 2>/dev/null | wc -l | tr -d '[:space:]')
fi
if [ -z "$GPU_COUNT" ] || ! echo "$GPU_COUNT" | grep -Eq '^[0-9]+$'; then
if command -v python3 >/dev/null 2>&1; then
GPU_COUNT=$(python3 - <<'PY'
try:
import torch
print(torch.cuda.device_count())
except Exception:
print(-1)
PY
)
GPU_COUNT=$(echo "$GPU_COUNT" | tr -d '[:space:]')
fi
fi
if ! echo "$GPU_COUNT" | grep -Eq '^[0-9]+$'; then
echo "Error: Unable to determine GPU count. Set CUDA_VISIBLE_DEVICES or install NVIDIA drivers."
exit 1
fi
if [ "$GPU_COUNT" -le 0 ]; then
echo "Error: No GPUs visible to the process."
exit 1
fi
echo "Detected GPUs: $GPU_COUNT"
if [ "$GPU_COUNT" -lt "$NPROC_PER_NODE" ]; then
echo "Warning: NPROC_PER_NODE ($NPROC_PER_NODE) > available GPUs ($GPU_COUNT)"
echo "Auto-adjust NPROC_PER_NODE to available GPUs"
NPROC_PER_NODE=$GPU_COUNT
fi
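# On this machine (nvidia-smi shows 4 H20s) the default NPROC_PER_NODE=8 trips
# the check above, so NPROC_PER_NODE is reduced to 4 local processes.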
echo "========================"
torchrun --nnodes=$NNODES --nproc_per_node=$NPROC_PER_NODE \
--rdzv_id=12345 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR:$MASTER_PORT streamvln/streamvln_train.py \
--deepspeed scripts/zero2.json \
--model_name_or_path $PREV_STAGE_CHECKPOINT \
--version $PROMPT_VERSION \
--video_folder ${VIDEO_FOLDER} \
--group_by_task False \
\
--num_history 8 \
--num_future_steps 4 \
--num_frames 32 \
--data_augmentation True \
\
--mm_tunable_parts="mm_vision_tower,mm_mlp_adapter,mm_language_model" \
--vision_tower ${VISION_MODEL_VERSION} \
--mm_projector_type mlp2x_gelu \
--mm_vision_select_layer -2 \
--mm_use_im_start_end False \
--mm_use_im_patch_token False \
--image_aspect_ratio anyres_max_9 \
--image_grid_pinpoints "(1x1),...,(6x6)" \
--bf16 True \
--run_name $MID_RUN_NAME \
--output_dir checkpoints/$MID_RUN_NAME \
--num_train_epochs 1 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 4 \
--gradient_accumulation_steps 2 \
--evaluation_strategy "no" \
--save_strategy "steps" \
--save_steps 10000 \
--save_total_limit 1 \
--learning_rate 2e-5 \
--mm_vision_tower_lr 5e-6 \
--weight_decay 0. \
--warmup_ratio 0.075 \
--lr_scheduler_type "cosine_with_min_lr" \
--lr_scheduler_kwargs '{"min_lr": 1.85e-05}' \
--logging_steps 10 \
--tf32 True \
--model_max_length 32768 \
--gradient_checkpointing True \
--dataloader_num_workers 8 \
--lazy_preprocess True \
--torch_compile True \
--torch_compile_backend "inductor" \
--dataloader_drop_last True \
--report_to tensorboard
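# With 4 processes, per_device_train_batch_size=1 and gradient_accumulation_steps=2,
# the effective global batch size is 4 * 1 * 2 = 8.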
But after about 250 training steps I got the following error:
0%|_ | 252/59913 [27:07<106:21:36, 6.42s/it]
autodl-container-c41f44af49-5ab1d942:14777:18232 [0] NCCL INFO [Service thread] Connection closed by localRank 1
[2025-11-25 10:54:43,184] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 14777 closing signal SIGTERM
[2025-11-25 10:54:43,185] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 14779 closing signal SIGTERM
[2025-11-25 10:54:43,188] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 14780 closing signal SIGTERM
[2025-11-25 10:54:43,191] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: -8) local_rank: 1 (pid: 14778) of binary: /root/miniconda3/envs/streamvln/bin/python3.9
Traceback (most recent call last):
File "/root/miniconda3/envs/streamvln/bin/torchrun", line 7, in <module>
sys.exit(main())
File "/root/miniconda3/envs/streamvln/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/root/miniconda3/envs/streamvln/lib/python3.9/site-packages/torch/distributed/run.py", line 806, in main
run(args)
File "/root/miniconda3/envs/streamvln/lib/python3.9/site-packages/torch/distributed/run.py", line 797, in run
elastic_launch(
File "/root/miniconda3/envs/streamvln/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/envs/streamvln/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 264, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
=====================================================
streamvln/streamvln_train.py FAILED
-----------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
-----------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2025-11-25_10:54:43
host : autodl-container-c41f44af49-5ab1d942
rank : 1 (local_rank: 1)
exitcode : -8 (pid: 14778)
error_file: <N/A>
traceback : Signal 8 (SIGFPE) received by PID 14778
=====================================================
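For reference, exitcode -8 means local rank 1 was killed by SIGFPE (a floating-point exception, often an integer division or modulo by zero in native code), so no Python traceback is printed. A minimal sketch of what I could add to the launch script to get more context on the next run (these are assumptions on my side, not something from the run above):

# Hypothetical debugging additions, not part of the original script:
ulimit -c unlimited            # allow core dumps from the crashing worker
export PYTHONFAULTHANDLER=1    # faulthandler dumps each worker's Python stack on SIGFPE/SIGSEGV/SIGABRT

Any pointers on what could raise SIGFPE here would be appreciated.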