script:
#!/bin/bash
set -ex
# prevent Ray from buffering stdout/stderr
export PYTHONUNBUFFERED=1
# force synchronous kernel launches so errors are reported at the line where they actually occur
export CUDA_LAUNCH_BLOCKING=1
# enable cuBLAS logging for more detailed low-level error information
export CUBLAS_LOGINFO_DBG=1
export CUBLAS_LOGDEST_DBG=stderr
export NVTE_FUSED_ATTN=0
PROJECT_NAME="slime_nemo_post_training_v3_sft"
EXPR_NAME="qwen3-4b-base-sft"
SUBMIT_TIME=$(date +%y%m%d_%H%M%S)
USER_ID=${USER_ID:-"xiang.long"}
export USER_ROOT=/cpfs01/${USER_ID}
export CKPT_DIR=${USER_ROOT}/checkpoints/${PROJECT_NAME}/${EXPR_NAME}
export TENSORBOARD_DIR=${USER_ROOT}/tensorboards/${PROJECT_NAME}/${EXPR_NAME}
export LOG_DIR=${USER_ROOT}/logs/${PROJECT_NAME}/${EXPR_NAME}
mkdir -p $CKPT_DIR
mkdir -p $TENSORBOARD_DIR
mkdir -p $LOG_DIR
NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
HAS_NVLINK=1
else
HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}/")" &>/dev/null && pwd)"
echo $SCRIPT_DIR
BASE_DIR=$(realpath "$SCRIPT_DIR/../../../")
echo $BASE_DIR
source "${BASE_DIR}/scripts/models/qwen3-4B.sh"
#source "${BASE_DIR}/scripts/models/qwen3-4B-Instruct-2507.sh"
CKPT_ARGS=(
--hf-checkpoint /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
--ref-load /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
--load /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
#--load $CKPT_DIR
--save $CKPT_DIR
--megatron-to-hf-mode bridge
--save-interval 1000
--loss-mask-type qwen3_fix
)
SFT_ARGS=(
--rollout-function-path slime.rollout.sft_rollout.generate_rollout
#--prompt-data /cpfs01/haoqingwang/nvidia_datasets/SFT-Merge/post_train_sft_token_blend.jsonl
--prompt-data /cpfs01/datasets/haoqing_nv_sft_merge_post_train_sft_token_blend_debug_10k.jsonl
--input-key messages
--tool-key tools
# --apply-chat-template
--rollout-shuffle
--num-epoch 2
--rollout-batch-size 512
--global-batch-size 512
--loss-type sft_loss
--calculate-per-token-loss
--disable-compute-advantages-and-returns
--debug-train-only
)
PERF_ARGS=(
--tensor-model-parallel-size 2
--sequence-parallel
--pipeline-model-parallel-size 1
--context-parallel-size 4
--expert-model-parallel-size 1
--expert-tensor-parallel-size 1
--recompute-granularity full
--recompute-method uniform
--recompute-num-layers 1
# --micro-batch-size 1
--use-dynamic-batch-size
--max-tokens-per-gpu 10368
)
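# Layout sanity check (sketch, assuming 8 GPUs per node as set via --actor-num-gpus-per-node below):
# TP(2) x CP(4) x PP(1) = 8 GPUs per model replica, so the data-parallel size equals the node count (WORLD_SIZE)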
OPTIMIZER_ARGS=(
--optimizer adam
--lr 1e-5
--lr-decay-style cosine
--min-lr 1e-6
--lr-warmup-fraction 0.1
--weight-decay 0.1
--adam-beta1 0.9
--adam-beta2 0.95
)
WANDB_ARGS=(
# --use-wandb
# --wandb-project slime-dev
# --wandb-group qwen3-4B-base-sft
# --wandb-key ${WANDB_KEY}
)
TENSORBOARD_ARGS=(
--use-tensorboard
# commented out because the TENSORBOARD_DIR environment variable is used instead
# --tb-project-name slime-dapo-17k
# --tb-experiment-name qwen3-4b-colocate-baseline
)
MISC_ARGS=(
# default dropout in megatron is 0.1
--attention-dropout 0.0
--hidden-dropout 0.0
# should be good for model performance
--accumulate-allreduce-grads-in-fp32
--attention-softmax-in-fp32
# comment this out when using a model with MLA
--attention-backend flash
)
# launch the master node of ray in container
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
WORLD_SIZE=${WORLD_SIZE:-"1"}
# export no_proxy="127.0.0.1,${MASTER_ADDR}"
# Build the runtime environment JSON with proper variable substitution
RUNTIME_ENV_JSON="{
\"env_vars\": {
\"PYTHONPATH\": \"/root/Megatron-LM/:/cpfs01/xiang.long/slime\",
\"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
\"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\"
}
}"
#\"PYTORCH_ALLOC_CONF\": \"expandable_segments:True\"
ray job submit --address="http://${MASTER_ADDR}:8265" \
--runtime-env-json="${RUNTIME_ENV_JSON}" \
-- python3 train_async.py \
--actor-num-nodes ${WORLD_SIZE} \
--actor-num-gpus-per-node 8 \
${MODEL_ARGS[@]} \
${CKPT_ARGS[@]} \
${SFT_ARGS[@]} \
${OPTIMIZER_ARGS[@]} \
${WANDB_ARGS[@]} \
${PERF_ARGS[@]} \
${EVAL_ARGS[@]} \
${MISC_ARGS[@]}
error log:
(MegatronTrainRayActor pid=97511) [2026-01-06 05:23:46] timer.py:24 - Timer actor_train start
Traceback (most recent call last):
File "/cpfs01/xiang.long/slime/train_async.py", line 77, in <module>
train(args)
File "/cpfs01/xiang.long/slime/train_async.py", line 48, in train
ray.get(actor_model.async_train(rollout_id, rollout_data_curr_ref))
File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2967, in get
values, debugger_breakpoint = worker.get_objects(
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1015, in get_objects
raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AcceleratorError): ray::MegatronTrainRayActor.train() (pid=97802, ip=109.22.105.1, actor_id=cb7efc4f05f4304402d0a8dd02000000, repr=<slime.backends.megatron_utils.actor.MegatronTrainRayActor object at 0x7f1eb0cffc80>)
File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 735, in forward
hidden_states, context = layer(
^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 1044, in __call__
return super().__call__(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/module.py", line 319, in __call__
return super().__call__(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 475, in forward
hidden_states, context = self._forward_attention(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 549, in _forward_attention
attention_output_with_bias = self.self_attention(
^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/attention.py", line 1020, in forward
output, bias = self.linear_proj(core_attn_out)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/extensions/transformer_engine.py", line 431, in forward
out = super().forward(x, is_first_microbatch=_is_first_microbatch)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
return fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/module/linear.py", line 1468, in forward
out = linear_fn(
^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/autograd/function.py", line 581, in apply
return super().apply(*args, **kwargs) # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/module/linear.py", line 324, in forward
gemm_out, *_, reduce_scatter_out = general_gemm(
^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/cpp_extensions/gemm.py", line 206, in general_gemm
out, bias_grad, gelu_input, extra_output = tex.generic_gemm(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: /TransformerEngine/transformer_engine/common/gemm/cublaslt_gemm.cu:750 in function cublas_gemm: cuBLAS Error: the function failed to launch on the GPU
During handling of the above exception, another exception occurred:
ray::MegatronTrainRayActor.train() (pid=97802, ip=109.22.105.1, actor_id=cb7efc4f05f4304402d0a8dd02000000, repr=<slime.backends.megatron_utils.actor.MegatronTrainRayActor object at 0x7f1eb0cffc80>)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/actor.py", line 359, in train
return self.train_actor(rollout_id, rollout_data)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/actor.py", line 448, in train_actor
train(
File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 591, in train
loss_dict, grad_norm = train_one_step(
^^^^^^^^^^^^^^^
File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 417, in train_one_step
losses_reduced = forward_backward_func(
^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 632, in forward_backward_no_pipelining
output_tensor, num_tokens = forward_step(
^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 417, in forward_step
output_tensor, loss_func = forward_step_func(data_iterator, model)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 408, in forward_step
output_tensor = model(**forward_kwargs)
^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/distributed/data_parallel_base.py", line 22, in forward
return self.module(*inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/module.py", line 456, in forward
outputs = self.module(*inputs, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/models/gpt/gpt_model.py", line 481, in forward
hidden_states = self.decoder(
^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 586, in __call__
return super().__call__(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/module.py", line 319, in __call__
return super().__call__(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 699, in forward
with rng_context, outer_quantization_context:
File "/usr/lib/python3.12/contextlib.py", line 158, in __exit__
self.gen.throw(value)
File "/root/Megatron-LM/megatron/core/tensor_parallel/random.py", line 273, in fork
self.states_[name] = _get_cuda_rng_state(graph_safe=self.use_cudagraphable_rng)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/Megatron-LM/megatron/core/tensor_parallel/random.py", line 55, in _get_cuda_rng_state
return torch.cuda.random.get_rng_state(device=device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/random.py", line 43, in get_rng_state
return default_generator.get_state()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
---------------------------------------
Job 'raysubmit_3K3GFVd9WHpJDYiK' failed
---------------------------------------
Status message: Job entrypoint command failed with exit code 1, last available logs (truncated to 20,000 chars):
return torch.cuda.random.get_rng_state(device=device)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/torch/cuda/random.py", line 43, in get_rng_state
return default_generator.get_state()
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.