
torch.AcceleratorError: CUDA error: an illegal memory access was encountered #1297

Description

@SwordFaith

script:

#!/bin/bash

set -ex

# prevent Python/Ray from buffering stdout/stderr
export PYTHONUNBUFFERED=16
# force synchronous kernel launches so errors are reported on the line where they actually occur
export CUDA_LAUNCH_BLOCKING=1

# enable cuBLAS logging to get more detailed low-level error information
export CUBLAS_LOGINFO_DBG=1
export CUBLAS_LOGDEST_DBG=stderr
export NVTE_FUSED_ATTN=0
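
# Note: CUBLAS_LOGINFO_DBG / CUBLAS_LOGDEST_DBG only cover the legacy cuBLAS API, while the
# failure in the log below is raised from cublaslt_gemm.cu, i.e. the cuBLASLt path, which has
# its own logging switch. Assuming a reasonably recent CUDA toolkit, one could additionally try:
# export CUBLASLT_LOG_LEVEL=5   # cuBLASLt logs go to stdout unless CUBLASLT_LOG_FILE is set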

PROJECT_NAME="slime_nemo_post_training_v3_sft"
EXPR_NAME="qwen3-4b-base-sft"
SUBMIT_TIME=$(date +%y%m%d_%H%M%S)
USER_ID=${USER_ID:-"xiang.long"}
export USER_ROOT=/cpfs01/${USER_ID}
export CKPT_DIR=${USER_ROOT}/checkpoints/${PROJECT_NAME}/${EXPR_NAME}
export TENSORBOARD_DIR=${USER_ROOT}/tensorboards/${PROJECT_NAME}/${EXPR_NAME}
export LOG_DIR=${USER_ROOT}/logs/${PROJECT_NAME}/${EXPR_NAME}
mkdir -p $CKPT_DIR
mkdir -p $TENSORBOARD_DIR
mkdir -p $LOG_DIR

NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
    HAS_NVLINK=1
else
    HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}/")" &>/dev/null && pwd)"
echo $SCRIPT_DIR
BASE_DIR=$(realpath "$SCRIPT_DIR/../../../")
echo $BASE_DIR
source "${BASE_DIR}/scripts/models/qwen3-4B.sh"
#source "${BASE_DIR}/scripts/models/qwen3-4B-Instruct-2507.sh"


CKPT_ARGS=(
   --hf-checkpoint /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
   --ref-load /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
   --load /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
   #--load $CKPT_DIR
   --save $CKPT_DIR
   --megatron-to-hf-mode bridge
   --save-interval 1000
   --loss-mask-type qwen3_fix
)

SFT_ARGS=(
   --rollout-function-path slime.rollout.sft_rollout.generate_rollout
   #--prompt-data /cpfs01/haoqingwang/nvidia_datasets/SFT-Merge/post_train_sft_token_blend.jsonl
   --prompt-data /cpfs01/datasets/haoqing_nv_sft_merge_post_train_sft_token_blend_debug_10k.jsonl
   --input-key messages
   --tool-key tools
   # --apply-chat-template
   --rollout-shuffle
   --num-epoch 2
   --rollout-batch-size 512
   --global-batch-size 512

   --loss-type sft_loss
   --calculate-per-token-loss
   --disable-compute-advantages-and-returns
   --debug-train-only
)

PERF_ARGS=(
   --tensor-model-parallel-size 2
   --sequence-parallel
   --pipeline-model-parallel-size 1
   --context-parallel-size 4
   --expert-model-parallel-size 1
   --expert-tensor-parallel-size 1

   --recompute-granularity full
   --recompute-method uniform
   --recompute-num-layers 1

   # --micro-batch-size 1
   --use-dynamic-batch-size
   --max-tokens-per-gpu 10368
)
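
# Parallel layout sanity check: TP (2) x CP (4) x PP (1) = 8 GPUs per model replica, which
# matches --actor-num-gpus-per-node 8, so the data-parallel size equals the number of nodes
# (WORLD_SIZE). Context parallelism also places divisibility constraints on the packed
# sequence length (commonly a multiple of 2*CP with the TE attention backends), which may be
# worth double-checking for the batches produced by --use-dynamic-batch-size.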

OPTIMIZER_ARGS=(
   --optimizer adam
   --lr 1e-5
   --lr-decay-style cosine
   --min-lr 1e-6
   --lr-warmup-fraction 0.1
   --weight-decay 0.1
   --adam-beta1 0.9
   --adam-beta2 0.95
)

WANDB_ARGS=(
   # --use-wandb
   # --wandb-project slime-dev
   # --wandb-group qwen3-4B-base-sft
   # --wandb-key ${WANDB_KEY}
)

TENSORBOARD_ARGS=(
   --use-tensorboard
   # commented out because the TENSORBOARD_DIR environment variable is used instead
   # --tb-project-name slime-dapo-17k
   # --tb-experiment-name qwen3-4b-colocate-baseline
)

MISC_ARGS=(
   # default dropout in megatron is 0.1
   --attention-dropout 0.0
   --hidden-dropout 0.0
   # should be good for model performance
   --accumulate-allreduce-grads-in-fp32
   --attention-softmax-in-fp32
   # comment this out when using a model with MLA
   --attention-backend flash
)

# launch the master node of ray in container
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
WORLD_SIZE=${WORLD_SIZE:-"1"}
# export no_proxy="127.0.0.1,${MASTER_ADDR}"

# Build the runtime environment JSON with proper variable substitution
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/:/cpfs01/xiang.long/slime\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\"
  }
}"

#\"PYTORCH_ALLOC_CONF\": \"expandable_segments:True\"
ray job submit --address="http://${MASTER_ADDR}:8265" \
   --runtime-env-json="${RUNTIME_ENV_JSON}" \
   -- python3 train_async.py \
   --actor-num-nodes ${WORLD_SIZE} \
   --actor-num-gpus-per-node 8 \
   ${MODEL_ARGS[@]} \
   ${CKPT_ARGS[@]} \
   ${SFT_ARGS[@]} \
   ${OPTIMIZER_ARGS[@]} \
   ${WANDB_ARGS[@]} \
   ${PERF_ARGS[@]} \
   ${EVAL_ARGS[@]} \
   ${MISC_ARGS[@]}
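
# To localize the illegal access beyond what CUDA_LAUNCH_BLOCKING=1 reports, one option
# (a sketch, assuming the failure also reproduces on a single node without the Ray job server)
# is to run the same entrypoint under compute-sanitizer:
# compute-sanitizer --tool memcheck python3 train_async.py <same arguments as above>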

error log:

(MegatronTrainRayActor pid=97511) [2026-01-06 05:23:46] timer.py:24 - Timer actor_train start
Traceback (most recent call last):
  File "/cpfs01/xiang.long/slime/train_async.py", line 77, in <module>
    train(args)
  File "/cpfs01/xiang.long/slime/train_async.py", line 48, in train
    ray.get(actor_model.async_train(rollout_id, rollout_data_curr_ref))
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2967, in get
    values, debugger_breakpoint = worker.get_objects(
                                  ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1015, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AcceleratorError): ray::MegatronTrainRayActor.train() (pid=97802, ip=109.22.105.1, actor_id=cb7efc4f05f4304402d0a8dd02000000, repr=<slime.backends.megatron_utils.actor.MegatronTrainRayActor object at 0x7f1eb0cffc80>)
  File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 735, in forward
    hidden_states, context = layer(
                             ^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 1044, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/module.py", line 319, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 475, in forward
    hidden_states, context = self._forward_attention(*args, **kwargs)
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 549, in _forward_attention
    attention_output_with_bias = self.self_attention(
                                 ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/attention.py", line 1020, in forward
    output, bias = self.linear_proj(core_attn_out)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/extensions/transformer_engine.py", line 431, in forward
    out = super().forward(x, is_first_microbatch=_is_first_microbatch)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/module/linear.py", line 1468, in forward
    out = linear_fn(
          ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/autograd/function.py", line 581, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/module/linear.py", line 324, in forward
    gemm_out, *_, reduce_scatter_out = general_gemm(
                                       ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/cpp_extensions/gemm.py", line 206, in general_gemm
    out, bias_grad, gelu_input, extra_output = tex.generic_gemm(*args, **kwargs)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: /TransformerEngine/transformer_engine/common/gemm/cublaslt_gemm.cu:750 in function cublas_gemm: cuBLAS Error: the function failed to launch on the GPU

During handling of the above exception, another exception occurred:

ray::MegatronTrainRayActor.train() (pid=97802, ip=109.22.105.1, actor_id=cb7efc4f05f4304402d0a8dd02000000, repr=<slime.backends.megatron_utils.actor.MegatronTrainRayActor object at 0x7f1eb0cffc80>)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/actor.py", line 359, in train
    return self.train_actor(rollout_id, rollout_data)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/actor.py", line 448, in train_actor
    train(
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 591, in train
    loss_dict, grad_norm = train_one_step(
                           ^^^^^^^^^^^^^^^
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 417, in train_one_step
    losses_reduced = forward_backward_func(
                     ^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 632, in forward_backward_no_pipelining
    output_tensor, num_tokens = forward_step(
                                ^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 417, in forward_step
    output_tensor, loss_func = forward_step_func(data_iterator, model)
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 408, in forward_step
    output_tensor = model(**forward_kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/distributed/data_parallel_base.py", line 22, in forward
    return self.module(*inputs, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/module.py", line 456, in forward
    outputs = self.module(*inputs, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/models/gpt/gpt_model.py", line 481, in forward
    hidden_states = self.decoder(
                    ^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 586, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/module.py", line 319, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 699, in forward
    with rng_context, outer_quantization_context:
  File "/usr/lib/python3.12/contextlib.py", line 158, in __exit__
    self.gen.throw(value)
  File "/root/Megatron-LM/megatron/core/tensor_parallel/random.py", line 273, in fork
    self.states_[name] = _get_cuda_rng_state(graph_safe=self.use_cudagraphable_rng)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/tensor_parallel/random.py", line 55, in _get_cuda_rng_state
    return torch.cuda.random.get_rng_state(device=device)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/cuda/random.py", line 43, in get_rng_state
    return default_generator.get_state()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

---------------------------------------
Job 'raysubmit_3K3GFVd9WHpJDYiK' failed
---------------------------------------

Status message: Job entrypoint command failed with exit code 1, last available logs (truncated to 20,000 chars):
    return torch.cuda.random.get_rng_state(device=device)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/cuda/random.py", line 43, in get_rng_state
    return default_generator.get_state()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
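
Note: the primary failure above is the cuBLASLt GEMM launch inside the attention output projection (attention.py -> self.linear_proj -> transformer_engine ... tex.generic_gemm); the torch.AcceleratorError is a follow-on error raised while the tensor-parallel RNG fork reads the CUDA RNG state after the device is already in the illegal-address state. One additional low-cost signal, assuming it is not already set in the container, is:

export TORCH_SHOW_CPP_STACKTRACES=1

which makes PyTorch append the C++ frames to the Python traceback.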
