
torch.AcceleratorError: CUDA error: an illegal memory access was encountered #1297

Description

@SwordFaith

script:

#!/bin/bash

set -ex

# prevent Python/Ray from buffering stdout/stderr
export PYTHONUNBUFFERED=16
# force synchronous kernel launches so errors are reported on the line where they actually occur
export CUDA_LAUNCH_BLOCKING=1

# enable cuBLAS logging to get more detailed low-level error information
export CUBLAS_LOGINFO_DBG=1
export CUBLAS_LOGDEST_DBG=stderr
export NVTE_FUSED_ATTN=0
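
# Note: CUBLAS_LOGINFO_DBG / CUBLAS_LOGDEST_DBG only cover the legacy cuBLAS API, while the
# failure in the log below is raised from cublaslt_gemm.cu, i.e. the cuBLASLt path, which has
# its own logging switch. Assuming a reasonably recent CUDA toolkit, one could additionally try:
# export CUBLASLT_LOG_LEVEL=5   # cuBLASLt logs go to stdout unless CUBLASLT_LOG_FILE is set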

PROJECT_NAME="slime_nemo_post_training_v3_sft"
EXPR_NAME="qwen3-4b-base-sft"
SUBMIT_TIME=$(date +%y%m%d_%H%M%S)
USER_ID=${USER_ID:-"xiang.long"}
export USER_ROOT=/cpfs01/${USER_ID}
export CKPT_DIR=${USER_ROOT}/checkpoints/${PROJECT_NAME}/${EXPR_NAME}
export TENSORBOARD_DIR=${USER_ROOT}/tensorboards/${PROJECT_NAME}/${EXPR_NAME}
export LOG_DIR=${USER_ROOT}/logs/${PROJECT_NAME}/${EXPR_NAME}
mkdir -p $CKPT_DIR
mkdir -p $TENSORBOARD_DIR
mkdir -p $LOG_DIR

NVLINK_COUNT=$(nvidia-smi topo -m 2>/dev/null | grep -o 'NV[0-9][0-9]*' | wc -l)
if [ "$NVLINK_COUNT" -gt 0 ]; then
    HAS_NVLINK=1
else
    HAS_NVLINK=0
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}/")" &>/dev/null && pwd)"
echo $SCRIPT_DIR
BASE_DIR=$(realpath "$SCRIPT_DIR/../../../")
echo $BASE_DIR
source "${BASE_DIR}/scripts/models/qwen3-4B.sh"
#source "${BASE_DIR}/scripts/models/qwen3-4B-Instruct-2507.sh"


CKPT_ARGS=(
   --hf-checkpoint /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
   --ref-load /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
   --load /cpfs01/models/Qwen/Qwen3-4B-Base-replace-instruct-tokenizer
   #--load $CKPT_DIR
   --save $CKPT_DIR
   --megatron-to-hf-mode bridge
   --save-interval 1000
   --loss-mask-type qwen3_fix
)

SFT_ARGS=(
   --rollout-function-path slime.rollout.sft_rollout.generate_rollout
   #--prompt-data /cpfs01/haoqingwang/nvidia_datasets/SFT-Merge/post_train_sft_token_blend.jsonl
   --prompt-data /cpfs01/datasets/haoqing_nv_sft_merge_post_train_sft_token_blend_debug_10k.jsonl
   --input-key messages
   --tool-key tools
   # --apply-chat-template
   --rollout-shuffle
   --num-epoch 2
   --rollout-batch-size 512
   --global-batch-size 512

   --loss-type sft_loss
   --calculate-per-token-loss
   --disable-compute-advantages-and-returns
   --debug-train-only
)

PERF_ARGS=(
   --tensor-model-parallel-size 2
   --sequence-parallel
   --pipeline-model-parallel-size 1
   --context-parallel-size 4
   --expert-model-parallel-size 1
   --expert-tensor-parallel-size 1

   --recompute-granularity full
   --recompute-method uniform
   --recompute-num-layers 1

   # --micro-batch-size 1
   --use-dynamic-batch-size
   --max-tokens-per-gpu 10368
)
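
# Parallel layout sanity check: TP (2) x CP (4) x PP (1) = 8 GPUs per model replica, which
# matches --actor-num-gpus-per-node 8, so the data-parallel size equals the number of nodes
# (WORLD_SIZE). Context parallelism also places divisibility constraints on the packed
# sequence length (commonly a multiple of 2*CP with the TE attention backends), which may be
# worth double-checking for the batches produced by --use-dynamic-batch-size.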

OPTIMIZER_ARGS=(
   --optimizer adam
   --lr 1e-5
   --lr-decay-style cosine
   --min-lr 1e-6
   --lr-warmup-fraction 0.1
   --weight-decay 0.1
   --adam-beta1 0.9
   --adam-beta2 0.95
)

WANDB_ARGS=(
   # --use-wandb
   # --wandb-project slime-dev
   # --wandb-group qwen3-4B-base-sft
   # --wandb-key ${WANDB_KEY}
)

TENSORBOARD_ARGS=(
   --use-tensorboard
   # commented out because the TENSORBOARD_DIR environment variable is used instead
   # --tb-project-name slime-dapo-17k
   # --tb-experiment-name qwen3-4b-colocate-baseline
)

MISC_ARGS=(
   # default dropout in megatron is 0.1
   --attention-dropout 0.0
   --hidden-dropout 0.0
   # should be good for model performance
   --accumulate-allreduce-grads-in-fp32
   --attention-softmax-in-fp32
   # comment this out when using a model with MLA
   --attention-backend flash
)

# launch the master node of ray in container
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
WORLD_SIZE=${WORLD_SIZE:-"1"}
# export no_proxy="127.0.0.1,${MASTER_ADDR}"

# Build the runtime environment JSON with proper variable substitution
RUNTIME_ENV_JSON="{
  \"env_vars\": {
    \"PYTHONPATH\": \"/root/Megatron-LM/:/cpfs01/xiang.long/slime\",
    \"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
    \"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\"
  }
}"

#\"PYTORCH_ALLOC_CONF\": \"expandable_segments:True\"
ray job submit --address="http://${MASTER_ADDR}:8265" \
   --runtime-env-json="${RUNTIME_ENV_JSON}" \
   -- python3 train_async.py \
   --actor-num-nodes ${WORLD_SIZE} \
   --actor-num-gpus-per-node 8 \
   ${MODEL_ARGS[@]} \
   ${CKPT_ARGS[@]} \
   ${SFT_ARGS[@]} \
   ${OPTIMIZER_ARGS[@]} \
   ${WANDB_ARGS[@]} \
   ${PERF_ARGS[@]} \
   ${EVAL_ARGS[@]} \
   ${MISC_ARGS[@]}
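
# To localize the illegal access beyond what CUDA_LAUNCH_BLOCKING=1 reports, one option
# (a sketch, assuming the failure also reproduces on a single node without the Ray job server)
# is to run the same entrypoint under compute-sanitizer:
# compute-sanitizer --tool memcheck python3 train_async.py <same arguments as above>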

error log:

(MegatronTrainRayActor pid=97511) [2026-01-06 05:23:46] timer.py:24 - Timer actor_train start
Traceback (most recent call last):
  File "/cpfs01/xiang.long/slime/train_async.py", line 77, in <module>
    train(args)
  File "/cpfs01/xiang.long/slime/train_async.py", line 48, in train
    ray.get(actor_model.async_train(rollout_id, rollout_data_curr_ref))
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2967, in get
    values, debugger_breakpoint = worker.get_objects(
                                  ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 1015, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(AcceleratorError): ray::MegatronTrainRayActor.train() (pid=97802, ip=109.22.105.1, actor_id=cb7efc4f05f4304402d0a8dd02000000, repr=<slime.backends.megatron_utils.actor.MegatronTrainRayActor object at 0x7f1eb0cffc80>)
  File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 735, in forward
    hidden_states, context = layer(
                             ^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 1044, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/module.py", line 319, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 475, in forward
    hidden_states, context = self._forward_attention(*args, **kwargs)
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_layer.py", line 549, in _forward_attention
    attention_output_with_bias = self.self_attention(
                                 ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/attention.py", line 1020, in forward
    output, bias = self.linear_proj(core_attn_out)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/extensions/transformer_engine.py", line 431, in forward
    out = super().forward(x, is_first_microbatch=_is_first_microbatch)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/_dynamo/eval_frame.py", line 1044, in _fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/module/linear.py", line 1468, in forward
    out = linear_fn(
          ^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/autograd/function.py", line 581, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/module/linear.py", line 324, in forward
    gemm_out, *_, reduce_scatter_out = general_gemm(
                                       ^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/transformer_engine/pytorch/cpp_extensions/gemm.py", line 206, in general_gemm
    out, bias_grad, gelu_input, extra_output = tex.generic_gemm(*args, **kwargs)
                                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
RuntimeError: /TransformerEngine/transformer_engine/common/gemm/cublaslt_gemm.cu:750 in function cublas_gemm: cuBLAS Error: the function failed to launch on the GPU

During handling of the above exception, another exception occurred:

ray::MegatronTrainRayActor.train() (pid=97802, ip=109.22.105.1, actor_id=cb7efc4f05f4304402d0a8dd02000000, repr=<slime.backends.megatron_utils.actor.MegatronTrainRayActor object at 0x7f1eb0cffc80>)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/actor.py", line 359, in train
    return self.train_actor(rollout_id, rollout_data)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/actor.py", line 448, in train_actor
    train(
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 591, in train
    loss_dict, grad_norm = train_one_step(
                           ^^^^^^^^^^^^^^^
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 417, in train_one_step
    losses_reduced = forward_backward_func(
                     ^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 632, in forward_backward_no_pipelining
    output_tensor, num_tokens = forward_step(
                                ^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/pipeline_parallel/schedules.py", line 417, in forward_step
    output_tensor, loss_func = forward_step_func(data_iterator, model)
                               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/cpfs01/xiang.long/slime/slime/backends/megatron_utils/model.py", line 408, in forward_step
    output_tensor = model(**forward_kwargs)
                    ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/distributed/data_parallel_base.py", line 22, in forward
    return self.module(*inputs, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/module.py", line 456, in forward
    outputs = self.module(*inputs, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/models/gpt/gpt_model.py", line 481, in forward
    hidden_states = self.decoder(
                    ^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 586, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/module.py", line 319, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/transformer/transformer_block.py", line 699, in forward
    with rng_context, outer_quantization_context:
  File "/usr/lib/python3.12/contextlib.py", line 158, in __exit__
    self.gen.throw(value)
  File "/root/Megatron-LM/megatron/core/tensor_parallel/random.py", line 273, in fork
    self.states_[name] = _get_cuda_rng_state(graph_safe=self.use_cudagraphable_rng)
                         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/Megatron-LM/megatron/core/tensor_parallel/random.py", line 55, in _get_cuda_rng_state
    return torch.cuda.random.get_rng_state(device=device)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/cuda/random.py", line 43, in get_rng_state
    return default_generator.get_state()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

---------------------------------------
Job 'raysubmit_3K3GFVd9WHpJDYiK' failed
---------------------------------------

Status message: Job entrypoint command failed with exit code 1, last available logs (truncated to 20,000 chars):
    return torch.cuda.random.get_rng_state(device=device)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/torch/cuda/random.py", line 43, in get_rng_state
    return default_generator.get_state()
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
torch.AcceleratorError: CUDA error: an illegal memory access was encountered
Search for `cudaErrorIllegalAddress' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
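
Note: the primary failure above is the cuBLASLt GEMM launch inside the attention output projection (attention.py -> self.linear_proj -> transformer_engine ... tex.generic_gemm); the torch.AcceleratorError is a follow-on error raised while the tensor-parallel RNG fork reads the CUDA RNG state after the device is already in the illegal-address state. One additional low-cost signal, assuming it is not already set in the container, is:

export TORCH_SHOW_CPP_STACKTRACES=1

which makes PyTorch append the C++ frames to the Python traceback.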
