Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
31006e4
update code
miles-code-angel Dec 12, 2025
22f2527
[Hardware] AMD - MI350/MI355 dockerfile (#306)
yushengsu-thu Dec 12, 2025
61a319a
update code
miles-code-angel Dec 13, 2025
5c365db
Super tiny update link (#312)
fzyzcjy Dec 14, 2025
5202164
Tiny update doc about multi node training (#313)
fzyzcjy Dec 14, 2025
bdc41f0
add explicit argument name for new megatron compatibility (#324)
yueming-yuan Dec 19, 2025
996e31d
update outdated commands in docs (#339)
zijiexia Dec 23, 2025
7ca7026
tiny fix (#337)
Zhuohao-Li Dec 23, 2025
8626480
Fix example rollout_temperature and top_k (#338)
gongyisheng Dec 24, 2025
b3dc57d
Supported `qkv_format=bshd` with CP (#341)
yueming-yuan Dec 24, 2025
b777306
Add LoRA for FSDP backend. (#307) (#326)
GuanxingLu Dec 25, 2025
54d5717
Revert "Add LoRA for FSDP backend. (#307)" [will merge on later] (#351)
yushengsu-thu Dec 25, 2025
81a0711
[Fet] Lora FSDP RL training - #326 and add CI/CD tests (#351) (#352)
GuanxingLu Dec 28, 2025
d5e140d
Fix lora_rank attribute check in arguments.py (#363)
rucnyz Dec 29, 2025
3da31ef
Revert "Fix lora_rank attribute check in arguments.py" (#369)
yushengsu-thu Dec 29, 2025
ef57bde
Revert "[Feat] Lora FSDP RL training - #326 and add CI/CD tests (#351…
yushengsu-thu Dec 29, 2025
9a3b297
feat: Implement lazy data loading for Dataset (#246)
Ratish1 Dec 31, 2025
b0d3341
Revert "feat: Implement lazy data loading for Dataset" (#372)
zhaochenyang20 Dec 31, 2025
8ba715e
[MISC] add codeowners (#373)
Ying1123 Dec 31, 2025
bc61a7d
[Misc] update codeowners (#374)
Ying1123 Dec 31, 2025
66c14c4
Added uv build scripts (#2)
maharajamihir Dec 23, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .github/CODEOWNERS
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
.github/CODEOWNERS @fzyzcjy @Ying1123
.github/workflows/ @yushengsu-thu
/miles/ @fzyzcjy @yueming-yuan
17 changes: 10 additions & 7 deletions build_conda.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,13 @@ micromamba install -n miles cuda cuda-nvtx cuda-nvtx-dev nccl -c nvidia/label/cu
micromamba install -n miles -c conda-forge cudnn -y

# prevent installing cuda 13.0 for sglang
pip install cuda-python==12.9.1
pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu129
pip install cuda-python==13.1.0
pip install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu129

# install sglang
git clone https://github.com/sgl-project/sglang.git
cd sglang
git checkout 303cc957e62384044dfa8e52d7d8af8abe12f0ac
git checkout 5e2cda6158e670e64b926a9985d65826c537ac82
# Install the python packages
pip install -e "python[all]"

Expand All @@ -39,7 +39,7 @@ pip install cmake ninja
MAX_JOBS=64 pip -v install flash-attn==2.7.4.post1 --no-build-isolation

pip install git+https://github.com/ISEEKYAN/mbridge.git@89eb10887887bc74853f89a4de258c0702932a1c --no-deps
pip install --no-build-isolation "transformer_engine[pytorch]==2.8.0"
pip install --no-build-isolation "transformer_engine[pytorch]==2.10.0"
pip install flash-linear-attention==0.4.0
NVCC_APPEND_FLAGS="--threads 4" \
pip -v install --disable-pip-version-check --no-cache-dir \
Expand All @@ -50,7 +50,7 @@ git clone https://github.com/NVIDIA/Megatron-LM.git --recursive && \
cd Megatron-LM && git checkout ${MEGATRON_COMMIT} && \
pip install -e .

pip install git+https://github.com/fzyzcjy/torch_memory_saver.git@9b8b788fdeb9c2ee528183214cef65a99b71e7d5 --no-cache-dir --force-reinstall
pip install git+https://github.com/fzyzcjy/torch_memory_saver.git@dc6876905830430b5054325fa4211ff302169c6b --no-cache-dir --force-reinstall
pip install git+https://github.com/fzyzcjy/Megatron-Bridge.git@dev_rl --no-build-isolation
pip install nvidia-modelopt[torch]>=0.37.0 --no-build-isolation

Expand All @@ -60,6 +60,9 @@ git clone https://github.com/NVIDIA/Megatron-LM.git --recursive && \
cd Megatron-LM/ && git checkout core_v0.14.0 && \
pip install -e .

# https://github.com/pytorch/pytorch/issues/168167
pip install nvidia-cudnn-cu12==9.16.0.29

# install miles and apply patches

# if miles does not exist locally, clone it
Expand All @@ -76,6 +79,6 @@ fi

# apply patch
cd $BASE_DIR/sglang
git apply $MILES_DIR/docker/patch/v0.5.5.post1/sglang.patch
git apply $MILES_DIR/docker/patch/v0.5.6/sglang.patch
cd $BASE_DIR/Megatron-LM
git apply $MILES_DIR/docker/patch/v0.5.5.post1/megatron.patch
git apply $MILES_DIR/docker/patch/v0.5.6/megatron.patch
194 changes: 194 additions & 0 deletions build_uv_berlin.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
#!/bin/bash

# =============================================================================
# Miles Build Script (CUDA 12.8 Slurm Version)
#
# This script uses uv for Python environment management.
# It relies on the Slurm module system for CUDA 12.8 instead of pip packages.
#
# Configuration:
# - CUDA: 12.8 (via module load)
# - PyTorch: 2.8.0 (cu128)
# - Flash Attention 3: Prebuilt for cu128 + torch2.8
# - Flash Attention 2: Prebuilt for cu128 + torch2.8
# =============================================================================

set -e # Exit on error

# Base directory that holds miles and its sibling checkouts (sglang,
# Megatron-LM, ...). Default is the parent directory of the invocation
# directory; export BASE_DIR beforehand to use a different location.
# BUGFIX: the original unconditionally assigned BASE_DIR right before the
# emptiness guard below, making the guard dead code. ${BASE_DIR-...} keeps
# the same default while honoring a caller-provided value, and an explicitly
# empty BASE_DIR now correctly trips the guard.
BASE_DIR="${BASE_DIR-$(pwd)/..}"

if [ -z "$BASE_DIR" ]; then
echo "BASE_DIR is not set. Please set it to proceed with the installation."
exit 1
fi

# =============================================================================
# Load Slurm Module
# =============================================================================
# Bring the cluster's CUDA 12.8 toolchain onto PATH via the module system,
# then sanity-check that the compiler is actually reachable.
echo "Loading CUDA 12.8 module..."
module load CUDA/12.8

# Verify NVCC is in path — abort early if the module did not take effect.
command -v nvcc > /dev/null 2>&1 || {
  echo "CRITICAL ERROR: nvcc not found after loading module."
  echo "Please ensure 'module load CUDA/12.8' works on this cluster."
  exit 1
}

# =============================================================================
# Install uv if not already installed
# =============================================================================
# Bootstrap uv from the official installer only when it is missing, and put
# its default install location on PATH for the rest of this script.
command -v uv > /dev/null 2>&1 || {
  echo "Installing uv..."
  curl -LsSf https://astral.sh/uv/install.sh | sh
  export PATH="$HOME/.local/bin:$PATH"
}

# =============================================================================
# Create Python virtual environment with uv
# =============================================================================
# Create virtual environment with Python 3.12
# NOTE(review): 'uv venv' creates ./.venv in the directory the script is
# invoked from (no cd has happened yet), not under $BASE_DIR — the final
# "activate" instructions printed at the end must point at this location.
uv venv --python 3.12

# Activate the virtual environment
# All subsequent 'uv pip install' calls target this venv; activation also
# sets $VIRTUAL_ENV for the rest of the script.
source ".venv/bin/activate"

cd "$BASE_DIR"
# =============================================================================
# Install PyTorch with CUDA 12.8
# =============================================================================
echo "Installing PyTorch 2.8.0 with CUDA 12.8..."

# Install cuda-python (Pinned to 12.8 to match the module)
uv pip install cuda-python==12.8.0

# Install PyTorch 2.8.0 for CUDA 12.8
uv pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/cu128

# Set TORCH_CUDA_ARCH_LIST for our GPU architectures
# 8.0 = A100 (Ampere), 9.0 = H100 (Hopper)
# Constrains the from-source CUDA builds below (apex, transformer_engine)
# to these compute capabilities, keeping compile times down.
export TORCH_CUDA_ARCH_LIST="8.0;9.0"

# =============================================================================
# Install sglang
# =============================================================================
echo "Installing sglang..."
cd "$BASE_DIR"
# Clone only on the first run so the script can be re-executed safely.
if [ ! -d "$BASE_DIR/sglang" ]; then
git clone https://github.com/sgl-project/sglang.git
fi
cd sglang
# Pin to the exact commit the miles patch (applied at the end of this
# script) was written against.
git checkout 303cc957e62384044dfa8e52d7d8af8abe12f0ac
uv pip install -e "python[all]"

# =============================================================================
# Install build tools
# =============================================================================
# Required by the from-source builds further down (apex, transformer_engine).
uv pip install cmake ninja packaging build wheel

# =============================================================================
# Install Flash Attention 3 (prebuilt wheels for cu128 + torch2.8)
# =============================================================================
echo "Installing Flash Attention 3..."
# Using windreamer's wheel index for CUDA 12.8 and PyTorch 2.8.0
uv pip install flash_attn_3 --find-links https://windreamer.github.io/flash-attention3-wheels/cu128_torch280 --extra-index-url https://download.pytorch.org/whl/cu128

# =============================================================================
# Install Flash Attention 2 (prebuilt wheel for Megatron compatibility)
# =============================================================================
echo "Installing Flash Attention 2..."
# NOTE(review): this wheel is specific to CPython 3.12 / cu128 / torch 2.8 —
# it must stay in lockstep with 'uv venv --python 3.12' and the torch pin above.
uv pip install https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.3.18/flash_attn-2.7.4%2Bcu128torch2.8-cp312-cp312-linux_x86_64.whl

# =============================================================================
# Install mbridge, transformer_engine, flash-linear-attention
# =============================================================================
echo "Installing mbridge, transformer_engine, flash-linear-attention..."
# --no-deps: keep mbridge from dragging in versions that conflict with the
# torch/cuda pins established above.
uv pip install git+https://github.com/ISEEKYAN/mbridge.git@89eb10887887bc74853f89a4de258c0702932a1c --no-deps

# Transformer Engine 2.8.0 (compatible with CUDA 12.8)
# --no-build-isolation so it compiles against the torch installed above
# rather than a temporary build environment's torch.
uv pip install --no-build-isolation "transformer_engine[pytorch]==2.8.0" --no-cache-dir

uv pip install flash-linear-attention==0.4.0

# =============================================================================
# Install NVIDIA Apex (requires CUDA compilation)
# =============================================================================
echo "Installing NVIDIA Apex (Compiling from source)..."
# We utilize the loaded CUDA module for compilation
# APEX_CPP_EXT / APEX_CUDA_EXT turn on the compiled extensions; the commit
# hash pins the build for reproducibility.
NVCC_APPEND_FLAGS="--threads 4" \
APEX_CPP_EXT=1 \
APEX_CUDA_EXT=1 \
APEX_PARALLEL_BUILD=8 \
uv pip install -v --no-cache-dir \
--no-build-isolation \
git+https://github.com/NVIDIA/apex.git@10417aceddd7d5d05d7cbf7b0fc2daad1105f8b4

# =============================================================================
# Install Megatron-LM
# =============================================================================
echo "Installing Megatron-LM..."
cd "$BASE_DIR"
# Clone once; re-runs just re-checkout the pinned release tag.
if [ ! -d "$BASE_DIR/Megatron-LM" ]; then
git clone https://github.com/NVIDIA/Megatron-LM.git --recursive
fi
cd Megatron-LM
git checkout core_v0.14.0
uv pip install -e .

# =============================================================================
# Install additional dependencies
# =============================================================================
echo "Installing additional dependencies..."
uv pip install poetry pybind11
# Pinned fork commits used by miles for GPU memory saving and Megatron bridging.
uv pip install git+https://github.com/fzyzcjy/torch_memory_saver.git@9b8b788fdeb9c2ee528183214cef65a99b71e7d5 --no-cache-dir --force-reinstall
uv pip install git+https://github.com/fzyzcjy/Megatron-Bridge.git@dev_rl --no-build-isolation
uv pip install "nvidia-modelopt[torch]>=0.37.0" --no-build-isolation

# =============================================================================
# Install remaining packages
# =============================================================================
uv pip install sglang_router ring_flash_attn pylatexenc
uv pip install -U "ray[data,train,tune,serve]"

# =============================================================================
# Install miles
# =============================================================================
# Locate (or fetch) the miles checkout, record it in MILES_DIR for the patch
# step below, and install it editable into the venv.
echo "Installing miles..."
if [ ! -d "$BASE_DIR/miles" ]; then
  # First run: no checkout yet, so clone next to the other repos.
  cd "$BASE_DIR"
  git clone https://github.com/radixark/miles.git
  cd miles/
  export MILES_DIR="$BASE_DIR/miles"
  uv pip install -e .
else
  # A checkout already exists. If BASE_DIR itself carries a pyproject.toml,
  # BASE_DIR is the miles repo; otherwise use the nested miles/ directory.
  if [ -f "$BASE_DIR/pyproject.toml" ]; then
    export MILES_DIR="$BASE_DIR"
  else
    export MILES_DIR="$BASE_DIR/miles"
  fi
  cd "$MILES_DIR"
  uv pip install -e .
fi

# =============================================================================
# Apply patches
# =============================================================================
echo "Applying patches..."
cd "$BASE_DIR/sglang"
# The '|| echo' fallback keeps re-runs from aborting under 'set -e' when the
# patch was already applied. NOTE(review): it also masks genuine patch
# failures — check the output if the build later misbehaves.
git apply "$MILES_DIR/docker/patch/v0.5.5.post1/sglang.patch" || echo "sglang patch already applied or failed"

cd "$BASE_DIR/Megatron-LM"
git apply "$MILES_DIR/docker/patch/v0.5.5.post1/megatron.patch" || echo "Megatron patch already applied or failed"

# Final summary for the user: how to re-enter the environment later.
echo ""
echo "============================================================================="
echo "Installation complete!"
echo ""
echo "To activate the environment, run:"
echo " module load CUDA/12.8"
# BUGFIX: the venv is created by 'uv venv' as .venv in the directory the
# script was launched from — there is no $BASE_DIR/miles-venv, so the old
# hard-coded path pointed at a nonexistent location. After the earlier
# 'source .venv/bin/activate', $VIRTUAL_ENV holds the venv's absolute path;
# report that (falling back to the relative .venv if somehow unset).
echo " source ${VIRTUAL_ENV:-.venv}/bin/activate"
echo ""
echo "Environment configured using System CUDA from 'module load CUDA/12.8'"
echo "PyTorch Version: 2.8.0 (cu128)"
echo "CUDA_HOME: $CUDA_HOME"
echo "============================================================================="
Loading