21 commits
822d7ea  Changed VERSION to 2.8.0 (ptrendx, Sep 19, 2025)
9797673  [JAX] Remove import jax.extend.ffi (#2193) (phu0ngng, Sep 22, 2025)
ac51322  [PyTorch] Add sink attention support from cuDNN (#2148) (cyanguwa, Sep 22, 2025)
307a993  [QA] Add pytest xml report for all tests in qa folder that use pytest… (shengfangd, Sep 23, 2025)
c70f1d9  [JAX] Local-Amax for Current-Scaling (#2183) (mingxu1067, Sep 23, 2025)
3f02a2e  [JAX] Restore Shardy Rule with CompoundFactor (#2167) (phu0ngng, Sep 23, 2025)
ee58762  [JAX] Update JAX version requirement in pyproject.toml (#2197) (phu0ngng, Sep 24, 2025)
bd8e566  [PyTorch] Unpin version of onnxscript and onnxruntime (#2202) (pggPL, Sep 26, 2025)
238a3fd  [JAX] Fix XML filename in the L0_jax_unittest (#2205) (phu0ngng, Sep 27, 2025)
de13b8c  [JAX] CollectiveGemm (#2166) (phu0ngng, Sep 27, 2025)
ef38de4  [JAX] Add xml export for `test_multiprocessing_encoder` and `test_cge… (phu0ngng, Sep 29, 2025)
c464c85  [JAX] Address tolerance check for current scaling dact dbias (#2211) (jberchtold-nvidia, Sep 29, 2025)
8a7b893  [Core][PyTorch] NVFP4 recipe (#2177) (ksivaman, Sep 29, 2025)
51d046b  Fix the segfault in the nvfp4 quantization (#2214) (ptrendx, Sep 30, 2025)
5a58f50  [PyTorch] Add FP8 attention with current scaling (#2012) (cyanguwa, Sep 30, 2025)
88d541c  [JAX] Load modules during initialize for Norm and Act primitives (#2219) (jberchtold-nvidia, Sep 30, 2025)
2b2f921  Fix the cuBLAS workspace alignment (#2223) (ptrendx, Oct 1, 2025)
071589e  [PyTorch] Set usages for linear op quantizers before forward (#2222) (timmoon10, Oct 2, 2025)
5339e97  Resolved conflicts (VeeraRajasekhar, Jan 22, 2026)
4e9f03d  Addressed cpp and torch tests (VeeraRajasekhar, Jan 23, 2026)
c01ef70  Fixed test_attention.py (VeeraRajasekhar, Jan 23, 2026)
8 changes: 4 additions & 4 deletions .github/workflows/build.yml
@@ -19,7 +19,7 @@ jobs:
       run: |
         apt-get update
         apt-get install -y git python3.9 pip cudnn9-cuda-12
-        pip install cmake==3.21.0 pybind11[global] ninja
+        pip install cmake==3.21.0 pybind11[global] ninja nvidia-mathdx==25.1.1
     - name: 'Checkout'
       uses: actions/checkout@v3
       with:
@@ -43,7 +43,7 @@ jobs:
       run: |
         apt-get update
         apt-get install -y git python3.9 pip cudnn9-cuda-12
-        pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript
+        pip install cmake torch ninja pydantic importlib-metadata>=1.0 packaging pybind11 numpy einops onnxscript nvidia-mathdx==25.1.1
     - name: 'Checkout'
       uses: actions/checkout@v3
       with:
@@ -63,7 +63,7 @@ jobs:
     options: --user root
     steps:
     - name: 'Dependencies'
-      run: pip install pybind11[global]
+      run: pip install pybind11[global] nvidia-mathdx==25.1.1
     - name: 'Checkout'
       uses: actions/checkout@v3
       with:
@@ -83,7 +83,7 @@ jobs:
     options: --user root
     steps:
     - name: 'Dependencies'
-      run: pip install torch pybind11[global] einops onnxscript
+      run: pip install torch pybind11[global] einops onnxscript nvidia-mathdx==25.1.1
     - name: 'Checkout'
       uses: actions/checkout@v3
       with:
2 changes: 0 additions & 2 deletions benchmarks/attention/benchmark_attention_rocm.py
@@ -307,7 +307,6 @@ def sanity_checks(
         cfg,
         qkv_dtype=dtype,
         qkv_layout=qkv_layout,
-        window_size=cfg.window_size,
         pad_between_seqs=pad_between_seqs,
     )
     flash_ok, fused_ok, _ = avail
@@ -368,7 +367,6 @@ def main(args):
         config,
         qkv_dtype=dtype,
         qkv_layout=qkv_layout,
-        window_size=config.window_size,
         pad_between_seqs=pad_between_seqs,
     )
     flash_attn_supported, fused_attn_supported, unfused_attn_supported = available_backends
152 changes: 152 additions & 0 deletions benchmarks/benchmark_rht_cast.py
@@ -0,0 +1,152 @@
# Copyright (c) 2022-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# See LICENSE for license information.

import argparse
import torch
import pandas as pd
import torch.utils.benchmark as benchmark

import transformer_engine.pytorch as te
import transformer_engine_torch as tex
import transformer_engine.pytorch.cpp_extensions as ext

from transformer_engine.pytorch.tensor.nvfp4_tensor import NVFP4Quantizer

scale_padding_to = 1
permute_scale = False

TORCH_TO_TE_FLOAT_MAP = {
    torch.bfloat16: tex.DType.kBFloat16,
}


def run_kernel(shape, stochastic_rounding: bool, input_dtype=torch.bfloat16):
    # Generate random input data
    M, K = shape
    x = torch.randn([M, K], dtype=input_dtype, device="cuda")

    assert shape[0] % 16 == 0, "Shape must be divisible by 16"
    assert shape[1] % 16 == 0, "Shape must be divisible by 16"

    # Quantize
    nvfp4_quantizer = NVFP4Quantizer(
        fp4_dtype=tex.DType.kFloat4E2M1,
        rowwise=True,
        columnwise=True,
        with_amax_reduction=False,
        amax_reduction_group=None,
        with_rht=True,
        with_post_rht_amax=True,
        with_random_sign_mask=True,
        stochastic_rounding=stochastic_rounding,
    )
    x_nvfp4_sut = nvfp4_quantizer.make_empty(
        (M, K), dtype=x.dtype, device=x.device, requires_grad=False
    )
    x_nvfp4_sut = nvfp4_quantizer.update_quantized(x, x_nvfp4_sut)

    with torch.no_grad():
        stmt = "kernel_func(input, output)"
        globals_dict = {
            "kernel_func": nvfp4_quantizer.update_quantized,
            "input": x,
            "output": x_nvfp4_sut,
        }

        timing = benchmark.Timer(
            stmt=stmt,
            globals=globals_dict,
            num_threads=1,
        ).blocked_autorange(min_run_time=5)
    print(timing)
    timing_us = timing.median * 1e6

    input_nbytes = shape[0] * shape[1] * 2  # bf16
    output_nbytes = shape[0] * shape[1] // 2  # //2 for fp4
    sf_nbytes = shape[0] * shape[1] // 16  # //16 for 1 byte per 16 elems

    total_nbytes = (
        0
        + input_nbytes
        * 3  # Reading input for Amax(x) & Amax(RHT(x.T)), reading input for Cast(x), reading input for Cast(RHT(x.T))
        + 2 * 4  # Output 2 * float for scale & amax
        + 2 * 4  # Input 2 * float
        + output_nbytes * 2  # Output from Cast(x) and Cast(RHT(x.T))
        + sf_nbytes * 2  # Scale factor
    )

    throughput_GBps = total_nbytes / (1024 * 1024 * 1024) / (timing_us / 1e6)

    print(
        f"Stochastic rounding: {stochastic_rounding}, Total: {total_nbytes} bytes, Throughput:"
        f" {throughput_GBps} GB/s"
    )
    return timing_us, throughput_GBps


# Nsight Compute Profiling Command:
# ncu -f -o block_scaled_1d_cast_transpose_kernel --set=full --kernel-name "block_scaled_1d_cast_transpose_kernel" -s 5 -c 5 python benchmark_rht_cast.py --profile

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--profile", action="store_true", help="Enable profiling mode")
    args = parser.parse_args()

    if args.profile:
        print("Profiling is enabled.")
    else:
        print("Profiling is disabled.")

    shapes = [
        (8192, 5120),
        (8192, 10240),
        (8192, 2560),
        (8192, 11328),
        (8192, 512),
        (8192, 3584),
        (5120, 8192),
        (10240, 8192),
        (2560, 8192),
        (11328, 8192),
        (512, 8192),
        (3584, 8192),
        (4096, 16384),
        (14336, 16384),
    ]

    if args.profile:
        shapes = [
            (16384, 6144),
        ]

    data = []
    for stochastic_rounding in [True]:  # , False]:
        for shape in shapes:
            print(
                f"Running benchmark_func with shape {shape} and stochastic_rounding"
                f" {stochastic_rounding}"
            )
            timing_us, throughput_GBps = run_kernel(shape, stochastic_rounding)
            data.append(
                [
                    "benchmark_func",
                    shape,
                    stochastic_rounding,
                    timing_us,
                    throughput_GBps,
                ]
            )

    df = pd.DataFrame(
        data=data,
        columns=[
            "kernel",
            "shape",
            "stochastic_rounding",
            "timing_us",
            "throughput(GB/s)",
        ],
    )
    print(df)
    df.to_csv("benchmark_cast_nvfp4.csv", index=False)
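Note that the GB/s figure this script reports comes from the fixed byte-traffic model above, not from hardware counters: three full reads of the bf16 input (amax passes plus the two casts), two packed FP4 outputs (row-wise and column-wise), and two scale-factor tensors. A standalone restatement of that model, handy for sanity-checking the reported numbers offline (the function name modeled_bytes is ours, not part of the benchmark):

# Sketch of the traffic model used in run_kernel() above; assumes bf16 input
# (2 B/elem), packed FP4 output (0.5 B/elem), one scale byte per 16 elements.
def modeled_bytes(M: int, K: int) -> int:
    input_nbytes = M * K * 2      # bf16 input
    output_nbytes = M * K // 2    # packed FP4 output
    sf_nbytes = M * K // 16       # one scale byte per 16 elements
    return (
        input_nbytes * 3          # amax pass, Cast(x), Cast(RHT(x.T)) reads
        + 2 * 4 + 2 * 4           # scalar scale/amax floats in and out
        + output_nbytes * 2       # row-wise and column-wise FP4 outputs
        + sf_nbytes * 2           # both scale-factor tensors
    )

# Example: the (8192, 5120) shape moves ~298.8 MB per quantization call.
assert modeled_bytes(8192, 5120) == 298_844_176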
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt
@@ -1 +1 @@
-2.8.0.dev0
+2.8.0
7 changes: 7 additions & 0 deletions build_tools/jax.py
@@ -100,11 +100,18 @@ def setup_jax_extension(
     # Define TE/JAX as a Pybind11Extension
     from pybind11.setup_helpers import Pybind11Extension

+    # Note: Collective GEMM operations are not supported on ROCm yet
+    if rocm_build():
+        comm_libraries = []
+    else:
+        comm_libraries = ["nccl"]
+
     return Pybind11Extension(
         "transformer_engine_jax",
         sources=[str(path) for path in sources],
         include_dirs=[str(path) for path in include_dirs],
         extra_compile_args=cxx_flags,
+        libraries=comm_libraries,
     )
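For context: setuptools forwards the `libraries` argument to the linker, so the CUDA branch effectively adds -lnccl to the link line while the ROCm branch links nothing extra. A minimal sketch of what the two branches produce ("stub.cpp" is a placeholder source, not a file in the repo):

# Sketch only: Pybind11Extension subclasses setuptools.Extension, and the
# `libraries` list becomes -l<name> flags at link time.
from pybind11.setup_helpers import Pybind11Extension

cuda_ext = Pybind11Extension("transformer_engine_jax", sources=["stub.cpp"], libraries=["nccl"])
rocm_ext = Pybind11Extension("transformer_engine_jax", sources=["stub.cpp"], libraries=[])
assert cuda_ext.libraries == ["nccl"] and rocm_ext.libraries == []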
2 changes: 1 addition & 1 deletion build_tools/pytorch.py
@@ -27,7 +27,7 @@

 def install_requirements() -> List[str]:
     """Install dependencies for TE/PyTorch extensions."""
-    return ["torch>=2.1", "einops", "onnxscript==0.3.1", "onnx"]
+    return ["torch>=2.1", "einops", "onnxscript", "onnx"]


 def test_requirements() -> List[str]:
15 changes: 9 additions & 6 deletions build_tools/utils.py
@@ -305,15 +305,18 @@ def get_cuda_include_dirs() -> Tuple[str, str]:

 @functools.lru_cache(maxsize=None)
 def cuda_archs() -> str:
-    version = cuda_version()
-    if os.getenv("NVTE_CUDA_ARCHS") is None:
+    archs = os.getenv("NVTE_CUDA_ARCHS")
+    if archs is None:
+        version = cuda_version()
         if version >= (13, 0):
-            os.environ["NVTE_CUDA_ARCHS"] = "75;80;89;90;100;120"
+            archs = "75;80;89;90;100;100a;103a;120"
+        elif version >= (12, 9):
+            archs = "70;80;89;90;100;100a;103a;120"
         elif version >= (12, 8):
-            os.environ["NVTE_CUDA_ARCHS"] = "70;80;89;90;100;120"
+            archs = "70;80;89;90;100;100a;120"
         else:
-            os.environ["NVTE_CUDA_ARCHS"] = "70;80;89;90"
-    return os.getenv("NVTE_CUDA_ARCHS")
+            archs = "70;80;89;90"
+    return archs


 def cuda_version() -> Tuple[int, ...]:
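The reworked cuda_archs() treats NVTE_CUDA_ARCHS purely as a user override and no longer writes the computed default back into os.environ. A quick standalone check of the selection logic (pick_archs is our name for this sketch, not the repo's):

# Sketch of the selection logic above: an explicit NVTE_CUDA_ARCHS wins,
# otherwise the arch list is keyed off the CUDA toolkit version tuple.
import os

def pick_archs(version: tuple, env=os.environ) -> str:
    archs = env.get("NVTE_CUDA_ARCHS")
    if archs is None:
        if version >= (13, 0):
            archs = "75;80;89;90;100;100a;103a;120"
        elif version >= (12, 9):
            archs = "70;80;89;90;100;100a;103a;120"
        elif version >= (12, 8):
            archs = "70;80;89;90;100;100a;120"
        else:
            archs = "70;80;89;90"
    return archs

assert pick_archs((12, 8), env={}) == "70;80;89;90;100;100a;120"
assert pick_archs((11, 8), env={"NVTE_CUDA_ARCHS": "90"}) == "90"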