From 52500854c106b8cf1bbb7c6d00e2fe30f11df441 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Fri, 24 Oct 2025 10:39:24 +0800 Subject: [PATCH 01/26] fp4 dense --- fastdeploy/envs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 3b0be3df998..f7bd505ca06 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -126,6 +126,8 @@ "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), + # FP4 dense GEMM backend + "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_GEMM_BACKEND", None), } From b0c863a0dea315e10f512b99db0029a85d942e3b Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Mon, 27 Oct 2025 18:49:31 +0800 Subject: [PATCH 02/26] [WIP] support nvfp4, dense part --- fastdeploy/flashinfer.py | 35 +++ .../layers/quantization/__init__.py | 3 + .../layers/quantization/nvfp4.py | 276 ++++++++++++++++++ 3 files changed, 314 insertions(+) create mode 100644 fastdeploy/flashinfer.py create mode 100644 fastdeploy/model_executor/layers/quantization/nvfp4.py diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py new file mode 100644 index 00000000000..4bc6aa994f2 --- /dev/null +++ b/fastdeploy/flashinfer.py @@ -0,0 +1,35 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import functools +import importlib +import importlib.util +import shutil + + +@functools.cache +def has_flashinfer() -> bool: + """Return `True` if FlashInfer is available.""" + # Use find_spec to check if the module exists without importing it + # This avoids potential CUDA initialization side effects + if importlib.util.find_spec("flashinfer") is None: + # logger.debug_once("FlashInfer unavailable since package was not found") + return False + # Also check if nvcc is available since it's required to JIT compile flashinfer + if shutil.which("nvcc") is None: + # logger.debug_once("FlashInfer unavailable since nvcc was not found") + return False + return True diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index f8716369852..1c9a169a5c4 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -33,6 +33,7 @@ "mix_quant", "tensor_wise_fp8", "kvcache", + "modelopt_fp4", ] @@ -116,6 +117,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .block_wise_fp8 import BlockWiseFP8Config from .kv_cache import KvCacheQuantConfig from .mix_quant import MixQuantConfig + from .nvfp4 import ModelOptNvFp4Config from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -137,6 +139,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: "tensor_wise_fp8": TensorWiseFP8Config, "kvcache": KvCacheQuantConfig, "mix_quant": MixQuantConfig, + "modelopt_fp4": ModelOptNvFp4Config, } return method_to_config[quantization] diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py new file mode 100644 index 00000000000..e686512d50b --- /dev/null +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -0,0 +1,276 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from typing import Optional + +import paddle +from paddleformers.utils.log import logger + +from fastdeploy import envs +from fastdeploy.flashinfer import has_flashinfer +from fastdeploy.model_executor.layers.moe import FusedMoE + +from .quant_base import QuantConfigBase, QuantMethodBase + +if has_flashinfer(): + from flashinfer import fp4_quantize as scaled_fp4_quant # need to use vllm version + from flashinfer import mm_fp4 as fp4_gemm + + +def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: + """ + Pad and block-interleave the FP4 block-scales so that they match the data + layout expected by the CUTLASS / FlashInfer kernels. + + Parameters + ---------- + scale: paddle.Tensor + + Returns + ------- + torch.Tensor + The swizzled tensor with the same logical shape as *scale*. + """ + assert scale.dtype == paddle.float8_e4m3fn, ( + "swizzle_blockscale expects the input tensor to be in " "paddle.float8_e4m3fn format." 
+ ) + + scale_ndim = scale.ndim + if scale_ndim == 2: + scale = scale.unsqueeze(0) # (1, M, K) + assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales." + + B, M, K = scale.shape + + def _round_up(x: int, m: int) -> int: + return (x + m - 1) // m * m + + M_padded = _round_up(M, 128) + K_padded = _round_up(K, 4) + + padded = paddle.zeros((B, M_padded, K_padded), dtype=scale.dtype, device=scale.place) + padded[:B, :M, :K] = scale + + # Reshape / permute to the layout required by the kernel. + padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4) + swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda() + + if scale_ndim == 2: + return swizzled.reshape(M_padded, K_padded) + return swizzled.reshape(B, M_padded, K_padded) + + +class ModelOptNvFp4Config(QuantConfigBase): + """ + quantization config for ModelOpt Nvfp4 datatype + """ + + def __init__( + self, + is_checkpoint_nvfp4_serialized: bool, + kv_cache_quant_algo: str | None, + exclude_modules: list[str], + group_size: int = 16, + ) -> None: + self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized + if is_checkpoint_nvfp4_serialized: + logger.warning( + "Detected ModelOpt NVFP4 checkpoint. Please note that" + " the format is experimental and could change in future." + ) + + self.group_size = group_size + self.kv_cache_quant_algo = kv_cache_quant_algo + self.exclude_modules = exclude_modules + + def name(self) -> str: + return "modelopt_fp4" + + @classmethod + def from_config(cls, config: dict) -> "ModelOptNvFp4Config": + if "quantization" in config: + # Traditional ModelOpt format: + # {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError("Expected 'quantization' to be a dictionary in config") + + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw + else: + raise ValueError(f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_algo_raw)}") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError(f"group_size must be an integer, got {type(group_size_raw)}") from None + + # "exclude_modules" is the key in the legacy hf_quant_config.json + exclude_modules = quant_config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError(f"exclude_modules must be a list, got {type(exclude_modules)}") + else: + raise ValueError( + "Missing 'quantization' section in config. Please make sure your model is exported using FastDeploy." 
+ ) + + is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method + + # For FP4, these fields are required + if is_checkpoint_nvfp4_serialized and "quantization" in config: + # Check if required fields are present in the quantization config + quant_config = config["quantization"] + required_fields = ["group_size", "kv_cache_quant_algo", "exclude_modules"] + missing_fields = [field for field in required_fields if field not in quant_config] + if missing_fields: + raise ValueError( + f"NVFP4 quantization requires the following fields in " f"hf_quant_config.json: {missing_fields}" + ) + + return cls( + is_checkpoint_nvfp4_serialized=is_checkpoint_nvfp4_serialized, + kv_cache_quant_algo=kv_cache_quant_algo, + exclude_modules=exclude_modules, + group_size=group_size, + ) + + def get_quant_method(self, layer) -> Optional[QuantMethodBase]: + """ + Get quantization method. + """ + # skip_layer = self.is_layer_excluded(prefix) + if isinstance(layer, FusedMoE): + # if skip_layer: + # return None + return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) + else: + # LinearBase + # if skip_layer: + # return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + # if "vision_tower" in prefix or "vision_model" in prefix: + # return UnquantizedLinearMethod() + return ModelOptNvFp4LinearMethod(self) + + return None + + +class ModelOptNvFp4LinearMethod(QuantMethodBase): + """Linear method for Model Optimizer NVFP4. + Supports loading NVFP4 checkpoints with the following structure: + + input_scale: paddle.float32, scalar , + weight: NVFP4(represented as byte) Shape: [1, X, y/2] + weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale, + weight_scale_2: paddle.float32, scalar, + Args: quant_config: The ModelOpt quantization config. + """ + + def __init__(self, quant_config: ModelOptNvFp4Config) -> None: + self.quant_config = quant_config + + self.backend = "none" + if envs.FD_NVFP4_GEMM_BACKEND is None: + if has_flashinfer(): + self.backend = "flashinfer-cutlass" + elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"): + self.backend = envs.FD_NVFP4_GEMM_BACKEND + assert has_flashinfer(), f"FlashInfer is required for {self.backend}" + + if self.backend == "none": + raise ValueError("No valid NVFP4 GEMM backend found. 
" "Please check your platform capability.") + + logger.info(f"Using {self.backend} for NVFP4 GEMM") + + def create_weights( + self, + layer, + **extra_weight_attrs, + ): + return + + def process_weights_after_loading(self, layer) -> None: + return + + def apply( + self, + layer, + x, + ): + x_m, _ = x.shape + w_n, _ = layer.weight.shape + output_shape = [x_m, w_n] + output_dtype = x.dtype + + # Quantize BF16 or FP16 to (FP4 and interleaved block scale) + x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv) + + assert x_fp4.dtype == paddle.uint8 + assert x_scale_interleaved.dtype == paddle.float8_e4m3fn + assert layer.weight.dtype == paddle.uint8 + assert layer.weight_scale_interleaved.dtype == paddle.float8_e4m3fn + assert layer.alpha.dtype == paddle.float32 + + if self.backend.startswith("flashinfer-"): + backend = self.backend[len("flashinfer-") :] + else: + raise ValueError(f"Unsupported backend: {self.backend}.") + + w = layer.weight.T + w_scale_interleaved = layer.weight_scale_interleaved.T + + if backend == "cutlass": + x_scale_interleaved = x_scale_interleaved.view(paddle.uint8) + w_scale_interleaved = w_scale_interleaved.view(paddle.uint8) + out = fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved, layer.alpha, output_dtype, backend=backend) + + if layer.with_bias: + out = paddle.add(out, layer.bias) + return out.view(*output_shape) + + +class ModelOptNvFp4FusedMoE: + """Fused MoE method for Model Optimizer NVFP4. + Supports loading NVFP4 checkpoints with the following structure: + + input_scale: paddle.float32, scalar , + weight: NVFP4(represented as byte) Shape: [1, X, y/2] + weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale, + weight_scale_2: paddle.float32, scalar, + Args: + quant_config: The ModelOpt quantization config. + moe_config: The MoE configuration. + layer: The linear layer. 
+ """ + + def __init__(self): + pass From d5f3fd269d3e552f935c6d21398e1321d949a9f3 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Tue, 28 Oct 2025 18:29:34 +0800 Subject: [PATCH 03/26] [wip] developing loading qwen model --- .../layers/quantization/__init__.py | 5 + .../layers/quantization/nvfp4.py | 121 ++++++++++++------ 2 files changed, 84 insertions(+), 42 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 1c9a169a5c4..a6bffde03db 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -100,6 +100,11 @@ def _get_offline_quant_config_name(quantization_config, is_torch_weight, is_v1_l has_block_size = "weight_block_size" in quantization_config if quant_method == "fp8" and has_block_size: quant_config_name = "block_wise_fp8" + elif quant_method == "modelopt": + if quantization_config.get("quant_algo", "") == "NVFP4": + quant_config_name = "modelopt_fp4" + else: + raise ValueError("modelopt only supports NVFP4 quantization.") else: raise ValueError("Torch weight offline quantization only supports block-wise FP8.") else: diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index e686512d50b..ebd122a1b84 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -17,6 +17,9 @@ from typing import Optional import paddle + +paddle.compat.enable_torch_proxy() + from paddleformers.utils.log import logger from fastdeploy import envs @@ -96,52 +99,46 @@ def __init__( self.kv_cache_quant_algo = kv_cache_quant_algo self.exclude_modules = exclude_modules + self.quant_max_bound = 6 + self.quant_min_bound = -6 + self.quant_round_type = 1 + def name(self) -> str: return "modelopt_fp4" @classmethod def from_config(cls, config: dict) -> "ModelOptNvFp4Config": - if "quantization" in config: - # Traditional ModelOpt format: - # {"quantization": {"quant_algo": "..."}} - quant_config = cls.get_from_keys(config, ["quantization"]) - if not isinstance(quant_config, dict): - raise ValueError("Expected 'quantization' to be a dictionary in config") - - quant_method = quant_config.get("quant_algo", "") - if not quant_method: - raise ValueError("Missing 'quant_algo' in quantization config") - - # Handle kv_cache_quant_algo with proper type validation - kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") - if kv_cache_quant_algo_raw is None: - # No KV cache quantization by default - kv_cache_quant_algo = None - elif isinstance(kv_cache_quant_algo_raw, str): - kv_cache_quant_algo = kv_cache_quant_algo_raw - else: - raise ValueError(f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_algo_raw)}") - - # Handle group_size with proper type validation - group_size_raw = quant_config.get("group_size") - if group_size_raw is None: - group_size = 16 # Default value - elif isinstance(group_size_raw, int): - group_size = group_size_raw - else: - try: - group_size = int(group_size_raw) - except (ValueError, TypeError): - raise ValueError(f"group_size must be an integer, got {type(group_size_raw)}") from None - - # "exclude_modules" is the key in the legacy hf_quant_config.json - exclude_modules = quant_config.get("exclude_modules", []) - if not isinstance(exclude_modules, list): - raise ValueError(f"exclude_modules must be a list, got {type(exclude_modules)}") + quant_config = 
config + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw else: - raise ValueError( - "Missing 'quantization' section in config. Please make sure your model is exported using FastDeploy." - ) + raise ValueError(f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_algo_raw)}") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError(f"group_size must be an integer, got {type(group_size_raw)}") from None + + # "exclude_modules" is the key in the legacy hf_quant_config.json + exclude_modules = quant_config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError(f"exclude_modules must be a list, got {type(exclude_modules)}") is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method @@ -216,10 +213,50 @@ def create_weights( layer, **extra_weight_attrs, ): - return + if not self.quant_config.is_checkpoint_nvfp4_serialized: + raise ValueError("NVFP4 quantization was selected, " " dynamic quantization is not supported.") + + input_size = layer.weight_shape[0] + output_size = layer.weight_shape[1] + if input_size % 16 != 0: + raise ValueError("Unsupported model when in features size is not multiple of 16") + # Weight + # 2 fp4 items are packed in the input dimension + print("====aaaaaa======= [output_size, input_size // 2]", [output_size, input_size // 2]) + layer.weight = layer.create_parameter( + shape=[output_size, input_size // 2], + dtype=paddle.uint8, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch" + + # Input Weight Scale + layer.input_scale = layer.create_parameter( + shape=[], # output_size + dtype=paddle.float32, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + + # Global Weight Scale + layer.weight_scale_2 = layer.create_parameter( + shape=[], # output_size + dtype=paddle.float32, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + + # Per Block Weight Scale + layer.weight_scale = layer.create_parameter( + shape=[output_size, input_size // self.quant_config.group_size], + dtype=paddle.float8_e4m3fn, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) def process_weights_after_loading(self, layer) -> None: - return + raise ValueError("eeeeeeee") def apply( self, From 1176caea3bb72351bfd18265854dff799722fe6a Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Wed, 5 Nov 2025 12:33:19 +0000 Subject: [PATCH 04/26] loading --- .../layers/quantization/nvfp4.py | 113 +++++++++++++++--- 1 file changed, 99 insertions(+), 14 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index ebd122a1b84..16295c4296b 100644 --- 
a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -25,6 +25,7 @@ from fastdeploy import envs from fastdeploy.flashinfer import has_flashinfer from fastdeploy.model_executor.layers.moe import FusedMoE +from fastdeploy.model_executor.utils import free_tensor, set_weight_attrs from .quant_base import QuantConfigBase, QuantMethodBase @@ -213,27 +214,40 @@ def create_weights( layer, **extra_weight_attrs, ): - if not self.quant_config.is_checkpoint_nvfp4_serialized: - raise ValueError("NVFP4 quantization was selected, " " dynamic quantization is not supported.") - input_size = layer.weight_shape[0] - output_size = layer.weight_shape[1] - if input_size % 16 != 0: - raise ValueError("Unsupported model when in features size is not multiple of 16") + # if not self.quant_config.is_checkpoint_nvfp4_serialized: + # raise ValueError("NVFP4 quantization was selected, " " dynamic quantization is not supported.") + + # input_size = layer.weight_shape[0] + # output_size = layer.weight_shape[1] + # if input_size % 16 != 0: + # raise ValueError("Unsupported model when in features size is not multiple of 16") # Weight # 2 fp4 items are packed in the input dimension - print("====aaaaaa======= [output_size, input_size // 2]", [output_size, input_size // 2]) + # weight_scale_shape = [layer.weight_shape[1]] + # layer.weight_shape.reverse() + dim = -1 if extra_weight_attrs["output_dim"] else 0 + extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"] + weight_shape = layer.weight_shape[::-1] + weight_shape[dim] = weight_shape[dim] // 2 + layer.weight_dtype = "uint8" + input_scale_shape = [1] + weight_scale_shape = [layer.weight_shape[::-1][0], layer.weight_shape[::-1][1] // self.quant_config.group_size] + weight_scale_2_shape = [1] layer.weight = layer.create_parameter( - shape=[output_size, input_size // 2], - dtype=paddle.uint8, + shape=weight_shape, + dtype=layer.weight_dtype, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) - extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch" + set_weight_attrs( + layer.weight, + extra_weight_attrs, + ) # Input Weight Scale layer.input_scale = layer.create_parameter( - shape=[], # output_size + shape=input_scale_shape, # output_size dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), @@ -241,7 +255,7 @@ def create_weights( # Global Weight Scale layer.weight_scale_2 = layer.create_parameter( - shape=[], # output_size + shape=weight_scale_2_shape, # output_size dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), @@ -249,14 +263,85 @@ def create_weights( # Per Block Weight Scale layer.weight_scale = layer.create_parameter( - shape=[output_size, input_size // self.quant_config.group_size], + shape=weight_scale_shape, dtype=paddle.float8_e4m3fn, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) + set_weight_attrs( + layer.weight_scale, + extra_weight_attrs, + ) def process_weights_after_loading(self, layer) -> None: - raise ValueError("eeeeeeee") + # if + def _process_scale_interleaved(scales): + scale_dim = len(scales.shape) + if scale_dim == 2: + scales = scales.unsqueeze(0) + assert len(scales.shape) == 3 + B, M, K = scales.shape + round_up_multiple = lambda x, m: (x + m - 1) // m * m + M_padded = round_up_multiple(M, 128) + K_padded = round_up_multiple(K, 4) + padded_scales = paddle.empty([B, M_padded, K_padded], 
dtype=scales.dtype) + padded_scales[:B, :M, :K].copy_(scales) + batches, rows, cols = padded_scales.shape + assert rows % 128 == 0 + assert cols % 4 == 0 + padded_scales = padded_scales.reshape(batches, rows // 128, 4, 32, cols // 4, 4) + padded_scales = padded_scales.transpose([0, 1, 4, 3, 2, 5]) + padded_scales = padded_scales.contiguous().to(paddle.device.get_device()) + padded_scales = ( + padded_scales.reshape(M_padded, K_padded) + if scale_dim == 2 + else padded_scales.reshape(B, M_padded, K_padded) + ) + return padded_scales + + input_scale_2 = layer.input_scale.max().to(paddle.float32) + weight_scale_2 = layer.weight_scale_2.max().to(paddle.float32) + alpha = input_scale_2 * weight_scale_2 + input_scale_inv = (1 / input_scale_2).to(paddle.float32) + weight_scale_interleaved = _process_scale_interleaved(layer.weight_scale) + free_tensor(layer.input_scale) + free_tensor(layer.weight_scale_2) + + layer.weight_scale_2 = layer.create_parameter( + shape=weight_scale_2.shape, # output_size + dtype=weight_scale_2.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.input_scale = layer.create_parameter( + shape=input_scale_2.shape, # output_size + dtype=input_scale_2.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.alpha = layer.create_parameter( + shape=alpha.shape, # output_size + dtype=alpha.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.input_scale_inv = layer.create_parameter( + shape=input_scale_inv.shape, # output_size + dtype=input_scale_inv.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.weight_scale_interleaved = layer.create_parameter( + shape=weight_scale_interleaved.shape, + dtype=weight_scale_interleaved.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.weight_scale_2.copy_(weight_scale_2, False) + layer.input_scale.copy_(input_scale_2, False) + layer.alpha.copy_(alpha, False) + layer.input_scale_inv.copy_(input_scale_inv, False) + layer.weight_scale_interleaved.copy_(weight_scale_interleaved, False) def apply( self, From 71370546bef68361660c04484b78f5101c699cc9 Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Thu, 6 Nov 2025 10:15:05 +0000 Subject: [PATCH 05/26] update --- fastdeploy/model_executor/layers/quantization/nvfp4.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 16295c4296b..ad751f3b4c1 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -226,10 +226,9 @@ def create_weights( # 2 fp4 items are packed in the input dimension # weight_scale_shape = [layer.weight_shape[1]] # layer.weight_shape.reverse() - dim = -1 if extra_weight_attrs["output_dim"] else 0 extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"] weight_shape = layer.weight_shape[::-1] - weight_shape[dim] = weight_shape[dim] // 2 + weight_shape[1] = weight_shape[1] // 2 layer.weight_dtype = "uint8" input_scale_shape = [1] weight_scale_shape = [layer.weight_shape[::-1][0], layer.weight_shape[::-1][1] // self.quant_config.group_size] From 059409088d6c4b2f9e9a10d13a0712b929b011ed Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Thu, 6 Nov 2025 18:28:56 +0800 Subject: [PATCH 06/26] dense fp4 OK, cudagraph error --- 
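Note on the dense NVFP4 layout the previous patches assume: each weight row packs two E2M1 (FP4) values into one uint8 along the input dimension, every 16 input elements share one FP8-E4M3 block scale, and the scalar weight_scale_2 restores the global range (E2M1 only represents magnitudes up to 6, which matches quant_max_bound in the config). A minimal NumPy sketch of the dequantization, for reference only; the low/high nibble order and the float32 view of the block scales are assumptions made for readability, not taken from the kernels:

    import numpy as np

    E2M1_LUT = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=np.float32)

    def dequant_nvfp4_row(packed_u8, block_scales, weight_scale_2, group_size=16):
        """packed_u8: uint8[K // 2], block_scales: float32[K // group_size]."""
        codes = np.empty(packed_u8.size * 2, dtype=np.uint8)
        codes[0::2] = packed_u8 & 0x0F   # assumed: even elements in the low nibble
        codes[1::2] = packed_u8 >> 4     # assumed: odd elements in the high nibble
        sign = np.where(codes & 0x8, -1.0, 1.0).astype(np.float32)
        vals = sign * E2M1_LUT[codes & 0x7]
        # one FP8-E4M3 scale per group of 16 elements, plus the global per-tensor scale
        return vals * np.repeat(block_scales, group_size) * weight_scale_2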
fastdeploy/model_executor/layers/quantization/nvfp4.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index ad751f3b4c1..cce2930b294 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -30,7 +30,7 @@ from .quant_base import QuantConfigBase, QuantMethodBase if has_flashinfer(): - from flashinfer import fp4_quantize as scaled_fp4_quant # need to use vllm version + from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm @@ -353,10 +353,9 @@ def apply( output_dtype = x.dtype # Quantize BF16 or FP16 to (FP4 and interleaved block scale) - x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv) + x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv) assert x_fp4.dtype == paddle.uint8 - assert x_scale_interleaved.dtype == paddle.float8_e4m3fn assert layer.weight.dtype == paddle.uint8 assert layer.weight_scale_interleaved.dtype == paddle.float8_e4m3fn assert layer.alpha.dtype == paddle.float32 From ae80853c1aa11c24c832105459259beafc18c807 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Fri, 7 Nov 2025 18:14:54 +0800 Subject: [PATCH 07/26] [WIP] moe forward part --- fastdeploy/envs.py | 4 +- .../layers/quantization/nvfp4.py | 57 ++++++++++++++++++- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index f7bd505ca06..fa7bb839fc1 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -124,10 +124,10 @@ "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # Timeout for cache_transfer_manager process exit "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), + # FP4 dense GEMM backend, could be flashinfer-cutlass, flashinfer-trtllm, flashinfer-cudnn or None (default is cutlass) + "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_GEMM_BACKEND", None), # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), - # FP4 dense GEMM backend - "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_GEMM_BACKEND", None), } diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index cce2930b294..327d2f06d3d 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -25,6 +25,7 @@ from fastdeploy import envs from fastdeploy.flashinfer import has_flashinfer from fastdeploy.model_executor.layers.moe import FusedMoE +from fastdeploy.model_executor.ops.gpu import moe_topk_select from fastdeploy.model_executor.utils import free_tensor, set_weight_attrs from .quant_base import QuantConfigBase, QuantMethodBase @@ -32,6 +33,7 @@ if has_flashinfer(): from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm + from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: @@ -77,6 +79,10 @@ def _round_up(x: int, m: int) -> int: return swizzled.reshape(B, M_padded, K_padded) +def next_power_of_2(n: int): + return 1 << (n - 1).bit_length() if n > 0 else 1 + + class ModelOptNvFp4Config(QuantConfigBase): """ quantization config for ModelOpt Nvfp4 datatype @@ -378,7 +384,7 @@ def apply( return 
out.view(*output_shape) -class ModelOptNvFp4FusedMoE: +class ModelOptNvFp4FusedMoE(QuantMethodBase): """Fused MoE method for Model Optimizer NVFP4. Supports loading NVFP4 checkpoints with the following structure: @@ -392,5 +398,52 @@ class ModelOptNvFp4FusedMoE: layer: The linear layer. """ - def __init__(self): + def __init__(self, quant_config: ModelOptNvFp4Config): + self.quant_config = quant_config + + def create_weights(self, layer): pass + + def apply(self, layer, x, gate): + """ + flashinfer nvfp4 fusedmoe for Model Optimizer + """ + gate_out = gate(x.cast("float32")) + topk_ids, topk_weights = moe_topk_select( + gate_out, + layer.gate_correction_bias, + layer.top_k, + True, # apply_norm_weight, + False, + ) + + output_dtype = x.dtype + x_sf = None + + output = paddle.empty_like(x) + # flashinfer cutlass + _ = flashinfer_cutlass_fused_moe( + input=x, + token_selected_experts=topk_ids.to(paddle.int), + token_final_scales=topk_weights, + fc1_expert_weights=layer.w13_weight.view(paddle.long), + fc2_expert_weights=layer.w2_weight.view(paddle.long), + output_dtype=output_dtype, + input_sf=x_sf, + quant_scales=[ + layer.w13_input_scale_quant, + layer.w13_blockscale_swizzled.view(paddle.int32), + layer.g1_alphas, + layer.w2_input_scale_quant, + layer.w2_blockscale_swizzled.view(paddle.int32), + layer.g2_alphas, + ], + ep_size=layer.ep_size, + ep_rank=layer.ep_rank, + tp_size=layer.tp_size, + tp_rank=layer.tp_rank, + tune_max_num_tokens=next_power_of_2(x.shape[0]), + output=output, + ) + + return output From 6b2ebd66fd9562f6c8599b9fe7907c36e43d4211 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Fri, 14 Nov 2025 17:29:41 +0800 Subject: [PATCH 08/26] with flashinfer-backend --- fastdeploy/envs.py | 6 +- .../layers/quantization/nvfp4.py | 73 ++++++++++++------- 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index fa7bb839fc1..3a8c95b708b 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -124,8 +124,10 @@ "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # Timeout for cache_transfer_manager process exit "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), - # FP4 dense GEMM backend, could be flashinfer-cutlass, flashinfer-trtllm, flashinfer-cudnn or None (default is cutlass) - "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_GEMM_BACKEND", None), + # FP4 dense GEMM backend, could be flashinfer-cutlass, flashinfer-trtllm, flashinfer-cudnn or None (default is None) + "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_MOE_BACKEND", None), + # Flahinfer MOE backend, could be flashinfer-cutlass, flashinfer-trtllm or None (default is None) + "FD_FLASHINFER_MOE_BACKEND": lambda: os.getenv("FD_FLASHINFER_MOE_BACKEND", None), # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), } diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 327d2f06d3d..0a34d529a95 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -34,6 +34,8 @@ from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe +else: + logger.warning("FlashInfer is not installed. 
For nvFp4 inference, please install Flashinfer.") def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: @@ -47,7 +49,7 @@ def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: Returns ------- - torch.Tensor + paddle.Tensor The swizzled tensor with the same logical shape as *scale*. """ assert scale.dtype == paddle.float8_e4m3fn, ( @@ -206,7 +208,7 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None: if envs.FD_NVFP4_GEMM_BACKEND is None: if has_flashinfer(): self.backend = "flashinfer-cutlass" - elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"): + elif envs.FD_NVFP4_GEMM_BACKEND.startswith("flashinfer-"): self.backend = envs.FD_NVFP4_GEMM_BACKEND assert has_flashinfer(), f"FlashInfer is required for {self.backend}" @@ -378,7 +380,6 @@ def apply( x_scale_interleaved = x_scale_interleaved.view(paddle.uint8) w_scale_interleaved = w_scale_interleaved.view(paddle.uint8) out = fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved, layer.alpha, output_dtype, backend=backend) - if layer.with_bias: out = paddle.add(out, layer.bias) return out.view(*output_shape) @@ -400,6 +401,18 @@ class ModelOptNvFp4FusedMoE(QuantMethodBase): def __init__(self, quant_config: ModelOptNvFp4Config): self.quant_config = quant_config + self.backend = "none" + + if envs.FD_FLASHINFER_MOE_BACKEND is None: + # currently support flashinfer-cutlass and flashinfer-trtllm + if has_flashinfer(): + self.backend = "flashinfer-cutlass" + elif envs.FD_FLASHINFER_MOE_BACKEND.startswith("flashinfer-"): + self.backend = envs.FD_FLASHINFER_MOE_BACKEND + assert has_flashinfer(), f"FlashInfer is required for MoE backend {self.backend}" + + if self.backend == "none": + raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.") def create_weights(self, layer): pass @@ -419,31 +432,35 @@ def apply(self, layer, x, gate): output_dtype = x.dtype x_sf = None - output = paddle.empty_like(x) - # flashinfer cutlass - _ = flashinfer_cutlass_fused_moe( - input=x, - token_selected_experts=topk_ids.to(paddle.int), - token_final_scales=topk_weights, - fc1_expert_weights=layer.w13_weight.view(paddle.long), - fc2_expert_weights=layer.w2_weight.view(paddle.long), - output_dtype=output_dtype, - input_sf=x_sf, - quant_scales=[ - layer.w13_input_scale_quant, - layer.w13_blockscale_swizzled.view(paddle.int32), - layer.g1_alphas, - layer.w2_input_scale_quant, - layer.w2_blockscale_swizzled.view(paddle.int32), - layer.g2_alphas, - ], - ep_size=layer.ep_size, - ep_rank=layer.ep_rank, - tp_size=layer.tp_size, - tp_rank=layer.tp_rank, - tune_max_num_tokens=next_power_of_2(x.shape[0]), - output=output, - ) + if self.backend == "flashinfer-cutlass": + # flashinfer cutlass + _ = flashinfer_cutlass_fused_moe( + input=x, + token_selected_experts=topk_ids.to(paddle.int), + token_final_scales=topk_weights, + fc1_expert_weights=layer.w13_weight.view(paddle.long), + fc2_expert_weights=layer.w2_weight.view(paddle.long), + output_dtype=output_dtype, + input_sf=x_sf, + quant_scales=[ + layer.w13_input_scale_quant, + layer.w13_blockscale_swizzled.view(paddle.int32), + layer.g1_alphas, + layer.w2_input_scale_quant, + layer.w2_blockscale_swizzled.view(paddle.int32), + layer.g2_alphas, + ], + ep_size=layer.ep_size, + ep_rank=layer.ep_rank, + tp_size=layer.tp_size, + tp_rank=layer.tp_rank, + tune_max_num_tokens=next_power_of_2(x.shape[0]), + output=output, + ) + + return output + + # flashinfer-trtllm return output From 0b28b4bf2718e2c6c6abe6ec0f2ea0722c8a5676 Mon Sep 17 00:00:00 2001 
From: bukejiyu <395822456@qq.com> Date: Mon, 17 Nov 2025 12:52:12 +0000 Subject: [PATCH 09/26] qwen3_moe_fp4 --- .../layers/moe/fused_moe_deepgemm_backend.py | 174 +++++-------- fastdeploy/model_executor/layers/moe/moe.py | 37 ++- .../layers/quantization/nvfp4.py | 244 +++++++++++++++++- fastdeploy/model_executor/models/qwen3moe.py | 15 +- fastdeploy/model_executor/utils.py | 13 + 5 files changed, 353 insertions(+), 130 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 06cc3294915..9cdd8c30cfe 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -22,7 +22,7 @@ from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce from fastdeploy.model_executor.layers.utils import get_tensor from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func, deep_gemm -from fastdeploy.model_executor.utils import TensorTracker, set_weight_attrs +from fastdeploy.model_executor.utils import set_weight_attrs from fastdeploy.utils import ceil_div from .fused_moe_backend_base import MoEMethodBase @@ -33,121 +33,69 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): DeepGemmFusedMoeMethod is a class that implements the MoEMethodBase interface for DeepGemm backend. """ - def create_weights(self, layer: nn.Layer, **extra_weight_attrs): - """ - deepgemm create weight process. - """ - self.up_gate_proj_weight_shape = [ - layer.num_local_experts, - layer.moe_intermediate_size * 2, - layer.hidden_size, - ] - self.down_proj_weight_shape = [ - layer.num_local_experts, - layer.hidden_size, - layer.moe_intermediate_size, - ] - self.up_gate_proj_scale_shape = [ - layer.num_local_experts, - ceil_div(layer.moe_intermediate_size * 2, self.quant_config.weight_block_size[0]), - ceil_div(layer.hidden_size, self.quant_config.weight_block_size[1]), - ] - self.down_proj_scale_shape = [ - layer.num_local_experts, - ceil_div(layer.hidden_size, self.quant_config.weight_block_size[0]), - ceil_div(layer.moe_intermediate_size, self.quant_config.weight_block_size[1]), - ] - # TODO(bukejiyu): remove v1 loader check when v0 loader is removed - if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": - layer.up_gate_proj_weight = layer.create_parameter( - shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2], - dtype=layer.weight_dtype, + def create_weights(self, layer, **extra_weight_attrs): + self.weight_dtype = paddle.float8_e4m3fn + self.added_scale_attrs = ["up_gate_proj_weight_scale_inv", "down_proj_weight_scale_inv"] + up_gate_proj_weight_name = self.added_weight_attrs[0] + down_proj_weight_name = self.added_weight_attrs[1] + up_gate_proj_scale_name = self.added_scale_attrs[0] + down_proj_scale_name = self.added_scale_attrs[1] + setattr( + layer, + up_gate_proj_weight_name, + layer.create_parameter( + shape=self.up_gate_proj_weight_shape, + dtype=self.weight_dtype, default_initializer=paddle.nn.initializer.Constant(0), - ) - - layer.down_proj_weight = layer.create_parameter( - shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size], - dtype=layer.weight_dtype, + ), + ) + setattr( + layer, + down_proj_weight_name, + layer.create_parameter( + shape=self.down_proj_weight_shape, + dtype=self.weight_dtype, default_initializer=paddle.nn.initializer.Constant(0), - ) - 
extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch" - set_weight_attrs( - layer.up_gate_proj_weight, - { - **extra_weight_attrs, - "tensor_track": TensorTracker(shape=layer.up_gate_proj_weight.shape, output_dim=True), - }, - ) - set_weight_attrs( - layer.down_proj_weight, - { - **extra_weight_attrs, - "tensor_track": TensorTracker(shape=layer.down_proj_weight.shape, output_dim=False), - }, - ) - else: - self.weight_dtype = paddle.float8_e4m3fn - self.added_scale_attrs = ["up_gate_proj_weight_scale_inv", "down_proj_weight_scale_inv"] - up_gate_proj_weight_name = self.added_weight_attrs[0] - down_proj_weight_name = self.added_weight_attrs[1] - up_gate_proj_scale_name = self.added_scale_attrs[0] - down_proj_scale_name = self.added_scale_attrs[1] - setattr( - layer, - up_gate_proj_weight_name, - layer.create_parameter( - shape=self.up_gate_proj_weight_shape, - dtype=self.weight_dtype, - default_initializer=paddle.nn.initializer.Constant(0), - ), - ) - setattr( - layer, - down_proj_weight_name, - layer.create_parameter( - shape=self.down_proj_weight_shape, - dtype=self.weight_dtype, - default_initializer=paddle.nn.initializer.Constant(0), - ), - ) - # weight_scale - setattr( - layer, - up_gate_proj_scale_name, - layer.create_parameter( - shape=self.up_gate_proj_scale_shape, - dtype="float32", - default_initializer=paddle.nn.initializer.Constant(0), - ), - ) - setattr( - layer, - down_proj_scale_name, - layer.create_parameter( - shape=self.down_proj_scale_shape, - dtype="float32", - default_initializer=paddle.nn.initializer.Constant(0), - ), - ) - extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch" - extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}} - set_weight_attrs( - getattr(layer, up_gate_proj_weight_name), - extra_weight_attrs, - ) - set_weight_attrs( - getattr(layer, up_gate_proj_scale_name), - extra_weight_attrs, - ) + ), + ) + # weight_scale + setattr( + layer, + up_gate_proj_scale_name, + layer.create_parameter( + shape=self.up_gate_proj_scale_shape, + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + setattr( + layer, + down_proj_scale_name, + layer.create_parameter( + shape=self.down_proj_scale_shape, + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch" + extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}} + set_weight_attrs( + getattr(layer, up_gate_proj_weight_name), + extra_weight_attrs, + ) + set_weight_attrs( + getattr(layer, up_gate_proj_scale_name), + extra_weight_attrs, + ) - set_weight_attrs( - getattr(layer, down_proj_weight_name), - extra_weight_attrs, - ) - set_weight_attrs( - getattr(layer, down_proj_scale_name), - extra_weight_attrs, - ) + set_weight_attrs( + getattr(layer, down_proj_weight_name), + extra_weight_attrs, + ) + set_weight_attrs( + getattr(layer, down_proj_scale_name), + extra_weight_attrs, + ) def process_weights_after_loading(self, layer): """ """ diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 09330e549a7..ec5d9adc2ff 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -288,8 +288,12 @@ def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_ ) # To 
ensure compatibility across backends, apply an extra transpose for GCU and XPU + if expert_param.shape != loaded_weight.shape: - loaded_weight = loaded_weight.transpose([1, 0]) + if len(expert_param.shape) != len(loaded_weight.shape): + loaded_weight = loaded_weight.reshape(expert_param.shape) + else: + loaded_weight = loaded_weight.transpose([1, 0]) assert expert_param.shape == loaded_weight.shape, ( f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})" ) @@ -352,6 +356,32 @@ def _load_fused_experts_weight(self, param, loaded_weight): for i in range(self.num_local_experts): param.tensor_track.mark(start=0, batch_id=i) + def _load_per_tensor_weight_scale( + self, + param, + expert_id, + loaded_weight, + shard_id, + ): + loaded_weight = get_tensor(loaded_weight) + expert_param = param[expert_id - self.expert_id_offset] + if shard_id in ["gate", "up"]: + idx = 0 if shard_id == "gate" else 1 + if expert_param[idx].shape != loaded_weight.shape: + if len(expert_param[idx].shape) != len(loaded_weight.shape): + loaded_weight = loaded_weight.reshape(expert_param[idx].shape) + else: + loaded_weight = loaded_weight.transpose([1, 0]) + + expert_param[idx].set_value(loaded_weight) + elif shard_id == "down": + if expert_param.shape != loaded_weight.shape: + if len(expert_param.shape) != len(loaded_weight.shape): + loaded_weight = loaded_weight.reshape(expert_param.shape) + else: + loaded_weight = loaded_weight.transpose([1, 0]) + expert_param.set_value(loaded_weight) + def _load_expert_weight( self, param, @@ -360,7 +390,10 @@ def _load_expert_weight( shard_id, shard_dim=None, ): - if shard_id == "down": + weight_type = getattr(param, "weight_type", None) + if weight_type in ["weight_scale_2", "input_scale"]: + self._load_per_tensor_weight_scale(param, expert_id, loaded_weight, shard_id) + elif shard_id == "down": self._load_down_weight(param, expert_id, loaded_weight, shard_id, shard_dim) elif shard_id in ["gate", "up"]: self._load_gate_up_weight(param, expert_id, loaded_weight, shard_id, shard_dim) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 0a34d529a95..f4f12dff7e5 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -26,7 +26,11 @@ from fastdeploy.flashinfer import has_flashinfer from fastdeploy.model_executor.layers.moe import FusedMoE from fastdeploy.model_executor.ops.gpu import moe_topk_select -from fastdeploy.model_executor.utils import free_tensor, set_weight_attrs +from fastdeploy.model_executor.utils import ( + create_parameter_and_copy, + free_tensor, + set_weight_attrs, +) from .quant_base import QuantConfigBase, QuantMethodBase @@ -177,7 +181,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if isinstance(layer, FusedMoE): # if skip_layer: # return None - return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) + return ModelOptNvFp4FusedMoE(self) else: # LinearBase # if skip_layer: @@ -241,18 +245,37 @@ def create_weights( input_scale_shape = [1] weight_scale_shape = [layer.weight_shape[::-1][0], layer.weight_shape[::-1][1] // self.quant_config.group_size] weight_scale_2_shape = [1] + + self._create_main_weight(layer, weight_shape, extra_weight_attrs) + self._create_input_scale(layer, input_scale_shape) + self._create_weight_scales(layer, weight_scale_shape, weight_scale_2_shape, extra_weight_attrs) + + def _create_main_weight(self, layer, weight_shape, 
extra_weight_attrs): + """创建主权重参数 + + 参数: + layer: 当前层对象 + weight_shape: 权重形状 + extra_weight_attrs: 额外权重属性 + """ layer.weight = layer.create_parameter( shape=weight_shape, dtype=layer.weight_dtype, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) - set_weight_attrs( layer.weight, extra_weight_attrs, ) - # Input Weight Scale + + def _create_input_scale(self, layer, input_scale_shape): + """创建输入缩放参数 + + 参数: + layer: 当前层对象 + input_scale_shape: 输入缩放形状 + """ layer.input_scale = layer.create_parameter( shape=input_scale_shape, # output_size dtype=paddle.float32, @@ -260,15 +283,21 @@ def create_weights( default_initializer=paddle.nn.initializer.Constant(0), ) - # Global Weight Scale + def _create_weight_scales(self, layer, weight_scale_shape, weight_scale_2_shape, extra_weight_attrs): + """创建权重缩放参数 + + 参数: + layer: 当前层对象 + weight_scale_shape: 权重缩放形状 + weight_scale_2_shape: 权重缩放2形状 + extra_weight_attrs: 额外权重属性 + """ layer.weight_scale_2 = layer.create_parameter( shape=weight_scale_2_shape, # output_size dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) - - # Per Block Weight Scale layer.weight_scale = layer.create_parameter( shape=weight_scale_shape, dtype=paddle.float8_e4m3fn, @@ -400,6 +429,12 @@ class ModelOptNvFp4FusedMoE(QuantMethodBase): """ def __init__(self, quant_config: ModelOptNvFp4Config): + self.quant_config = quant_config + self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"] + self.added_scale_attrs = [ + "up_gate_proj_weight_scale", + "down_proj_weight_scale", + ] self.quant_config = quant_config self.backend = "none" @@ -414,8 +449,199 @@ def __init__(self, quant_config: ModelOptNvFp4Config): if self.backend == "none": raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.") - def create_weights(self, layer): - pass + def create_weights(self, layer, **extra_weight_attrs): + """ + Triton MoE create weight process. 
+ """ + self.up_gate_proj_weight_shape = [ + layer.num_local_experts, + layer.moe_intermediate_size * 2, + layer.hidden_size // 2, + ] + self.down_proj_weight_shape = [ + layer.num_local_experts, + layer.hidden_size, + layer.moe_intermediate_size // 2, + ] + self.up_gate_proj_scale_shape = [ + layer.num_local_experts, + layer.moe_intermediate_size * 2, + layer.hidden_size // self.quant_config.group_size, + ] + self.down_proj_scale_shape = [ + layer.num_local_experts, + layer.hidden_size, + layer.moe_intermediate_size // self.quant_config.group_size, + ] + + self.weight_scale_dtype = paddle.float8_e4m3fn + self.weight_dtype = paddle.uint8 + self.added_scale_attrs = ["up_gate_proj_weight_scale", "down_proj_weight_scale"] + # self.added_blockscale_swizzled_attrs = ["up_gate_proj_blockscale_swizzled", "down_proj_blockscale_swizzled"] + up_gate_proj_weight_name = self.added_weight_attrs[0] + down_proj_weight_name = self.added_weight_attrs[1] + up_gate_proj_scale_name = self.added_scale_attrs[0] + down_proj_scale_name = self.added_scale_attrs[1] + # up_gate_proj_blockscale_swizzled_name = self.added_blockscale_swizzled_attrs[0] + # down_proj_blockscale_swizzled_name = self.added_blockscale_swizzled_attrs[1] + setattr( + layer, + up_gate_proj_weight_name, + layer.create_parameter( + shape=self.up_gate_proj_weight_shape, + dtype=self.weight_dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + setattr( + layer, + down_proj_weight_name, + layer.create_parameter( + shape=self.down_proj_weight_shape, + dtype=self.weight_dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + # weight_scale + setattr( + layer, + up_gate_proj_scale_name, + layer.create_parameter( + shape=self.up_gate_proj_scale_shape, + dtype=self.weight_scale_dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + setattr( + layer, + down_proj_scale_name, + layer.create_parameter( + shape=self.down_proj_scale_shape, + dtype=self.weight_scale_dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + # weight_scale_2 + layer.up_gate_proj_weight_scale_2 = layer.create_parameter( + shape=[layer.num_local_experts, 2], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.down_proj_weight_scale_2 = layer.create_parameter( + shape=[layer.num_local_experts], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + # input_scale + layer.up_gate_proj_input_scale = layer.create_parameter( + shape=[layer.num_local_experts, 2], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.down_proj_input_scale = layer.create_parameter( + shape=[layer.num_local_experts], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + + set_weight_attrs( + getattr(layer, up_gate_proj_weight_name), + {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}, + ) + set_weight_attrs( + getattr(layer, up_gate_proj_scale_name), + {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}, + ) + + set_weight_attrs( + getattr(layer, down_proj_weight_name), + {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}, + ) + set_weight_attrs( + getattr(layer, down_proj_scale_name), + {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}, + ) + + set_weight_attrs( + layer.up_gate_proj_weight_scale_2, + {**extra_weight_attrs, "weight_type": "weight_scale_2"}, + ) + 
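        # Note: the "weight_type" attribute attached to these per-tensor scales is
        # what FusedMoE._load_expert_weight (see the moe.py hunk in this patch) keys
        # on to route them through _load_per_tensor_weight_scale instead of the
        # sharded gate/up/down loaders, since each expert contributes only a scalar
        # (or a [2] pair for the fused gate/up projection).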
set_weight_attrs(layer.down_proj_weight_scale_2, {**extra_weight_attrs, "weight_type": "weight_scale_2"}) + set_weight_attrs(layer.up_gate_proj_input_scale, {**extra_weight_attrs, "weight_type": "input_scale"}) + set_weight_attrs(layer.down_proj_input_scale, {**extra_weight_attrs, "weight_type": "input_scale"}) + + def swizzle_blockscale(self, scale): + assert scale.dtype == paddle.float8_e4m3fn + # Pad and blockwise interleave weight_scale + scale_dim = len(scale.shape) + if len(scale.shape) == 2: + scale = scale.unsqueeze(0) + assert len(scale.shape) == 3 + B, M, K = scale.shape + round_up_multiple = lambda x, m: (x + m - 1) // m * m + M_padded = round_up_multiple(M, 128) + K_padded = round_up_multiple(K, 4) + padded_scale = paddle.empty([B, M_padded, K_padded], dtype=scale.dtype) + padded_scale[:B, :M, :K].copy_(scale) + batches, rows, cols = padded_scale.shape + assert rows % 128 == 0 + assert cols % 4 == 0 + padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4) + swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5)) + swizzled_scale = swizzled_scale.contiguous().to(paddle.device.get_device()) + return ( + swizzled_scale.reshape(M_padded, K_padded) + if scale_dim == 2 + else swizzled_scale.reshape(B, M_padded, K_padded) + ) + + def process_weights_after_loading(self, layer): + """ """ + up_gate_proj_weight_scale_2 = layer.up_gate_proj_weight_scale_2[:, 0] + free_tensor(layer.up_gate_proj_weight_scale_2) + create_parameter_and_copy(layer, name="up_gate_proj_weight_scale_2", weight=up_gate_proj_weight_scale_2) + # conda1 = self.enable_flashinfer_cutlass_moe or self.enable_flashinfer_trtllm_moe + # conda2 = self.enable_flashinfer_cutedsl_moe + # conda3 only support now + up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale, axis=1).cast("float32") + down_proj_input_scale = layer.down_proj_input_scale + + # Create shared parameters + create_parameter_and_copy( + layer, "g1_alphas", (up_gate_proj_input_scale * up_gate_proj_weight_scale_2).cast("float32") + ) + create_parameter_and_copy( + layer, "g2_alphas", (down_proj_input_scale * layer.down_proj_weight_scale_2).cast("float32") + ) + create_parameter_and_copy( + layer, "up_gate_proj_input_scale_quant", (1 / up_gate_proj_input_scale).cast("float32") + ) + create_parameter_and_copy(layer, "down_proj_input_scale_quant", (1 / down_proj_input_scale).cast("float32")) + + # update input_global_scale ? 
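        # Scale algebra, mirroring the dense path above: the FP8 block scales are
        # passed to the fused kernel directly, so only the two per-tensor scales are
        # left to fold in. g*_alphas = input_scale * weight_scale_2 goes into the
        # epilogue, while *_input_scale_quant = 1 / input_scale is what the
        # activation quantization uses to map inputs into the E2M1 range
        # (e.g. input_scale 0.02 and weight_scale_2 0.005 give alpha 1e-4 and an
        # activation quant scale of 50.0).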
+ # layer.dispatcher.set_quant_config( + # {"input_global_scale": layer.w13_input_scale_quant} + # ) + + for name, weight_scale in [ + ("up_gate", layer.up_gate_proj_weight_scale), + ("down", layer.down_proj_weight_scale), + ]: + assert weight_scale.shape[2] % 16 == 0, f"Expected {name}_weight_scale.dim(2) to be divisible by 16" + assert ( + weight_scale.dtype == paddle.float8_e4m3fn + ), f"{name} Weight Blockscale must be represented as FP8-E4M3" + + # trtllm + # cultass + up_gate_proj_blockscale_swizzled = self.swizzle_blockscale(layer.up_gate_proj_weight_scale) + free_tensor(layer.up_gate_proj_weight_scale) + create_parameter_and_copy( + layer, name="up_gate_proj_blockscale_swizzled", weight=up_gate_proj_blockscale_swizzled + ) + down_proj_blockscale_swizzled = self.swizzle_blockscale(layer.down_proj_weight_scale) + free_tensor(layer.down_proj_weight_scale) + create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) def apply(self, layer, x, gate): """ diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 8e47a919bc4..3e3c2645693 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -16,9 +16,9 @@ from __future__ import annotations -import re from functools import partial +# import re import paddle from paddle import nn from paddleformers.transformers import PretrainedModel @@ -376,9 +376,8 @@ def load_weights(self, weights_iterator) -> None: weights_iterator (Iterator): An iterator yielding (name, weight) pairs. """ - from fastdeploy.model_executor.utils import ( + from fastdeploy.model_executor.utils import ( # process_weights_after_loading, default_weight_loader, - process_weights_after_loading, ) stacked_params_mapping = [ @@ -393,7 +392,7 @@ def load_weights(self, weights_iterator) -> None: ] expert_params_mapping = self.get_expert_mapping() params_dict = dict(self.named_parameters()) - process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) + # process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) for loaded_weight_name, loaded_weight in weights_iterator: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in loaded_weight_name: @@ -427,8 +426,12 @@ def load_weights(self, weights_iterator) -> None: weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) weight_loader(param, loaded_weight) - model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) - process_weights_after_loading_fn(model_sublayer_name, param) + # model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) + # process_weights_after_loading_fn(model_sublayer_name, param) + for name, sublayer in self.named_sublayers(): + quant_method = getattr(sublayer, "quant_method", None) + if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): + quant_method.process_weights_after_loading(sublayer) @paddle.no_grad() def set_state_dict(self, state_dict): diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 15d285212b0..81f076a5b68 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -189,6 +189,19 @@ def free_tensor(tensor): del tensor +def create_parameter_and_copy(layer, name, weight): + setattr( + layer, + name, + 
layer.create_parameter( + shape=weight.shape, + dtype=weight.dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + getattr(layer, name).copy_(weight, False) + + def default_weight_loader(fd_config: FDConfig = None) -> None: """Default weight loader""" From 2d2bd069b4578b4ba8221639d28e304110b8efb0 Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Tue, 18 Nov 2025 12:13:35 +0000 Subject: [PATCH 10/26] update --- fastdeploy/model_executor/layers/moe/moe.py | 4 ++-- .../model_executor/layers/quantization/nvfp4.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index ec5d9adc2ff..2f168db2a09 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -271,10 +271,10 @@ def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_ expert_param = param[expert_id - self.expert_id_offset] dim = -1 if shard_dim else 0 param_shard_size = expert_param.shape[dim] // 2 - if shard_id == "gate": + switch_w13 = getattr(self.quant_method, "load_up_proj_weight_first", False) + if (shard_id == "gate" and not switch_w13) or (shard_id == "up" and switch_w13): param_shard_offset = 0 else: - # shard_id == "up": param_shard_offset = param_shard_size expert_param = slice_fn( expert_param, shard_dim, start=param_shard_offset, end=param_shard_offset + param_shard_size diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index f4f12dff7e5..c4ddba2974a 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -595,16 +595,24 @@ def swizzle_blockscale(self, scale): else swizzled_scale.reshape(B, M_padded, K_padded) ) + @property + def load_up_proj_weight_first(self) -> bool: + # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13 + # 目前默认给True + return True + def process_weights_after_loading(self, layer): """ """ up_gate_proj_weight_scale_2 = layer.up_gate_proj_weight_scale_2[:, 0] free_tensor(layer.up_gate_proj_weight_scale_2) create_parameter_and_copy(layer, name="up_gate_proj_weight_scale_2", weight=up_gate_proj_weight_scale_2) # conda1 = self.enable_flashinfer_cutlass_moe or self.enable_flashinfer_trtllm_moe + up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale).cast("float32") + down_proj_input_scale = paddle.max(layer.down_proj_input_scale).cast("float32") # conda2 = self.enable_flashinfer_cutedsl_moe # conda3 only support now - up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale, axis=1).cast("float32") - down_proj_input_scale = layer.down_proj_input_scale + # up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale, axis=1).cast("float32") + # down_proj_input_scale = layer.down_proj_input_scale # Create shared parameters create_parameter_and_copy( @@ -636,11 +644,13 @@ def process_weights_after_loading(self, layer): # cultass up_gate_proj_blockscale_swizzled = self.swizzle_blockscale(layer.up_gate_proj_weight_scale) free_tensor(layer.up_gate_proj_weight_scale) + layer.up_gate_proj_weight_scale = None create_parameter_and_copy( layer, name="up_gate_proj_blockscale_swizzled", weight=up_gate_proj_blockscale_swizzled ) down_proj_blockscale_swizzled = self.swizzle_blockscale(layer.down_proj_weight_scale) free_tensor(layer.down_proj_weight_scale) + layer.down_proj_weight_scale = None 
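The `load_up_proj_weight_first` property added above, together with the `switch_w13` branch in `_load_gate_up_weight`, only flips which half of the fused gate/up weight each shard is written into. A small sketch of that offset logic (illustrative only, not the FastDeploy loader itself) follows.

```python
# Sketch of the shard-offset flip driven by load_up_proj_weight_first: when the
# FlashInfer CUTLASS MoE kernel expects the fused weight as [up, gate] rather than
# the default [gate, up], the two shards simply swap halves.
def gate_up_offset(shard_id: str, half_size: int, up_first: bool) -> int:
    assert shard_id in ("gate", "up")
    first_half = "up" if up_first else "gate"
    return 0 if shard_id == first_half else half_size


# Default [gate, up] layout vs. the [up, gate] layout assumed by the kernel:
assert gate_up_offset("gate", 4, up_first=False) == 0
assert gate_up_offset("gate", 4, up_first=True) == 4
assert gate_up_offset("up", 4, up_first=True) == 0
```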
create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) def apply(self, layer, x, gate): From c329d921e1b57e7a99f6470622e059e0b5c459fb Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Tue, 18 Nov 2025 20:50:31 +0800 Subject: [PATCH 11/26] support flashinfer-cutlass moe, qwen3-moe-fp4 OK --- fastdeploy/flashinfer.py | 4 ++ .../layers/quantization/nvfp4.py | 67 ++----------------- 2 files changed, 10 insertions(+), 61 deletions(-) diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py index 4bc6aa994f2..f30aa028308 100644 --- a/fastdeploy/flashinfer.py +++ b/fastdeploy/flashinfer.py @@ -17,6 +17,7 @@ import functools import importlib import importlib.util +import os import shutil @@ -25,6 +26,9 @@ def has_flashinfer() -> bool: """Return `True` if FlashInfer is available.""" # Use find_spec to check if the module exists without importing it # This avoids potential CUDA initialization side effects + if os.environ.get("PADDLE_COMPATIBLE_API", "0").lower() not in ["1", "on", "true"]: + # currently must support by Paddle compatible API + return False if importlib.util.find_spec("flashinfer") is None: # logger.debug_once("FlashInfer unavailable since package was not found") return False diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index c4ddba2974a..2a648f673a0 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -42,49 +42,6 @@ logger.warning("FlashInfer is not installed. For nvFp4 inference, please install Flashinfer.") -def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: - """ - Pad and block-interleave the FP4 block-scales so that they match the data - layout expected by the CUTLASS / FlashInfer kernels. - - Parameters - ---------- - scale: paddle.Tensor - - Returns - ------- - paddle.Tensor - The swizzled tensor with the same logical shape as *scale*. - """ - assert scale.dtype == paddle.float8_e4m3fn, ( - "swizzle_blockscale expects the input tensor to be in " "paddle.float8_e4m3fn format." - ) - - scale_ndim = scale.ndim - if scale_ndim == 2: - scale = scale.unsqueeze(0) # (1, M, K) - assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales." - - B, M, K = scale.shape - - def _round_up(x: int, m: int) -> int: - return (x + m - 1) // m * m - - M_padded = _round_up(M, 128) - K_padded = _round_up(K, 4) - - padded = paddle.zeros((B, M_padded, K_padded), dtype=scale.dtype, device=scale.place) - padded[:B, :M, :K] = scale - - # Reshape / permute to the layout required by the kernel. 
- padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4) - swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda() - - if scale_ndim == 2: - return swizzled.reshape(M_padded, K_padded) - return swizzled.reshape(B, M_padded, K_padded) - - def next_power_of_2(n: int): return 1 << (n - 1).bit_length() if n > 0 else 1 @@ -606,13 +563,8 @@ def process_weights_after_loading(self, layer): up_gate_proj_weight_scale_2 = layer.up_gate_proj_weight_scale_2[:, 0] free_tensor(layer.up_gate_proj_weight_scale_2) create_parameter_and_copy(layer, name="up_gate_proj_weight_scale_2", weight=up_gate_proj_weight_scale_2) - # conda1 = self.enable_flashinfer_cutlass_moe or self.enable_flashinfer_trtllm_moe up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale).cast("float32") down_proj_input_scale = paddle.max(layer.down_proj_input_scale).cast("float32") - # conda2 = self.enable_flashinfer_cutedsl_moe - # conda3 only support now - # up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale, axis=1).cast("float32") - # down_proj_input_scale = layer.down_proj_input_scale # Create shared parameters create_parameter_and_copy( @@ -626,11 +578,6 @@ def process_weights_after_loading(self, layer): ) create_parameter_and_copy(layer, "down_proj_input_scale_quant", (1 / down_proj_input_scale).cast("float32")) - # update input_global_scale ? - # layer.dispatcher.set_quant_config( - # {"input_global_scale": layer.w13_input_scale_quant} - # ) - for name, weight_scale in [ ("up_gate", layer.up_gate_proj_weight_scale), ("down", layer.down_proj_weight_scale), @@ -640,8 +587,6 @@ def process_weights_after_loading(self, layer): weight_scale.dtype == paddle.float8_e4m3fn ), f"{name} Weight Blockscale must be represented as FP8-E4M3" - # trtllm - # cultass up_gate_proj_blockscale_swizzled = self.swizzle_blockscale(layer.up_gate_proj_weight_scale) free_tensor(layer.up_gate_proj_weight_scale) layer.up_gate_proj_weight_scale = None @@ -676,16 +621,16 @@ def apply(self, layer, x, gate): input=x, token_selected_experts=topk_ids.to(paddle.int), token_final_scales=topk_weights, - fc1_expert_weights=layer.w13_weight.view(paddle.long), - fc2_expert_weights=layer.w2_weight.view(paddle.long), + fc1_expert_weights=getattr(layer, self.added_weight_attrs[0]).view(paddle.long), + fc2_expert_weights=getattr(layer, self.added_weight_attrs[1]).view(paddle.long), output_dtype=output_dtype, input_sf=x_sf, quant_scales=[ - layer.w13_input_scale_quant, - layer.w13_blockscale_swizzled.view(paddle.int32), + layer.up_gate_proj_input_scale_quant, + layer.up_gate_proj_blockscale_swizzled.view(paddle.int32), layer.g1_alphas, - layer.w2_input_scale_quant, - layer.w2_blockscale_swizzled.view(paddle.int32), + layer.down_proj_input_scale_quant, + layer.down_proj_blockscale_swizzled.view(paddle.int32), layer.g2_alphas, ], ep_size=layer.ep_size, From eb089b38b0f342006cf587fa1de611f0b8afa619 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Wed, 19 Nov 2025 18:59:59 +0800 Subject: [PATCH 12/26] support ernie4.5-fp4 --- .../model_executor/layers/quantization/nvfp4.py | 4 ++++ fastdeploy/model_executor/models/ernie4_5_moe.py | 12 ++++-------- fastdeploy/model_executor/models/qwen3moe.py | 3 --- fastdeploy/model_executor/utils.py | 2 ++ 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 2a648f673a0..0efa9204e3c 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ 
b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -57,6 +57,7 @@ def __init__( kv_cache_quant_algo: str | None, exclude_modules: list[str], group_size: int = 16, + is_checkpoint_bf16: bool = False, ) -> None: self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized if is_checkpoint_nvfp4_serialized: @@ -72,6 +73,7 @@ def __init__( self.quant_max_bound = 6 self.quant_min_bound = -6 self.quant_round_type = 1 + self.is_checkpoint_bf16 = is_checkpoint_bf16 def name(self) -> str: return "modelopt_fp4" @@ -406,6 +408,8 @@ def __init__(self, quant_config: ModelOptNvFp4Config): if self.backend == "none": raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.") + logger.info(f"Using {self.backend} for NVFP4 FusedMoE") + def create_weights(self, layer, **extra_weight_attrs): """ Triton MoE create weight process. diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index c2baeb91049..136b144cf94 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -17,7 +17,6 @@ from __future__ import annotations import inspect -import re from functools import partial from typing import Dict, Union @@ -543,7 +542,6 @@ def load_weights(self, weights_iterator) -> None: from fastdeploy.model_executor.utils import ( default_weight_loader, - process_weights_after_loading, rename_offline_ckpt_suffix_to_fd_suffix, ) @@ -590,8 +588,6 @@ def load_weights(self, weights_iterator) -> None: ) params_dict = dict(self.named_parameters()) - process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) - for loaded_weight_name, loaded_weight in weights_iterator: loaded_weight_name = loaded_weight_name.replace("model", "ernie") for param_name, weight_name, exp_id, shard_id, is_moe in all_param_mapping: @@ -620,10 +616,10 @@ def load_weights(self, weights_iterator) -> None: else: weight_loader(param, loaded_weight, shard_id) - model_sublayer_name = re.sub( - r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name - ) - process_weights_after_loading_fn(model_sublayer_name, param) + for name, sublayer in self.named_sublayers(): + quant_method = getattr(sublayer, "quant_method", None) + if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): + quant_method.process_weights_after_loading(sublayer) if self.tie_word_embeddings: self.lm_head.load_state_dict({self.lm_head.weight_key: self.ernie.embed_tokens.embeddings.weight}) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 3e3c2645693..c9a9d717e6a 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -392,7 +392,6 @@ def load_weights(self, weights_iterator) -> None: ] expert_params_mapping = self.get_expert_mapping() params_dict = dict(self.named_parameters()) - # process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) for loaded_weight_name, loaded_weight in weights_iterator: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in loaded_weight_name: @@ -426,8 +425,6 @@ def load_weights(self, weights_iterator) -> None: weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) weight_loader(param, loaded_weight) - # model_sublayer_name = 
re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) - # process_weights_after_loading_fn(model_sublayer_name, param) for name, sublayer in self.named_sublayers(): quant_method = getattr(sublayer, "quant_method", None) if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 81f076a5b68..7949d5e0541 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -371,6 +371,8 @@ def fn(loaded_weight_name, is_moe): # Can be extended to other offline quantization suffixes if needed. if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"): fd_suffix_map = fp8_suffix_map + else: + fd_suffix_map = {} for ckpt_suffix, fd_suffix in fd_suffix_map.items(): if re.search(rf"{ckpt_suffix}$", loaded_weight_name): loaded_weight_name = loaded_weight_name.replace(ckpt_suffix, fd_suffix) From 03aa695a80927d6bb9f2a2c819b079bb4b264b89 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Thu, 20 Nov 2025 12:10:07 +0800 Subject: [PATCH 13/26] fix load error --- fastdeploy/flashinfer.py | 5 +++-- fastdeploy/model_executor/layers/linear.py | 1 - .../layers/quantization/nvfp4.py | 19 +++++++++---------- .../model_executor/models/ernie4_5_moe.py | 14 ++++++++++---- fastdeploy/model_executor/models/qwen3moe.py | 13 ++++++++----- 5 files changed, 30 insertions(+), 22 deletions(-) diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py index f30aa028308..23634faed5f 100644 --- a/fastdeploy/flashinfer.py +++ b/fastdeploy/flashinfer.py @@ -20,6 +20,8 @@ import os import shutil +from paddleformers.utils.log import logger + @functools.cache def has_flashinfer() -> bool: @@ -28,12 +30,11 @@ def has_flashinfer() -> bool: # This avoids potential CUDA initialization side effects if os.environ.get("PADDLE_COMPATIBLE_API", "0").lower() not in ["1", "on", "true"]: # currently must support by Paddle compatible API + logger.warning("FlashInfer is not supported by Paddle compatible API.") return False if importlib.util.find_spec("flashinfer") is None: - # logger.debug_once("FlashInfer unavailable since package was not found") return False # Also check if nvcc is available since it's required to JIT compile flashinfer if shutil.which("nvcc") is None: - # logger.debug_once("FlashInfer unavailable since nvcc was not found") return False return True diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index e7725be6d23..3227edac765 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -82,7 +82,6 @@ def process_loaded_weights(self, layer, weights) -> None: layer.weight.set_value(weights) def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: - linear_out = paddle.matmul(x, layer.weight) if layer.with_bias: linear_out = paddle.add(linear_out, layer.bias) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 8a44a93a199..92b47997470 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -17,15 +17,12 @@ from typing import Optional import paddle - -paddle.compat.enable_torch_proxy() - from paddleformers.utils.log import logger +import fastdeploy from fastdeploy import envs from fastdeploy.flashinfer import has_flashinfer from 
fastdeploy.model_executor.layers.moe import FusedMoE -from fastdeploy.model_executor.ops.gpu import moe_topk_select from fastdeploy.model_executor.utils import ( create_parameter_and_copy, free_tensor, @@ -35,6 +32,7 @@ from .quant_base import QuantConfigBase, QuantMethodBase if has_flashinfer(): + paddle.compat.enable_torch_proxy() from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe @@ -176,7 +174,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None: assert has_flashinfer(), f"FlashInfer is required for {self.backend}" if self.backend == "none": - raise ValueError("No valid NVFP4 GEMM backend found. " "Please check your platform capability.") + raise ValueError( + "No valid NVFP4 GEMM backend found. Please check your platform capability and installation of FlashInfer." + ) logger.info(f"Using {self.backend} for NVFP4 GEMM") @@ -394,7 +394,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config): assert has_flashinfer(), f"FlashInfer is required for MoE backend {self.backend}" if self.backend == "none": - raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.") + raise ValueError( + "No valid NVFP4 flashinfer MoE backend found. Please check your platform capability and installation of FlashInfer." + ) logger.info(f"Using {self.backend} for NVFP4 FusedMoE") @@ -426,13 +428,10 @@ def create_weights(self, layer, **extra_weight_attrs): self.weight_scale_dtype = paddle.float8_e4m3fn self.weight_dtype = paddle.uint8 self.added_scale_attrs = ["up_gate_proj_weight_scale", "down_proj_weight_scale"] - # self.added_blockscale_swizzled_attrs = ["up_gate_proj_blockscale_swizzled", "down_proj_blockscale_swizzled"] up_gate_proj_weight_name = self.added_weight_attrs[0] down_proj_weight_name = self.added_weight_attrs[1] up_gate_proj_scale_name = self.added_scale_attrs[0] down_proj_scale_name = self.added_scale_attrs[1] - # up_gate_proj_blockscale_swizzled_name = self.added_blockscale_swizzled_attrs[0] - # down_proj_blockscale_swizzled_name = self.added_blockscale_swizzled_attrs[1] setattr( layer, up_gate_proj_weight_name, @@ -595,7 +594,7 @@ def apply(self, layer, x, gate): flashinfer nvfp4 fusedmoe for Model Optimizer """ gate_out = gate(x.cast("float32")) - topk_ids, topk_weights = moe_topk_select( + topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select( gate_out, layer.gate_correction_bias, layer.top_k, diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index f773afc4c9a..75947590be8 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -17,6 +17,7 @@ from __future__ import annotations import inspect +import re from functools import partial from typing import Dict, Union @@ -514,6 +515,7 @@ def load_weights(self, weights_iterator) -> None: from fastdeploy.model_executor.utils import ( default_weight_loader, + process_weights_after_loading, rename_offline_ckpt_suffix_to_fd_suffix, ) @@ -560,6 +562,10 @@ def load_weights(self, weights_iterator) -> None: ) params_dict = dict(self.named_parameters()) + process_weights_after_loading_fn = process_weights_after_loading( + dict(self.named_sublayers()), fd_config=self.fd_config + ) + for loaded_weight_name, loaded_weight in weights_iterator: loaded_weight_name = loaded_weight_name.replace("model", "ernie") for param_name, weight_name, exp_id, 
shard_id, is_moe in all_param_mapping: @@ -588,10 +594,10 @@ def load_weights(self, weights_iterator) -> None: else: weight_loader(param, loaded_weight, shard_id) - for name, sublayer in self.named_sublayers(): - quant_method = getattr(sublayer, "quant_method", None) - if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): - quant_method.process_weights_after_loading(sublayer) + model_sublayer_name = re.sub( + r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name + ) + process_weights_after_loading_fn(model_sublayer_name, param) if self.tie_word_embeddings: self.lm_head.linear.weight.set_value(self.ernie.embed_tokens.embeddings.weight.transpose([1, 0])) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 662483ea15a..9537b84f22c 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -16,6 +16,7 @@ from __future__ import annotations +import re from functools import partial import paddle @@ -342,7 +343,10 @@ def load_weights(self, weights_iterator) -> None: weights_iterator (Iterator): An iterator yielding (name, weight) pairs. """ - from fastdeploy.model_executor.utils import default_weight_loader + from fastdeploy.model_executor.utils import ( + default_weight_loader, + process_weights_after_loading, + ) stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -356,6 +360,7 @@ def load_weights(self, weights_iterator) -> None: ] expert_params_mapping = self.get_expert_mapping() params_dict = dict(self.named_parameters()) + process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()), self.fd_config) for loaded_weight_name, loaded_weight in weights_iterator: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in loaded_weight_name: @@ -389,10 +394,8 @@ def load_weights(self, weights_iterator) -> None: weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) weight_loader(param, loaded_weight) - for name, sublayer in self.named_sublayers(): - quant_method = getattr(sublayer, "quant_method", None) - if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): - quant_method.process_weights_after_loading(sublayer) + model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) + process_weights_after_loading_fn(model_sublayer_name, param) @paddle.no_grad() def set_state_dict(self, state_dict): From 5233398f8f1dbf651f0013e62cbef5eeafcfcf33 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Thu, 20 Nov 2025 18:07:58 +0800 Subject: [PATCH 14/26] add some ut --- .../layers/quantization/nvfp4.py | 16 +--- tests/quantization/test_modelopt_nvfp4.py | 96 +++++++++++++++++++ 2 files changed, 99 insertions(+), 13 deletions(-) create mode 100644 tests/quantization/test_modelopt_nvfp4.py diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 92b47997470..49c88be7882 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -134,18 +134,9 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: """ Get quantization method. 
""" - # skip_layer = self.is_layer_excluded(prefix) if isinstance(layer, FusedMoE): - # if skip_layer: - # return None return ModelOptNvFp4FusedMoE(self) else: - # LinearBase - # if skip_layer: - # return UnquantizedLinearMethod() - # Check if this is a vision model layer that should not be quantized - # if "vision_tower" in prefix or "vision_model" in prefix: - # return UnquantizedLinearMethod() return ModelOptNvFp4LinearMethod(self) return None @@ -224,7 +215,7 @@ def _create_input_scale(self, layer, input_scale_shape): input_scale_shape: 输入缩放形状 """ layer.input_scale = layer.create_parameter( - shape=input_scale_shape, # output_size + shape=input_scale_shape, dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), @@ -240,7 +231,7 @@ def _create_weight_scales(self, layer, weight_scale_shape, weight_scale_2_shape, extra_weight_attrs: 额外权重属性 """ layer.weight_scale_2 = layer.create_parameter( - shape=weight_scale_2_shape, # output_size + shape=weight_scale_2_shape, dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), @@ -257,7 +248,6 @@ def _create_weight_scales(self, layer, weight_scale_shape, weight_scale_2_shape, ) def process_weights_after_loading(self, layer) -> None: - # if def _process_scale_interleaved(scales): scale_dim = len(scales.shape) if scale_dim == 2: @@ -386,7 +376,7 @@ def __init__(self, quant_config: ModelOptNvFp4Config): self.backend = "none" if envs.FD_FLASHINFER_MOE_BACKEND is None: - # currently support flashinfer-cutlass and flashinfer-trtllm + # currently support flashinfer-cutlass, flashinfer-trtllm will support in the future if has_flashinfer(): self.backend = "flashinfer-cutlass" elif envs.FD_FLASHINFER_MOE_BACKEND.startswith("flashinfer-"): diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py new file mode 100644 index 00000000000..b3dc1244715 --- /dev/null +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -0,0 +1,96 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import unittest +from unittest import mock + +import paddle + +from fastdeploy.flashinfer import has_flashinfer + +# import fastdeploy +from fastdeploy.model_executor.layers.linear import QKVParallelLinear +from fastdeploy.model_executor.layers.moe import FusedMoE +from fastdeploy.model_executor.layers.quantization.nvfp4 import ( + ModelOptNvFp4Config, + ModelOptNvFp4FusedMoE, + ModelOptNvFp4LinearMethod, +) + + +def get_sm_version(): + prop = paddle.device.cuda.get_device_properties() + cc = prop.major * 10 + prop.minor + return cc + + +@unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_sm_version() < 100, + "Nvfp4 do not support sm < 100.", +) +class TestModelOptNvFp4Config(unittest.TestCase): + def setUp(self): + prop = paddle.device.cuda.get_device_properties() + self.sm_version = prop.major * 10 + prop.minor + + self.raw_config = { + "config_groups": { + "group_0": { + "input_activations": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": 16}, + "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": 16}, + "targets": ["Linear"], + } + }, + "quant_algo": "NVFP4", + "producer": {"name": "modelopt", "version": "0.34.1.dev85+g7a72957d"}, + "quant_method": "modelopt", + } + + self.config = ModelOptNvFp4Config.from_config(self.raw_config) + + def test_name(self): + """Test name() method""" + self.assertEqual(self.config.name(), "modelopt_fp4") + + def test_from_config(self): + """Test from_config with full dict""" + cfg = ModelOptNvFp4Config.from_config(self.raw_config) + self.assertFalse(cfg.is_checkpoint_bf16) + self.assertTrue(cfg.is_checkpoint_nvfp4_serialized) + self.assertEqual(cfg.group_size, 16) + self.assertEqual(cfg.exclude_modules, []) + self.assertEqual(cfg.kv_cache_quant_algo, None) + self.assertEqual(cfg.quant_max_bound, 6) + self.assertEqual(cfg.quant_min_bound, -6) + self.assertEqual(cfg.quant_round_type, 1) + + @unittest.skipIf(not has_flashinfer(), "Skip if no FlashInfer available") + def test_get_quant_method_linear(self): + """Test get_quant_method with a linear layer""" + layer = mock.Mock(spec=QKVParallelLinear) + method = self.config.get_quant_method(layer) + assert isinstance(method, ModelOptNvFp4LinearMethod) + + @unittest.skipIf(not has_flashinfer(), "Skip if no FlashInfer available") + def test_get_quant_method_fused_moe(self): + """Test get_quant_method with a moe layer""" + layer = mock.Mock(spec=FusedMoE) + method = self.config.get_quant_method(layer) + assert isinstance(method, ModelOptNvFp4FusedMoE) + + +if __name__ == "__main__": + unittest.main() From 748e81268bf4f4c128eae7691ff600ea225465d9 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Thu, 20 Nov 2025 18:58:38 +0800 Subject: [PATCH 15/26] add docs --- docs/quantization/nvfp4.md | 74 ++++++++++++++++++++++ docs/zh/quantization/nvfp4.md | 75 +++++++++++++++++++++++ tests/quantization/test_modelopt_nvfp4.py | 2 - 3 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 docs/quantization/nvfp4.md create mode 100644 docs/zh/quantization/nvfp4.md diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md new file mode 100644 index 00000000000..c8edd091c79 --- /dev/null +++ b/docs/quantization/nvfp4.md @@ -0,0 +1,74 @@ + +# NVFP4 Quantization +NVFP4 is an innovative 4-bit floating-point format introduced by NVIDIA. For detailed information, please refer to [Introducing NVFP4 for Efficient and Accurate Low-Precision Inference](https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/). 
+ +Based on [FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy supports NVFP4 quantized model inference in the format produced by [Modelopt](https://github.com/NVIDIA/TensorRT-Model-Optimizer). + +- Note: Currently, this feature only supports FP4 quantized models of Ernie/Qwen series. + +## How to Use +### Environment Setup +- **Supported Hardware**: GPU sm >= 100 +- **PaddlePaddle Version**: 3.3.0 or higher +- **Fastdeploy Version**: 2.4.0 or higher + +#### 1. Fastdeploy Installation +First, install the Fastdeploy base environment according to the [Fastdeploy NVIDIA GPU Environment Installation Guide](../../get_started/installation/nvidia_gpu.md). + +#### 2. Flashinfer Installation +```bash +git clone -b support-paddlepaddle-with-compatible-api-and-tvmffi https://github.com/PFCCLab/flashinfer/ --recursive + +cd flashinfer +python -m pip install -v . +``` + +### Running Inference Service +- Note: Need to set environment variable `export PADDLE_COMPATIBLE_API=true` and install the corresponding Flashinfer correctly +```bash +export PADDLE_COMPATIBLE_API=true +python -m fastdeploy.entrypoints.openai.api_server \ + --model nv-community/Qwen3-30B-A3B-FP4 \ + --port 8180 \ + --metrics-port 8181 \ + --engine-worker-queue-port 8182 \ + --cache-queue-port 8183 \ + --tensor-parallel-size 1 \ + --max-model-len 32768 \ + --max-num-seqs 128 +``` + +### API Access +Make service requests using the following command + +```shell +curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "把李白的静夜思改写为现代诗"} + ] +}' +``` + +FastDeploy service interface is compatible with OpenAI protocol. You can make service requests using the following Python code. + +```python +import openai +host = "0.0.0.0" +port = "8180" +client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null") + +response = client.chat.completions.create( + model="null", + messages=[ + {"role": "system", "content": "I'm a helpful AI assistant."}, + {"role": "user", "content": "把李白的静夜思改写为现代诗"}, + ], + stream=True, +) +for chunk in response: + if chunk.choices[0].delta: + print(chunk.choices[0].delta.content, end='') +print('\n') +```. diff --git a/docs/zh/quantization/nvfp4.md b/docs/zh/quantization/nvfp4.md new file mode 100644 index 00000000000..62e6e36aa57 --- /dev/null +++ b/docs/zh/quantization/nvfp4.md @@ -0,0 +1,75 @@ +[English](../../quantization/nvfp4.md) + +# NVFP4量化 +NVFP4 是 NVIDIA 引入的创新 4 位浮点格式,详细介绍请参考[Introducing NVFP4 for Efficient and Accurate Low-Precision Inference](https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/)。 + +基于[FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy 支持[Modelopt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) 产出格式的NVFP4量化模型推理。 + +- 注:目前该功能仅支持Ernie / Qwen系列的FP4量化模型。 + +## 如何使用 +### 环境安装 +- **支持硬件**:GPU sm >= 100 +- **PaddlePaddle 版本**:3.3.0 或更高版本 +- **Fastdeploy 版本**:2.4.0 或更高版本 + +#### 1. Fastdeploy 安装 +首先请根据[Fastdeploy NVIDIA GPU 环境安装指南](../../get_started/installation/nvidia_gpu.md),安装Fastdeploy基础环境。 + +#### 2. Flashinfer 安装 +```bash +git clone -b support-paddlepaddle-with-compatible-api-and-tvmffi https://github.com/PFCCLab/flashinfer/ --recursive + +cd flashinfer +python -m pip install -v . 
+``` + +### 运行推理服务 +- 注意:需要指定环境变量`export PADDLE_COMPATIBLE_API=true`并正确安装对应Flashinfer +```bash +export PADDLE_COMPATIBLE_API=true +python -m fastdeploy.entrypoints.openai.api_server \ + --model nv-community/Qwen3-30B-A3B-FP4 \ + --port 8180 \ + --metrics-port 8181 \ + --engine-worker-queue-port 8182 \ + --cache-queue-port 8183 \ + --tensor-parallel-size 1 \ + --max-model-len 32768 \ + --max-num-seqs 128 +``` + +### 接口访问 +通过如下命令发起服务请求 + +```shell +curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "把李白的静夜思改写为现代诗"} + ] +}' +``` + +FastDeploy服务接口兼容OpenAI协议,可以通过如下Python代码发起服务请求。 + +```python +import openai +host = "0.0.0.0" +port = "8180" +client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null") + +response = client.chat.completions.create( + model="null", + messages=[ + {"role": "system", "content": "I'm a helpful AI assistant."}, + {"role": "user", "content": "把李白的静夜思改写为现代诗"}, + ], + stream=True, +) +for chunk in response: + if chunk.choices[0].delta: + print(chunk.choices[0].delta.content, end='') +print('\n') +``` diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py index b3dc1244715..6015a0dff03 100644 --- a/tests/quantization/test_modelopt_nvfp4.py +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -20,8 +20,6 @@ import paddle from fastdeploy.flashinfer import has_flashinfer - -# import fastdeploy from fastdeploy.model_executor.layers.linear import QKVParallelLinear from fastdeploy.model_executor.layers.moe import FusedMoE from fastdeploy.model_executor.layers.quantization.nvfp4 import ( From be11fc33e1b19ed0a194f4dd3121481e9f19788a Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Mon, 12 Jan 2026 11:30:49 +0000 Subject: [PATCH 16/26] fix CLA, test --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 5597aec2d0f..b78f5f54e33 100644 --- a/build.sh +++ b/build.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # From 509fc3314f58fef4d0db39b53b3fc40da806b589 Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Mon, 12 Jan 2026 11:32:05 +0000 Subject: [PATCH 17/26] fix the apply() in ModelOptNvFp4FusedMoE --- build.sh | 2 +- fastdeploy/model_executor/layers/quantization/nvfp4.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/build.sh b/build.sh index b78f5f54e33..5597aec2d0f 100644 --- a/build.sh +++ b/build.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
# diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 49c88be7882..5a5f13297d9 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -32,7 +32,8 @@ from .quant_base import QuantConfigBase, QuantMethodBase if has_flashinfer(): - paddle.compat.enable_torch_proxy() + # 加一个scope + paddle.compat.enable_torch_proxy(scope={"flashinfer"}) from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe @@ -579,7 +580,7 @@ def process_weights_after_loading(self, layer): layer.down_proj_weight_scale = None create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) - def apply(self, layer, x, gate): + def apply(self, layer, x, gate, topk_ids_hookfunc=None,): """ flashinfer nvfp4 fusedmoe for Model Optimizer """ @@ -591,6 +592,9 @@ def apply(self, layer, x, gate): True, # apply_norm_weight, False, ) + + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids) output_dtype = x.dtype x_sf = None From 798cb6b36abb09d30338c8dc8a40a1c361b57896 Mon Sep 17 00:00:00 2001 From: nyx-c-language Date: Tue, 13 Jan 2026 00:07:06 +0800 Subject: [PATCH 18/26] fix CodeStyle --- fastdeploy/model_executor/layers/quantization/nvfp4.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 5a5f13297d9..fa95433f5a3 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -580,7 +580,13 @@ def process_weights_after_loading(self, layer): layer.down_proj_weight_scale = None create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) - def apply(self, layer, x, gate, topk_ids_hookfunc=None,): + def apply( + self, + layer, + x, + gate, + topk_ids_hookfunc=None, + ): """ flashinfer nvfp4 fusedmoe for Model Optimizer """ @@ -592,7 +598,7 @@ def apply(self, layer, x, gate, topk_ids_hookfunc=None,): True, # apply_norm_weight, False, ) - + if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids) From ca2a6991f92dd788c56392c6c048a6bba9c18b38 Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Tue, 13 Jan 2026 07:28:53 +0000 Subject: [PATCH 19/26] del the PADDLE_COMPATIBLE_API --- docs/quantization/nvfp4.md | 17 +++++------------ docs/zh/quantization/nvfp4.md | 19 +++++-------------- fastdeploy/flashinfer.py | 7 ------- .../layers/quantization/nvfp4.py | 3 +-- tests/quantization/test_modelopt_nvfp4.py | 2 +- 5 files changed, 12 insertions(+), 36 deletions(-) diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md index c8edd091c79..2f1831133b6 100644 --- a/docs/quantization/nvfp4.md +++ b/docs/quantization/nvfp4.md @@ -8,23 +8,16 @@ Based on [FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy s ## How to Use ### Environment Setup +#### Supported Environment - **Supported Hardware**: GPU sm >= 100 - **PaddlePaddle Version**: 3.3.0 or higher -- **Fastdeploy Version**: 2.4.0 or higher +- **Fastdeploy Version**: 2.5.0 or higher -#### 1. Fastdeploy Installation -First, install the Fastdeploy base environment according to the [Fastdeploy NVIDIA GPU Environment Installation Guide](../../get_started/installation/nvidia_gpu.md). - -#### 2. 
Flashinfer Installation -```bash -git clone -b support-paddlepaddle-with-compatible-api-and-tvmffi https://github.com/PFCCLab/flashinfer/ --recursive - -cd flashinfer -python -m pip install -v . -``` +#### FastDeploy Installation +Please ensure that FastDeploy is installed with NVIDIA GPU support. +Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU Environment Installation Guide](../../get_started/installation/nvidia_gpu.md). ### Running Inference Service -- Note: Need to set environment variable `export PADDLE_COMPATIBLE_API=true` and install the corresponding Flashinfer correctly ```bash export PADDLE_COMPATIBLE_API=true python -m fastdeploy.entrypoints.openai.api_server \ diff --git a/docs/zh/quantization/nvfp4.md b/docs/zh/quantization/nvfp4.md index 62e6e36aa57..845a5e6e4a0 100644 --- a/docs/zh/quantization/nvfp4.md +++ b/docs/zh/quantization/nvfp4.md @@ -8,26 +8,17 @@ NVFP4 是 NVIDIA 引入的创新 4 位浮点格式,详细介绍请参考[Intro - 注:目前该功能仅支持Ernie / Qwen系列的FP4量化模型。 ## 如何使用 -### 环境安装 +### 环境准备 +#### 支持环境 - **支持硬件**:GPU sm >= 100 - **PaddlePaddle 版本**:3.3.0 或更高版本 -- **Fastdeploy 版本**:2.4.0 或更高版本 +- **Fastdeploy 版本**:2.5.0 或更高版本 -#### 1. Fastdeploy 安装 -首先请根据[Fastdeploy NVIDIA GPU 环境安装指南](../../get_started/installation/nvidia_gpu.md),安装Fastdeploy基础环境。 - -#### 2. Flashinfer 安装 -```bash -git clone -b support-paddlepaddle-with-compatible-api-and-tvmffi https://github.com/PFCCLab/flashinfer/ --recursive - -cd flashinfer -python -m pip install -v . -``` +#### Fastdeploy 安装 +FastDeploy 需以 NVIDIA GPU 模式安装,具体安装方式请参考官方文档:[Fastdeploy NVIDIA GPU 环境安装指南](../../get_started/installation/nvidia_gpu.md)。 ### 运行推理服务 -- 注意:需要指定环境变量`export PADDLE_COMPATIBLE_API=true`并正确安装对应Flashinfer ```bash -export PADDLE_COMPATIBLE_API=true python -m fastdeploy.entrypoints.openai.api_server \ --model nv-community/Qwen3-30B-A3B-FP4 \ --port 8180 \ diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py index 23634faed5f..2d3d1befce1 100644 --- a/fastdeploy/flashinfer.py +++ b/fastdeploy/flashinfer.py @@ -17,21 +17,14 @@ import functools import importlib import importlib.util -import os import shutil -from paddleformers.utils.log import logger - @functools.cache def has_flashinfer() -> bool: """Return `True` if FlashInfer is available.""" # Use find_spec to check if the module exists without importing it # This avoids potential CUDA initialization side effects - if os.environ.get("PADDLE_COMPATIBLE_API", "0").lower() not in ["1", "on", "true"]: - # currently must support by Paddle compatible API - logger.warning("FlashInfer is not supported by Paddle compatible API.") - return False if importlib.util.find_spec("flashinfer") is None: return False # Also check if nvcc is available since it's required to JIT compile flashinfer diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index fa95433f5a3..d45bf69b99a 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -1,5 +1,5 @@ """ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,7 +32,6 @@ from .quant_base import QuantConfigBase, QuantMethodBase if has_flashinfer(): - # 加一个scope paddle.compat.enable_torch_proxy(scope={"flashinfer"}) from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py index 6015a0dff03..609726659b0 100644 --- a/tests/quantization/test_modelopt_nvfp4.py +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -1,5 +1,5 @@ """ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. From 14fc296e4810c3054f0ed88ec9f1c4013abce2f1 Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Tue, 13 Jan 2026 08:17:46 +0000 Subject: [PATCH 20/26] fix broken url: nvidia_gpu.md --- docs/quantization/nvfp4.md | 2 +- docs/zh/quantization/nvfp4.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md index 2f1831133b6..241bf931241 100644 --- a/docs/quantization/nvfp4.md +++ b/docs/quantization/nvfp4.md @@ -15,7 +15,7 @@ Based on [FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy s #### FastDeploy Installation Please ensure that FastDeploy is installed with NVIDIA GPU support. -Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU Environment Installation Guide](../../get_started/installation/nvidia_gpu.md). +Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU Environment Installation Guide](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/). ### Running Inference Service ```bash diff --git a/docs/zh/quantization/nvfp4.md b/docs/zh/quantization/nvfp4.md index 845a5e6e4a0..656cc267af1 100644 --- a/docs/zh/quantization/nvfp4.md +++ b/docs/zh/quantization/nvfp4.md @@ -15,7 +15,7 @@ NVFP4 是 NVIDIA 引入的创新 4 位浮点格式,详细介绍请参考[Intro - **Fastdeploy 版本**:2.5.0 或更高版本 #### Fastdeploy 安装 -FastDeploy 需以 NVIDIA GPU 模式安装,具体安装方式请参考官方文档:[Fastdeploy NVIDIA GPU 环境安装指南](../../get_started/installation/nvidia_gpu.md)。 +FastDeploy 需以 NVIDIA GPU 模式安装,具体安装方式请参考官方文档:[Fastdeploy NVIDIA GPU 环境安装指南](https://paddlepaddle.github.io/FastDeploy/zh/get_started/installation/nvidia_gpu/)。 ### 运行推理服务 ```bash From a25fea0da153eb9ad7b22fd6ba7a8e82e22c8d0e Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Tue, 13 Jan 2026 10:44:32 +0000 Subject: [PATCH 21/26] fix docs --- docs/quantization/nvfp4.md | 1 - fastdeploy/flashinfer.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md index 241bf931241..e89b31dcd89 100644 --- a/docs/quantization/nvfp4.md +++ b/docs/quantization/nvfp4.md @@ -19,7 +19,6 @@ Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU ### Running Inference Service ```bash -export PADDLE_COMPATIBLE_API=true python -m fastdeploy.entrypoints.openai.api_server \ --model nv-community/Qwen3-30B-A3B-FP4 \ --port 8180 \ diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py index 2d3d1befce1..c76564196a2 100644 --- a/fastdeploy/flashinfer.py +++ b/fastdeploy/flashinfer.py @@ -1,5 +1,5 @@ """ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From b3e600d35fda0392938133c78783c46b3fba6039 Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Mon, 19 Jan 2026 11:22:46 +0000 Subject: [PATCH 22/26] fix token_ids --- .../layers/quantization/__init__.py | 10 ++++++++-- fastdeploy/worker/gpu_model_runner.py | 19 +++++++++++-------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 53fdb7ea0dd..da88bc8330a 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -154,7 +154,6 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .block_wise_fp8 import BlockWiseFP8Config from .kv_cache import KvCacheQuantConfig from .mix_quant import MixQuantConfig - from .nvfp4 import ModelOptNvFp4Config from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -163,6 +162,14 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .wfp8afp8 import WFP8AFP8Config from .wint2 import WINT2Config + if quantization == "modelopt_fp4": + try: + from .nvfp4 import ModelOptNvFp4Config + + return ModelOptNvFp4Config + except ImportError as e: + raise ImportError(f"Failed to import ModelOptNvFp4Config. Details: {e}") + method_to_config: Dict[str, Type[QuantConfigBase]] = { "wint2": WINT2Config, "wint4": WINT4Config, @@ -176,7 +183,6 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: "tensor_wise_fp8": TensorWiseFP8Config, "kvcache": KvCacheQuantConfig, "mix_quant": MixQuantConfig, - "modelopt_fp4": ModelOptNvFp4Config, } return method_to_config[quantization] diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 768e59b2460..511dccb2d77 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1274,12 +1274,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_think_lens"] = paddle.full(shape=[max_num_seqs, 1], fill_value=-1, dtype="int32") self.share_inputs["limit_think_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - # NOTE(liuzichang): token after \n\n\n must be 100973 or 100975 - # It is a hard code to cover up model's performance - # Detailed notes can be found in FastDeploy/custom_ops/gpu_ops/reasoning_phase_token_constraint.cu - self.share_inputs["reasoning_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["reasoning_allowed_tokens"] = paddle.to_tensor([100973, 100975], dtype="int64") - # Initialize rotary position embedding if not self.enable_mm: self.share_inputs["rope_emb"] = get_rope( @@ -2017,6 +2011,17 @@ def _dummy_run( self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph self.padding_cudagraph_inputs() + # Replace uninitialized tensors with valid random token IDs. + if hasattr(self.forward_meta, "ids_remove_padding") and self.forward_meta.ids_remove_padding is not None: + vocab_size = getattr(self.model_config, "vocab_size", 32000) + + self.forward_meta.ids_remove_padding = paddle.randint( + low=0, + high=vocab_size, + shape=self.forward_meta.ids_remove_padding.shape, + dtype=self.forward_meta.ids_remove_padding.dtype, + ) + # 3. 
Run model if self.enable_mm: model_output = self.model( @@ -2766,8 +2771,6 @@ def clear_requests(self): self.prompt_logprobs_reqs.clear() self.in_progress_prompt_logprobs.clear() self.forward_batch_reqs_list = [None for _ in range(self.scheduler_config.max_num_seqs)] - if self.fd_config.routing_replay_config.enable_routing_replay: - self.routing_replay_manager.put_table_to_store() def update_parameters(self, pid): """Dynamic model loader use to update parameters use for RL""" From ee8f622ba6282ba45f0edf5cb1415aad11470c0b Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Mon, 19 Jan 2026 11:41:08 +0000 Subject: [PATCH 23/26] fix CI in Hopper --- fastdeploy/worker/gpu_model_runner.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index fde07760d46..e979a1ee1ca 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1274,6 +1274,12 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_think_lens"] = paddle.full(shape=[max_num_seqs, 1], fill_value=-1, dtype="int32") self.share_inputs["limit_think_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + # NOTE(liuzichang): token after \n\n\n must be 100973 or 100975 + # It is a hard code to cover up model's performance + # Detailed notes can be found in FastDeploy/custom_ops/gpu_ops/reasoning_phase_token_constraint.cu + self.share_inputs["reasoning_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + self.share_inputs["reasoning_allowed_tokens"] = paddle.to_tensor([100973, 100975], dtype="int64") + # Initialize rotary position embedding if not self.enable_mm: self.share_inputs["rope_emb"] = get_rope( @@ -2015,10 +2021,8 @@ def _dummy_run( self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph self.padding_cudagraph_inputs() - # Replace uninitialized tensors with valid random token IDs. 
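The hunk above (and the block that follows) touches the dummy-run path where the flattened token-id buffer is filled with random but in-vocabulary ids, so that profiling or capture passes never index the embedding table out of range. A minimal sketch of that mechanism, with assumed names rather than the runner's real attributes, is:

```python
import paddle

# Illustrative sketch of the dummy-run token-id fill; "ids" and "vocab_size" are
# assumed names, not the model runner's real attributes.
def fill_dummy_token_ids(ids: paddle.Tensor, vocab_size: int) -> paddle.Tensor:
    return paddle.randint(low=0, high=vocab_size, shape=ids.shape, dtype=ids.dtype)


ids = paddle.zeros([8], dtype="int64")
ids = fill_dummy_token_ids(ids, vocab_size=32000)
```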
if hasattr(self.forward_meta, "ids_remove_padding") and self.forward_meta.ids_remove_padding is not None: vocab_size = getattr(self.model_config, "vocab_size", 32000) - self.forward_meta.ids_remove_padding = paddle.randint( low=0, high=vocab_size, @@ -2775,6 +2779,8 @@ def clear_requests(self): self.prompt_logprobs_reqs.clear() self.in_progress_prompt_logprobs.clear() self.forward_batch_reqs_list = [None for _ in range(self.scheduler_config.max_num_seqs)] + if self.fd_config.routing_replay_config.enable_routing_replay: + self.routing_replay_manager.put_table_to_store() def update_parameters(self, pid): """Dynamic model loader use to update parameters use for RL""" From 4057e1e1ab17eac96962bba23413c3d49428c7bb Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Tue, 20 Jan 2026 08:26:35 +0000 Subject: [PATCH 24/26] move flashinfer imports inside the function --- .../model_executor/layers/quantization/__init__.py | 10 ++-------- .../model_executor/layers/quantization/nvfp4.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index da88bc8330a..53fdb7ea0dd 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -154,6 +154,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .block_wise_fp8 import BlockWiseFP8Config from .kv_cache import KvCacheQuantConfig from .mix_quant import MixQuantConfig + from .nvfp4 import ModelOptNvFp4Config from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -162,14 +163,6 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .wfp8afp8 import WFP8AFP8Config from .wint2 import WINT2Config - if quantization == "modelopt_fp4": - try: - from .nvfp4 import ModelOptNvFp4Config - - return ModelOptNvFp4Config - except ImportError as e: - raise ImportError(f"Failed to import ModelOptNvFp4Config. Details: {e}") - method_to_config: Dict[str, Type[QuantConfigBase]] = { "wint2": WINT2Config, "wint4": WINT4Config, @@ -183,6 +176,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: "tensor_wise_fp8": TensorWiseFP8Config, "kvcache": KvCacheQuantConfig, "mix_quant": MixQuantConfig, + "modelopt_fp4": ModelOptNvFp4Config, } return method_to_config[quantization] diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index d45bf69b99a..cd17b53f34b 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -33,9 +33,6 @@ if has_flashinfer(): paddle.compat.enable_torch_proxy(scope={"flashinfer"}) - from flashinfer import fp4_quantize - from flashinfer import mm_fp4 as fp4_gemm - from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe else: logger.warning("FlashInfer is not installed. 
For nvFp4 inference, please install Flashinfer.") @@ -327,6 +324,8 @@ def apply( output_dtype = x.dtype # Quantize BF16 or FP16 to (FP4 and interleaved block scale) + from flashinfer import fp4_quantize + x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv) assert x_fp4.dtype == paddle.uint8 @@ -345,6 +344,8 @@ def apply( if backend == "cutlass": x_scale_interleaved = x_scale_interleaved.view(paddle.uint8) w_scale_interleaved = w_scale_interleaved.view(paddle.uint8) + from flashinfer import mm_fp4 as fp4_gemm + out = fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved, layer.alpha, output_dtype, backend=backend) if layer.with_bias: out = paddle.add(out, layer.bias) @@ -607,6 +608,10 @@ def apply( if self.backend == "flashinfer-cutlass": # flashinfer cutlass + from flashinfer.fused_moe import ( + cutlass_fused_moe as flashinfer_cutlass_fused_moe, + ) + _ = flashinfer_cutlass_fused_moe( input=x, token_selected_experts=topk_ids.to(paddle.int), From f9ec3445ab98f2471918dce11a80baf7fcd5c9b7 Mon Sep 17 00:00:00 2001 From: xxxuan <157974576+Echo-Nie@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:14:01 +0800 Subject: [PATCH 25/26] fix model_runner Removed the logic for generating random padding IDs. --- fastdeploy/worker/gpu_model_runner.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 116f6f5270a..26872b4aa06 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2030,15 +2030,6 @@ def _dummy_run( self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph self.padding_cudagraph_inputs() - if hasattr(self.forward_meta, "ids_remove_padding") and self.forward_meta.ids_remove_padding is not None: - vocab_size = getattr(self.model_config, "vocab_size", 32000) - self.forward_meta.ids_remove_padding = paddle.randint( - low=0, - high=vocab_size, - shape=self.forward_meta.ids_remove_padding.shape, - dtype=self.forward_meta.ids_remove_padding.dtype, - ) - # 3. Run model if self.enable_mm: model_output = self.model( From fb71ccaf511dcbac21ca06d770edf1f1c896466f Mon Sep 17 00:00:00 2001 From: xxxuan <157974576+Echo-Nie@users.noreply.github.com> Date: Thu, 22 Jan 2026 17:47:40 +0800 Subject: [PATCH 26/26] Remove skip condition for CUDA version in nvfp4 test --- tests/quantization/test_modelopt_nvfp4.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py index 609726659b0..577864adc15 100644 --- a/tests/quantization/test_modelopt_nvfp4.py +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -35,10 +35,6 @@ def get_sm_version(): return cc -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_sm_version() < 100, - "Nvfp4 do not support sm < 100.", -) class TestModelOptNvFp4Config(unittest.TestCase): def setUp(self): prop = paddle.device.cuda.get_device_properties()
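One pattern worth calling out from the later patches is the guarded, function-local import of the optional flashinfer dependency: importing the quantization module never pulls in flashinfer (or triggers its JIT/CUDA setup) on machines that do not have it. A hedged sketch of the idea, using illustrative helper names rather than FastDeploy's real ones, is:

```python
import importlib.util


def _has_flashinfer() -> bool:
    # Cheap availability check that avoids importing (and initializing) the package.
    return importlib.util.find_spec("flashinfer") is not None


def quantize_to_fp4(x, global_scale):
    if not _has_flashinfer():
        raise RuntimeError("flashinfer is required for NVFP4 inference but is not installed")
    # Imported lazily, only on the hot path that actually needs it.
    from flashinfer import fp4_quantize

    return fp4_quantize(x, global_scale)
```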