From 52500854c106b8cf1bbb7c6d00e2fe30f11df441 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Fri, 24 Oct 2025 10:39:24 +0800 Subject: [PATCH 01/26] fp4 dense --- fastdeploy/envs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index 3b0be3df998..f7bd505ca06 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -126,6 +126,8 @@ "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), + # FP4 dense GEMM backend + "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_GEMM_BACKEND", None), } From b0c863a0dea315e10f512b99db0029a85d942e3b Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Mon, 27 Oct 2025 18:49:31 +0800 Subject: [PATCH 02/26] [WIP] support nvfp4, dense part --- fastdeploy/flashinfer.py | 35 +++ .../layers/quantization/__init__.py | 3 + .../layers/quantization/nvfp4.py | 276 ++++++++++++++++++ 3 files changed, 314 insertions(+) create mode 100644 fastdeploy/flashinfer.py create mode 100644 fastdeploy/model_executor/layers/quantization/nvfp4.py diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py new file mode 100644 index 00000000000..4bc6aa994f2 --- /dev/null +++ b/fastdeploy/flashinfer.py @@ -0,0 +1,35 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import functools +import importlib +import importlib.util +import shutil + + +@functools.cache +def has_flashinfer() -> bool: + """Return `True` if FlashInfer is available.""" + # Use find_spec to check if the module exists without importing it + # This avoids potential CUDA initialization side effects + if importlib.util.find_spec("flashinfer") is None: + # logger.debug_once("FlashInfer unavailable since package was not found") + return False + # Also check if nvcc is available since it's required to JIT compile flashinfer + if shutil.which("nvcc") is None: + # logger.debug_once("FlashInfer unavailable since nvcc was not found") + return False + return True diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index f8716369852..1c9a169a5c4 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -33,6 +33,7 @@ "mix_quant", "tensor_wise_fp8", "kvcache", + "modelopt_fp4", ] @@ -116,6 +117,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .block_wise_fp8 import BlockWiseFP8Config from .kv_cache import KvCacheQuantConfig from .mix_quant import MixQuantConfig + from .nvfp4 import ModelOptNvFp4Config from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -137,6 +139,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: "tensor_wise_fp8": TensorWiseFP8Config, "kvcache": KvCacheQuantConfig, "mix_quant": MixQuantConfig, + "modelopt_fp4": ModelOptNvFp4Config, } return method_to_config[quantization] diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py new file mode 100644 index 00000000000..e686512d50b --- /dev/null +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -0,0 +1,276 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + +from typing import Optional + +import paddle +from paddleformers.utils.log import logger + +from fastdeploy import envs +from fastdeploy.flashinfer import has_flashinfer +from fastdeploy.model_executor.layers.moe import FusedMoE + +from .quant_base import QuantConfigBase, QuantMethodBase + +if has_flashinfer(): + from flashinfer import fp4_quantize as scaled_fp4_quant # need to use vllm version + from flashinfer import mm_fp4 as fp4_gemm + + +def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: + """ + Pad and block-interleave the FP4 block-scales so that they match the data + layout expected by the CUTLASS / FlashInfer kernels. + + Parameters + ---------- + scale: paddle.Tensor + + Returns + ------- + torch.Tensor + The swizzled tensor with the same logical shape as *scale*. + """ + assert scale.dtype == paddle.float8_e4m3fn, ( + "swizzle_blockscale expects the input tensor to be in " "paddle.float8_e4m3fn format." 
+ ) + + scale_ndim = scale.ndim + if scale_ndim == 2: + scale = scale.unsqueeze(0) # (1, M, K) + assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales." + + B, M, K = scale.shape + + def _round_up(x: int, m: int) -> int: + return (x + m - 1) // m * m + + M_padded = _round_up(M, 128) + K_padded = _round_up(K, 4) + + padded = paddle.zeros((B, M_padded, K_padded), dtype=scale.dtype, device=scale.place) + padded[:B, :M, :K] = scale + + # Reshape / permute to the layout required by the kernel. + padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4) + swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda() + + if scale_ndim == 2: + return swizzled.reshape(M_padded, K_padded) + return swizzled.reshape(B, M_padded, K_padded) + + +class ModelOptNvFp4Config(QuantConfigBase): + """ + quantization config for ModelOpt Nvfp4 datatype + """ + + def __init__( + self, + is_checkpoint_nvfp4_serialized: bool, + kv_cache_quant_algo: str | None, + exclude_modules: list[str], + group_size: int = 16, + ) -> None: + self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized + if is_checkpoint_nvfp4_serialized: + logger.warning( + "Detected ModelOpt NVFP4 checkpoint. Please note that" + " the format is experimental and could change in future." + ) + + self.group_size = group_size + self.kv_cache_quant_algo = kv_cache_quant_algo + self.exclude_modules = exclude_modules + + def name(self) -> str: + return "modelopt_fp4" + + @classmethod + def from_config(cls, config: dict) -> "ModelOptNvFp4Config": + if "quantization" in config: + # Traditional ModelOpt format: + # {"quantization": {"quant_algo": "..."}} + quant_config = cls.get_from_keys(config, ["quantization"]) + if not isinstance(quant_config, dict): + raise ValueError("Expected 'quantization' to be a dictionary in config") + + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw + else: + raise ValueError(f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_algo_raw)}") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError(f"group_size must be an integer, got {type(group_size_raw)}") from None + + # "exclude_modules" is the key in the legacy hf_quant_config.json + exclude_modules = quant_config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError(f"exclude_modules must be a list, got {type(exclude_modules)}") + else: + raise ValueError( + "Missing 'quantization' section in config. Please make sure your model is exported using FastDeploy." 
+ ) + + is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method + + # For FP4, these fields are required + if is_checkpoint_nvfp4_serialized and "quantization" in config: + # Check if required fields are present in the quantization config + quant_config = config["quantization"] + required_fields = ["group_size", "kv_cache_quant_algo", "exclude_modules"] + missing_fields = [field for field in required_fields if field not in quant_config] + if missing_fields: + raise ValueError( + f"NVFP4 quantization requires the following fields in " f"hf_quant_config.json: {missing_fields}" + ) + + return cls( + is_checkpoint_nvfp4_serialized=is_checkpoint_nvfp4_serialized, + kv_cache_quant_algo=kv_cache_quant_algo, + exclude_modules=exclude_modules, + group_size=group_size, + ) + + def get_quant_method(self, layer) -> Optional[QuantMethodBase]: + """ + Get quantization method. + """ + # skip_layer = self.is_layer_excluded(prefix) + if isinstance(layer, FusedMoE): + # if skip_layer: + # return None + return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) + else: + # LinearBase + # if skip_layer: + # return UnquantizedLinearMethod() + # Check if this is a vision model layer that should not be quantized + # if "vision_tower" in prefix or "vision_model" in prefix: + # return UnquantizedLinearMethod() + return ModelOptNvFp4LinearMethod(self) + + return None + + +class ModelOptNvFp4LinearMethod(QuantMethodBase): + """Linear method for Model Optimizer NVFP4. + Supports loading NVFP4 checkpoints with the following structure: + + input_scale: paddle.float32, scalar , + weight: NVFP4(represented as byte) Shape: [1, X, y/2] + weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale, + weight_scale_2: paddle.float32, scalar, + Args: quant_config: The ModelOpt quantization config. + """ + + def __init__(self, quant_config: ModelOptNvFp4Config) -> None: + self.quant_config = quant_config + + self.backend = "none" + if envs.FD_NVFP4_GEMM_BACKEND is None: + if has_flashinfer(): + self.backend = "flashinfer-cutlass" + elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"): + self.backend = envs.FD_NVFP4_GEMM_BACKEND + assert has_flashinfer(), f"FlashInfer is required for {self.backend}" + + if self.backend == "none": + raise ValueError("No valid NVFP4 GEMM backend found. 
" "Please check your platform capability.") + + logger.info(f"Using {self.backend} for NVFP4 GEMM") + + def create_weights( + self, + layer, + **extra_weight_attrs, + ): + return + + def process_weights_after_loading(self, layer) -> None: + return + + def apply( + self, + layer, + x, + ): + x_m, _ = x.shape + w_n, _ = layer.weight.shape + output_shape = [x_m, w_n] + output_dtype = x.dtype + + # Quantize BF16 or FP16 to (FP4 and interleaved block scale) + x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv) + + assert x_fp4.dtype == paddle.uint8 + assert x_scale_interleaved.dtype == paddle.float8_e4m3fn + assert layer.weight.dtype == paddle.uint8 + assert layer.weight_scale_interleaved.dtype == paddle.float8_e4m3fn + assert layer.alpha.dtype == paddle.float32 + + if self.backend.startswith("flashinfer-"): + backend = self.backend[len("flashinfer-") :] + else: + raise ValueError(f"Unsupported backend: {self.backend}.") + + w = layer.weight.T + w_scale_interleaved = layer.weight_scale_interleaved.T + + if backend == "cutlass": + x_scale_interleaved = x_scale_interleaved.view(paddle.uint8) + w_scale_interleaved = w_scale_interleaved.view(paddle.uint8) + out = fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved, layer.alpha, output_dtype, backend=backend) + + if layer.with_bias: + out = paddle.add(out, layer.bias) + return out.view(*output_shape) + + +class ModelOptNvFp4FusedMoE: + """Fused MoE method for Model Optimizer NVFP4. + Supports loading NVFP4 checkpoints with the following structure: + + input_scale: paddle.float32, scalar , + weight: NVFP4(represented as byte) Shape: [1, X, y/2] + weight_scale: FP8-E4M3, Shape: [X, Y], aka per block scale, + weight_scale_2: paddle.float32, scalar, + Args: + quant_config: The ModelOpt quantization config. + moe_config: The MoE configuration. + layer: The linear layer. 
+ """ + + def __init__(self): + pass From d5f3fd269d3e552f935c6d21398e1321d949a9f3 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Tue, 28 Oct 2025 18:29:34 +0800 Subject: [PATCH 03/26] [wip] developing loading qwen model --- .../layers/quantization/__init__.py | 5 + .../layers/quantization/nvfp4.py | 121 ++++++++++++------ 2 files changed, 84 insertions(+), 42 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 1c9a169a5c4..a6bffde03db 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -100,6 +100,11 @@ def _get_offline_quant_config_name(quantization_config, is_torch_weight, is_v1_l has_block_size = "weight_block_size" in quantization_config if quant_method == "fp8" and has_block_size: quant_config_name = "block_wise_fp8" + elif quant_method == "modelopt": + if quantization_config.get("quant_algo", "") == "NVFP4": + quant_config_name = "modelopt_fp4" + else: + raise ValueError("modelopt only supports NVFP4 quantization.") else: raise ValueError("Torch weight offline quantization only supports block-wise FP8.") else: diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index e686512d50b..ebd122a1b84 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -17,6 +17,9 @@ from typing import Optional import paddle + +paddle.compat.enable_torch_proxy() + from paddleformers.utils.log import logger from fastdeploy import envs @@ -96,52 +99,46 @@ def __init__( self.kv_cache_quant_algo = kv_cache_quant_algo self.exclude_modules = exclude_modules + self.quant_max_bound = 6 + self.quant_min_bound = -6 + self.quant_round_type = 1 + def name(self) -> str: return "modelopt_fp4" @classmethod def from_config(cls, config: dict) -> "ModelOptNvFp4Config": - if "quantization" in config: - # Traditional ModelOpt format: - # {"quantization": {"quant_algo": "..."}} - quant_config = cls.get_from_keys(config, ["quantization"]) - if not isinstance(quant_config, dict): - raise ValueError("Expected 'quantization' to be a dictionary in config") - - quant_method = quant_config.get("quant_algo", "") - if not quant_method: - raise ValueError("Missing 'quant_algo' in quantization config") - - # Handle kv_cache_quant_algo with proper type validation - kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") - if kv_cache_quant_algo_raw is None: - # No KV cache quantization by default - kv_cache_quant_algo = None - elif isinstance(kv_cache_quant_algo_raw, str): - kv_cache_quant_algo = kv_cache_quant_algo_raw - else: - raise ValueError(f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_algo_raw)}") - - # Handle group_size with proper type validation - group_size_raw = quant_config.get("group_size") - if group_size_raw is None: - group_size = 16 # Default value - elif isinstance(group_size_raw, int): - group_size = group_size_raw - else: - try: - group_size = int(group_size_raw) - except (ValueError, TypeError): - raise ValueError(f"group_size must be an integer, got {type(group_size_raw)}") from None - - # "exclude_modules" is the key in the legacy hf_quant_config.json - exclude_modules = quant_config.get("exclude_modules", []) - if not isinstance(exclude_modules, list): - raise ValueError(f"exclude_modules must be a list, got {type(exclude_modules)}") + quant_config = 
config + quant_method = quant_config.get("quant_algo", "") + if not quant_method: + raise ValueError("Missing 'quant_algo' in quantization config") + + # Handle kv_cache_quant_algo with proper type validation + kv_cache_quant_algo_raw = quant_config.get("kv_cache_quant_algo") + if kv_cache_quant_algo_raw is None: + # No KV cache quantization by default + kv_cache_quant_algo = None + elif isinstance(kv_cache_quant_algo_raw, str): + kv_cache_quant_algo = kv_cache_quant_algo_raw else: - raise ValueError( - "Missing 'quantization' section in config. Please make sure your model is exported using FastDeploy." - ) + raise ValueError(f"kv_cache_quant_algo must be a string, got " f"{type(kv_cache_quant_algo_raw)}") + + # Handle group_size with proper type validation + group_size_raw = quant_config.get("group_size") + if group_size_raw is None: + group_size = 16 # Default value + elif isinstance(group_size_raw, int): + group_size = group_size_raw + else: + try: + group_size = int(group_size_raw) + except (ValueError, TypeError): + raise ValueError(f"group_size must be an integer, got {type(group_size_raw)}") from None + + # "exclude_modules" is the key in the legacy hf_quant_config.json + exclude_modules = quant_config.get("exclude_modules", []) + if not isinstance(exclude_modules, list): + raise ValueError(f"exclude_modules must be a list, got {type(exclude_modules)}") is_checkpoint_nvfp4_serialized = "NVFP4" in quant_method @@ -216,10 +213,50 @@ def create_weights( layer, **extra_weight_attrs, ): - return + if not self.quant_config.is_checkpoint_nvfp4_serialized: + raise ValueError("NVFP4 quantization was selected, " " dynamic quantization is not supported.") + + input_size = layer.weight_shape[0] + output_size = layer.weight_shape[1] + if input_size % 16 != 0: + raise ValueError("Unsupported model when in features size is not multiple of 16") + # Weight + # 2 fp4 items are packed in the input dimension + print("====aaaaaa======= [output_size, input_size // 2]", [output_size, input_size // 2]) + layer.weight = layer.create_parameter( + shape=[output_size, input_size // 2], + dtype=paddle.uint8, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch" + + # Input Weight Scale + layer.input_scale = layer.create_parameter( + shape=[], # output_size + dtype=paddle.float32, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + + # Global Weight Scale + layer.weight_scale_2 = layer.create_parameter( + shape=[], # output_size + dtype=paddle.float32, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + + # Per Block Weight Scale + layer.weight_scale = layer.create_parameter( + shape=[output_size, input_size // self.quant_config.group_size], + dtype=paddle.float8_e4m3fn, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) def process_weights_after_loading(self, layer) -> None: - return + raise ValueError("eeeeeeee") def apply( self, From 1176caea3bb72351bfd18265854dff799722fe6a Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Wed, 5 Nov 2025 12:33:19 +0000 Subject: [PATCH 04/26] loading --- .../layers/quantization/nvfp4.py | 113 +++++++++++++++--- 1 file changed, 99 insertions(+), 14 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index ebd122a1b84..16295c4296b 100644 --- 
a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -25,6 +25,7 @@ from fastdeploy import envs from fastdeploy.flashinfer import has_flashinfer from fastdeploy.model_executor.layers.moe import FusedMoE +from fastdeploy.model_executor.utils import free_tensor, set_weight_attrs from .quant_base import QuantConfigBase, QuantMethodBase @@ -213,27 +214,40 @@ def create_weights( layer, **extra_weight_attrs, ): - if not self.quant_config.is_checkpoint_nvfp4_serialized: - raise ValueError("NVFP4 quantization was selected, " " dynamic quantization is not supported.") - input_size = layer.weight_shape[0] - output_size = layer.weight_shape[1] - if input_size % 16 != 0: - raise ValueError("Unsupported model when in features size is not multiple of 16") + # if not self.quant_config.is_checkpoint_nvfp4_serialized: + # raise ValueError("NVFP4 quantization was selected, " " dynamic quantization is not supported.") + + # input_size = layer.weight_shape[0] + # output_size = layer.weight_shape[1] + # if input_size % 16 != 0: + # raise ValueError("Unsupported model when in features size is not multiple of 16") # Weight # 2 fp4 items are packed in the input dimension - print("====aaaaaa======= [output_size, input_size // 2]", [output_size, input_size // 2]) + # weight_scale_shape = [layer.weight_shape[1]] + # layer.weight_shape.reverse() + dim = -1 if extra_weight_attrs["output_dim"] else 0 + extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"] + weight_shape = layer.weight_shape[::-1] + weight_shape[dim] = weight_shape[dim] // 2 + layer.weight_dtype = "uint8" + input_scale_shape = [1] + weight_scale_shape = [layer.weight_shape[::-1][0], layer.weight_shape[::-1][1] // self.quant_config.group_size] + weight_scale_2_shape = [1] layer.weight = layer.create_parameter( - shape=[output_size, input_size // 2], - dtype=paddle.uint8, + shape=weight_shape, + dtype=layer.weight_dtype, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) - extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch" + set_weight_attrs( + layer.weight, + extra_weight_attrs, + ) # Input Weight Scale layer.input_scale = layer.create_parameter( - shape=[], # output_size + shape=input_scale_shape, # output_size dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), @@ -241,7 +255,7 @@ def create_weights( # Global Weight Scale layer.weight_scale_2 = layer.create_parameter( - shape=[], # output_size + shape=weight_scale_2_shape, # output_size dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), @@ -249,14 +263,85 @@ def create_weights( # Per Block Weight Scale layer.weight_scale = layer.create_parameter( - shape=[output_size, input_size // self.quant_config.group_size], + shape=weight_scale_shape, dtype=paddle.float8_e4m3fn, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) + set_weight_attrs( + layer.weight_scale, + extra_weight_attrs, + ) def process_weights_after_loading(self, layer) -> None: - raise ValueError("eeeeeeee") + # if + def _process_scale_interleaved(scales): + scale_dim = len(scales.shape) + if scale_dim == 2: + scales = scales.unsqueeze(0) + assert len(scales.shape) == 3 + B, M, K = scales.shape + round_up_multiple = lambda x, m: (x + m - 1) // m * m + M_padded = round_up_multiple(M, 128) + K_padded = round_up_multiple(K, 4) + padded_scales = paddle.empty([B, M_padded, K_padded], 
dtype=scales.dtype) + padded_scales[:B, :M, :K].copy_(scales) + batches, rows, cols = padded_scales.shape + assert rows % 128 == 0 + assert cols % 4 == 0 + padded_scales = padded_scales.reshape(batches, rows // 128, 4, 32, cols // 4, 4) + padded_scales = padded_scales.transpose([0, 1, 4, 3, 2, 5]) + padded_scales = padded_scales.contiguous().to(paddle.device.get_device()) + padded_scales = ( + padded_scales.reshape(M_padded, K_padded) + if scale_dim == 2 + else padded_scales.reshape(B, M_padded, K_padded) + ) + return padded_scales + + input_scale_2 = layer.input_scale.max().to(paddle.float32) + weight_scale_2 = layer.weight_scale_2.max().to(paddle.float32) + alpha = input_scale_2 * weight_scale_2 + input_scale_inv = (1 / input_scale_2).to(paddle.float32) + weight_scale_interleaved = _process_scale_interleaved(layer.weight_scale) + free_tensor(layer.input_scale) + free_tensor(layer.weight_scale_2) + + layer.weight_scale_2 = layer.create_parameter( + shape=weight_scale_2.shape, # output_size + dtype=weight_scale_2.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.input_scale = layer.create_parameter( + shape=input_scale_2.shape, # output_size + dtype=input_scale_2.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.alpha = layer.create_parameter( + shape=alpha.shape, # output_size + dtype=alpha.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.input_scale_inv = layer.create_parameter( + shape=input_scale_inv.shape, # output_size + dtype=input_scale_inv.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.weight_scale_interleaved = layer.create_parameter( + shape=weight_scale_interleaved.shape, + dtype=weight_scale_interleaved.dtype, + is_bias=False, + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.weight_scale_2.copy_(weight_scale_2, False) + layer.input_scale.copy_(input_scale_2, False) + layer.alpha.copy_(alpha, False) + layer.input_scale_inv.copy_(input_scale_inv, False) + layer.weight_scale_interleaved.copy_(weight_scale_interleaved, False) def apply( self, From 71370546bef68361660c04484b78f5101c699cc9 Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Thu, 6 Nov 2025 10:15:05 +0000 Subject: [PATCH 05/26] update --- fastdeploy/model_executor/layers/quantization/nvfp4.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 16295c4296b..ad751f3b4c1 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -226,10 +226,9 @@ def create_weights( # 2 fp4 items are packed in the input dimension # weight_scale_shape = [layer.weight_shape[1]] # layer.weight_shape.reverse() - dim = -1 if extra_weight_attrs["output_dim"] else 0 extra_weight_attrs["output_dim"] = not extra_weight_attrs["output_dim"] weight_shape = layer.weight_shape[::-1] - weight_shape[dim] = weight_shape[dim] // 2 + weight_shape[1] = weight_shape[1] // 2 layer.weight_dtype = "uint8" input_scale_shape = [1] weight_scale_shape = [layer.weight_shape[::-1][0], layer.weight_shape[::-1][1] // self.quant_config.group_size] From 059409088d6c4b2f9e9a10d13a0712b929b011ed Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Thu, 6 Nov 2025 18:28:56 +0800 Subject: [PATCH 06/26] dense fp4 OK, cudagraph error --- 
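Note on the dense NVFP4 layout the previous patches assume: each weight row packs two E2M1 (FP4) values into one uint8 along the input dimension, every 16 input elements share one FP8-E4M3 block scale, and the scalar weight_scale_2 restores the global range (E2M1 only represents magnitudes up to 6, which matches quant_max_bound in the config). A minimal NumPy sketch of the dequantization, for reference only; the low/high nibble order and the float32 view of the block scales are assumptions made for readability, not taken from the kernels:

    import numpy as np

    E2M1_LUT = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0], dtype=np.float32)

    def dequant_nvfp4_row(packed_u8, block_scales, weight_scale_2, group_size=16):
        """packed_u8: uint8[K // 2], block_scales: float32[K // group_size]."""
        codes = np.empty(packed_u8.size * 2, dtype=np.uint8)
        codes[0::2] = packed_u8 & 0x0F   # assumed: even elements in the low nibble
        codes[1::2] = packed_u8 >> 4     # assumed: odd elements in the high nibble
        sign = np.where(codes & 0x8, -1.0, 1.0).astype(np.float32)
        vals = sign * E2M1_LUT[codes & 0x7]
        # one FP8-E4M3 scale per group of 16 elements, plus the global per-tensor scale
        return vals * np.repeat(block_scales, group_size) * weight_scale_2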
fastdeploy/model_executor/layers/quantization/nvfp4.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index ad751f3b4c1..cce2930b294 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -30,7 +30,7 @@ from .quant_base import QuantConfigBase, QuantMethodBase if has_flashinfer(): - from flashinfer import fp4_quantize as scaled_fp4_quant # need to use vllm version + from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm @@ -353,10 +353,9 @@ def apply( output_dtype = x.dtype # Quantize BF16 or FP16 to (FP4 and interleaved block scale) - x_fp4, x_scale_interleaved = scaled_fp4_quant(x, layer.input_scale_inv) + x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv) assert x_fp4.dtype == paddle.uint8 - assert x_scale_interleaved.dtype == paddle.float8_e4m3fn assert layer.weight.dtype == paddle.uint8 assert layer.weight_scale_interleaved.dtype == paddle.float8_e4m3fn assert layer.alpha.dtype == paddle.float32 From ae80853c1aa11c24c832105459259beafc18c807 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Fri, 7 Nov 2025 18:14:54 +0800 Subject: [PATCH 07/26] [WIP] moe forward part --- fastdeploy/envs.py | 4 +- .../layers/quantization/nvfp4.py | 57 ++++++++++++++++++- 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index f7bd505ca06..fa7bb839fc1 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -124,10 +124,10 @@ "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # Timeout for cache_transfer_manager process exit "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), + # FP4 dense GEMM backend, could be flashinfer-cutlass, flashinfer-trtllm, flashinfer-cudnn or None (default is cutlass) + "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_GEMM_BACKEND", None), # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), - # FP4 dense GEMM backend - "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_GEMM_BACKEND", None), } diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index cce2930b294..327d2f06d3d 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -25,6 +25,7 @@ from fastdeploy import envs from fastdeploy.flashinfer import has_flashinfer from fastdeploy.model_executor.layers.moe import FusedMoE +from fastdeploy.model_executor.ops.gpu import moe_topk_select from fastdeploy.model_executor.utils import free_tensor, set_weight_attrs from .quant_base import QuantConfigBase, QuantMethodBase @@ -32,6 +33,7 @@ if has_flashinfer(): from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm + from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: @@ -77,6 +79,10 @@ def _round_up(x: int, m: int) -> int: return swizzled.reshape(B, M_padded, K_padded) +def next_power_of_2(n: int): + return 1 << (n - 1).bit_length() if n > 0 else 1 + + class ModelOptNvFp4Config(QuantConfigBase): """ quantization config for ModelOpt Nvfp4 datatype @@ -378,7 +384,7 @@ def apply( return 
out.view(*output_shape) -class ModelOptNvFp4FusedMoE: +class ModelOptNvFp4FusedMoE(QuantMethodBase): """Fused MoE method for Model Optimizer NVFP4. Supports loading NVFP4 checkpoints with the following structure: @@ -392,5 +398,52 @@ class ModelOptNvFp4FusedMoE: layer: The linear layer. """ - def __init__(self): + def __init__(self, quant_config: ModelOptNvFp4Config): + self.quant_config = quant_config + + def create_weights(self, layer): pass + + def apply(self, layer, x, gate): + """ + flashinfer nvfp4 fusedmoe for Model Optimizer + """ + gate_out = gate(x.cast("float32")) + topk_ids, topk_weights = moe_topk_select( + gate_out, + layer.gate_correction_bias, + layer.top_k, + True, # apply_norm_weight, + False, + ) + + output_dtype = x.dtype + x_sf = None + + output = paddle.empty_like(x) + # flashinfer cutlass + _ = flashinfer_cutlass_fused_moe( + input=x, + token_selected_experts=topk_ids.to(paddle.int), + token_final_scales=topk_weights, + fc1_expert_weights=layer.w13_weight.view(paddle.long), + fc2_expert_weights=layer.w2_weight.view(paddle.long), + output_dtype=output_dtype, + input_sf=x_sf, + quant_scales=[ + layer.w13_input_scale_quant, + layer.w13_blockscale_swizzled.view(paddle.int32), + layer.g1_alphas, + layer.w2_input_scale_quant, + layer.w2_blockscale_swizzled.view(paddle.int32), + layer.g2_alphas, + ], + ep_size=layer.ep_size, + ep_rank=layer.ep_rank, + tp_size=layer.tp_size, + tp_rank=layer.tp_rank, + tune_max_num_tokens=next_power_of_2(x.shape[0]), + output=output, + ) + + return output From 6b2ebd66fd9562f6c8599b9fe7907c36e43d4211 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Fri, 14 Nov 2025 17:29:41 +0800 Subject: [PATCH 08/26] with flashinfer-backend --- fastdeploy/envs.py | 6 +- .../layers/quantization/nvfp4.py | 73 ++++++++++++------- 2 files changed, 49 insertions(+), 30 deletions(-) diff --git a/fastdeploy/envs.py b/fastdeploy/envs.py index fa7bb839fc1..3a8c95b708b 100644 --- a/fastdeploy/envs.py +++ b/fastdeploy/envs.py @@ -124,8 +124,10 @@ "FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", ""), # Timeout for cache_transfer_manager process exit "FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")), - # FP4 dense GEMM backend, could be flashinfer-cutlass, flashinfer-trtllm, flashinfer-cudnn or None (default is cutlass) - "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_GEMM_BACKEND", None), + # FP4 dense GEMM backend, could be flashinfer-cutlass, flashinfer-trtllm, flashinfer-cudnn or None (default is None) + "FD_NVFP4_GEMM_BACKEND": lambda: os.getenv("FD_NVFP4_MOE_BACKEND", None), + # Flahinfer MOE backend, could be flashinfer-cutlass, flashinfer-trtllm or None (default is None) + "FD_FLASHINFER_MOE_BACKEND": lambda: os.getenv("FD_FLASHINFER_MOE_BACKEND", None), # Count for cache_transfer_manager process error "FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")), } diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 327d2f06d3d..0a34d529a95 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -34,6 +34,8 @@ from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe +else: + logger.warning("FlashInfer is not installed. 
For nvFp4 inference, please install Flashinfer.") def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: @@ -47,7 +49,7 @@ def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: Returns ------- - torch.Tensor + paddle.Tensor The swizzled tensor with the same logical shape as *scale*. """ assert scale.dtype == paddle.float8_e4m3fn, ( @@ -206,7 +208,7 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None: if envs.FD_NVFP4_GEMM_BACKEND is None: if has_flashinfer(): self.backend = "flashinfer-cutlass" - elif envs.VLLM_NVFP4_GEMM_BACKEND.startswith("flashinfer-"): + elif envs.FD_NVFP4_GEMM_BACKEND.startswith("flashinfer-"): self.backend = envs.FD_NVFP4_GEMM_BACKEND assert has_flashinfer(), f"FlashInfer is required for {self.backend}" @@ -378,7 +380,6 @@ def apply( x_scale_interleaved = x_scale_interleaved.view(paddle.uint8) w_scale_interleaved = w_scale_interleaved.view(paddle.uint8) out = fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved, layer.alpha, output_dtype, backend=backend) - if layer.with_bias: out = paddle.add(out, layer.bias) return out.view(*output_shape) @@ -400,6 +401,18 @@ class ModelOptNvFp4FusedMoE(QuantMethodBase): def __init__(self, quant_config: ModelOptNvFp4Config): self.quant_config = quant_config + self.backend = "none" + + if envs.FD_FLASHINFER_MOE_BACKEND is None: + # currently support flashinfer-cutlass and flashinfer-trtllm + if has_flashinfer(): + self.backend = "flashinfer-cutlass" + elif envs.FD_FLASHINFER_MOE_BACKEND.startswith("flashinfer-"): + self.backend = envs.FD_FLASHINFER_MOE_BACKEND + assert has_flashinfer(), f"FlashInfer is required for MoE backend {self.backend}" + + if self.backend == "none": + raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.") def create_weights(self, layer): pass @@ -419,31 +432,35 @@ def apply(self, layer, x, gate): output_dtype = x.dtype x_sf = None - output = paddle.empty_like(x) - # flashinfer cutlass - _ = flashinfer_cutlass_fused_moe( - input=x, - token_selected_experts=topk_ids.to(paddle.int), - token_final_scales=topk_weights, - fc1_expert_weights=layer.w13_weight.view(paddle.long), - fc2_expert_weights=layer.w2_weight.view(paddle.long), - output_dtype=output_dtype, - input_sf=x_sf, - quant_scales=[ - layer.w13_input_scale_quant, - layer.w13_blockscale_swizzled.view(paddle.int32), - layer.g1_alphas, - layer.w2_input_scale_quant, - layer.w2_blockscale_swizzled.view(paddle.int32), - layer.g2_alphas, - ], - ep_size=layer.ep_size, - ep_rank=layer.ep_rank, - tp_size=layer.tp_size, - tp_rank=layer.tp_rank, - tune_max_num_tokens=next_power_of_2(x.shape[0]), - output=output, - ) + if self.backend == "flashinfer-cutlass": + # flashinfer cutlass + _ = flashinfer_cutlass_fused_moe( + input=x, + token_selected_experts=topk_ids.to(paddle.int), + token_final_scales=topk_weights, + fc1_expert_weights=layer.w13_weight.view(paddle.long), + fc2_expert_weights=layer.w2_weight.view(paddle.long), + output_dtype=output_dtype, + input_sf=x_sf, + quant_scales=[ + layer.w13_input_scale_quant, + layer.w13_blockscale_swizzled.view(paddle.int32), + layer.g1_alphas, + layer.w2_input_scale_quant, + layer.w2_blockscale_swizzled.view(paddle.int32), + layer.g2_alphas, + ], + ep_size=layer.ep_size, + ep_rank=layer.ep_rank, + tp_size=layer.tp_size, + tp_rank=layer.tp_rank, + tune_max_num_tokens=next_power_of_2(x.shape[0]), + output=output, + ) + + return output + + # flashinfer-trtllm return output From 0b28b4bf2718e2c6c6abe6ec0f2ea0722c8a5676 Mon Sep 17 00:00:00 2001 
From: bukejiyu <395822456@qq.com> Date: Mon, 17 Nov 2025 12:52:12 +0000 Subject: [PATCH 09/26] qwen3_moe_fp4 --- .../layers/moe/fused_moe_deepgemm_backend.py | 174 +++++-------- fastdeploy/model_executor/layers/moe/moe.py | 37 ++- .../layers/quantization/nvfp4.py | 244 +++++++++++++++++- fastdeploy/model_executor/models/qwen3moe.py | 15 +- fastdeploy/model_executor/utils.py | 13 + 5 files changed, 353 insertions(+), 130 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py index 06cc3294915..9cdd8c30cfe 100644 --- a/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py +++ b/fastdeploy/model_executor/layers/moe/fused_moe_deepgemm_backend.py @@ -22,7 +22,7 @@ from fastdeploy.distributed.communication import tensor_model_parallel_all_reduce from fastdeploy.model_executor.layers.utils import get_tensor from fastdeploy.model_executor.ops.gpu import count_tokens_per_expert_func, deep_gemm -from fastdeploy.model_executor.utils import TensorTracker, set_weight_attrs +from fastdeploy.model_executor.utils import set_weight_attrs from fastdeploy.utils import ceil_div from .fused_moe_backend_base import MoEMethodBase @@ -33,121 +33,69 @@ class DeepGemmFusedMoeMethod(MoEMethodBase): DeepGemmFusedMoeMethod is a class that implements the MoEMethodBase interface for DeepGemm backend. """ - def create_weights(self, layer: nn.Layer, **extra_weight_attrs): - """ - deepgemm create weight process. - """ - self.up_gate_proj_weight_shape = [ - layer.num_local_experts, - layer.moe_intermediate_size * 2, - layer.hidden_size, - ] - self.down_proj_weight_shape = [ - layer.num_local_experts, - layer.hidden_size, - layer.moe_intermediate_size, - ] - self.up_gate_proj_scale_shape = [ - layer.num_local_experts, - ceil_div(layer.moe_intermediate_size * 2, self.quant_config.weight_block_size[0]), - ceil_div(layer.hidden_size, self.quant_config.weight_block_size[1]), - ] - self.down_proj_scale_shape = [ - layer.num_local_experts, - ceil_div(layer.hidden_size, self.quant_config.weight_block_size[0]), - ceil_div(layer.moe_intermediate_size, self.quant_config.weight_block_size[1]), - ] - # TODO(bukejiyu): remove v1 loader check when v0 loader is removed - if self.quant_config.is_checkpoint_bf16 and layer.fd_config.load_config.load_choices == "default_v1": - layer.up_gate_proj_weight = layer.create_parameter( - shape=[layer.num_local_experts, layer.hidden_size, layer.moe_intermediate_size * 2], - dtype=layer.weight_dtype, + def create_weights(self, layer, **extra_weight_attrs): + self.weight_dtype = paddle.float8_e4m3fn + self.added_scale_attrs = ["up_gate_proj_weight_scale_inv", "down_proj_weight_scale_inv"] + up_gate_proj_weight_name = self.added_weight_attrs[0] + down_proj_weight_name = self.added_weight_attrs[1] + up_gate_proj_scale_name = self.added_scale_attrs[0] + down_proj_scale_name = self.added_scale_attrs[1] + setattr( + layer, + up_gate_proj_weight_name, + layer.create_parameter( + shape=self.up_gate_proj_weight_shape, + dtype=self.weight_dtype, default_initializer=paddle.nn.initializer.Constant(0), - ) - - layer.down_proj_weight = layer.create_parameter( - shape=[layer.num_local_experts, layer.moe_intermediate_size, layer.hidden_size], - dtype=layer.weight_dtype, + ), + ) + setattr( + layer, + down_proj_weight_name, + layer.create_parameter( + shape=self.down_proj_weight_shape, + dtype=self.weight_dtype, default_initializer=paddle.nn.initializer.Constant(0), - ) - 
extra_weight_attrs["weight_need_transpose"] = extra_weight_attrs.get("model_format") == "torch" - set_weight_attrs( - layer.up_gate_proj_weight, - { - **extra_weight_attrs, - "tensor_track": TensorTracker(shape=layer.up_gate_proj_weight.shape, output_dim=True), - }, - ) - set_weight_attrs( - layer.down_proj_weight, - { - **extra_weight_attrs, - "tensor_track": TensorTracker(shape=layer.down_proj_weight.shape, output_dim=False), - }, - ) - else: - self.weight_dtype = paddle.float8_e4m3fn - self.added_scale_attrs = ["up_gate_proj_weight_scale_inv", "down_proj_weight_scale_inv"] - up_gate_proj_weight_name = self.added_weight_attrs[0] - down_proj_weight_name = self.added_weight_attrs[1] - up_gate_proj_scale_name = self.added_scale_attrs[0] - down_proj_scale_name = self.added_scale_attrs[1] - setattr( - layer, - up_gate_proj_weight_name, - layer.create_parameter( - shape=self.up_gate_proj_weight_shape, - dtype=self.weight_dtype, - default_initializer=paddle.nn.initializer.Constant(0), - ), - ) - setattr( - layer, - down_proj_weight_name, - layer.create_parameter( - shape=self.down_proj_weight_shape, - dtype=self.weight_dtype, - default_initializer=paddle.nn.initializer.Constant(0), - ), - ) - # weight_scale - setattr( - layer, - up_gate_proj_scale_name, - layer.create_parameter( - shape=self.up_gate_proj_scale_shape, - dtype="float32", - default_initializer=paddle.nn.initializer.Constant(0), - ), - ) - setattr( - layer, - down_proj_scale_name, - layer.create_parameter( - shape=self.down_proj_scale_shape, - dtype="float32", - default_initializer=paddle.nn.initializer.Constant(0), - ), - ) - extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch" - extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}} - set_weight_attrs( - getattr(layer, up_gate_proj_weight_name), - extra_weight_attrs, - ) - set_weight_attrs( - getattr(layer, up_gate_proj_scale_name), - extra_weight_attrs, - ) + ), + ) + # weight_scale + setattr( + layer, + up_gate_proj_scale_name, + layer.create_parameter( + shape=self.up_gate_proj_scale_shape, + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + setattr( + layer, + down_proj_scale_name, + layer.create_parameter( + shape=self.down_proj_scale_shape, + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + extra_weight_attrs["weight_need_transpose"] = not extra_weight_attrs.get("model_format") == "torch" + extra_weight_attrs = {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}} + set_weight_attrs( + getattr(layer, up_gate_proj_weight_name), + extra_weight_attrs, + ) + set_weight_attrs( + getattr(layer, up_gate_proj_scale_name), + extra_weight_attrs, + ) - set_weight_attrs( - getattr(layer, down_proj_weight_name), - extra_weight_attrs, - ) - set_weight_attrs( - getattr(layer, down_proj_scale_name), - extra_weight_attrs, - ) + set_weight_attrs( + getattr(layer, down_proj_weight_name), + extra_weight_attrs, + ) + set_weight_attrs( + getattr(layer, down_proj_scale_name), + extra_weight_attrs, + ) def process_weights_after_loading(self, layer): """ """ diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index 09330e549a7..ec5d9adc2ff 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -288,8 +288,12 @@ def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_ ) # To 
ensure compatibility across backends, apply an extra transpose for GCU and XPU + if expert_param.shape != loaded_weight.shape: - loaded_weight = loaded_weight.transpose([1, 0]) + if len(expert_param.shape) != len(loaded_weight.shape): + loaded_weight = loaded_weight.reshape(expert_param.shape) + else: + loaded_weight = loaded_weight.transpose([1, 0]) assert expert_param.shape == loaded_weight.shape, ( f"Attempted to load weight ({loaded_weight.shape}) " f"into parameter ({expert_param.shape})" ) @@ -352,6 +356,32 @@ def _load_fused_experts_weight(self, param, loaded_weight): for i in range(self.num_local_experts): param.tensor_track.mark(start=0, batch_id=i) + def _load_per_tensor_weight_scale( + self, + param, + expert_id, + loaded_weight, + shard_id, + ): + loaded_weight = get_tensor(loaded_weight) + expert_param = param[expert_id - self.expert_id_offset] + if shard_id in ["gate", "up"]: + idx = 0 if shard_id == "gate" else 1 + if expert_param[idx].shape != loaded_weight.shape: + if len(expert_param[idx].shape) != len(loaded_weight.shape): + loaded_weight = loaded_weight.reshape(expert_param[idx].shape) + else: + loaded_weight = loaded_weight.transpose([1, 0]) + + expert_param[idx].set_value(loaded_weight) + elif shard_id == "down": + if expert_param.shape != loaded_weight.shape: + if len(expert_param.shape) != len(loaded_weight.shape): + loaded_weight = loaded_weight.reshape(expert_param.shape) + else: + loaded_weight = loaded_weight.transpose([1, 0]) + expert_param.set_value(loaded_weight) + def _load_expert_weight( self, param, @@ -360,7 +390,10 @@ def _load_expert_weight( shard_id, shard_dim=None, ): - if shard_id == "down": + weight_type = getattr(param, "weight_type", None) + if weight_type in ["weight_scale_2", "input_scale"]: + self._load_per_tensor_weight_scale(param, expert_id, loaded_weight, shard_id) + elif shard_id == "down": self._load_down_weight(param, expert_id, loaded_weight, shard_id, shard_dim) elif shard_id in ["gate", "up"]: self._load_gate_up_weight(param, expert_id, loaded_weight, shard_id, shard_dim) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 0a34d529a95..f4f12dff7e5 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -26,7 +26,11 @@ from fastdeploy.flashinfer import has_flashinfer from fastdeploy.model_executor.layers.moe import FusedMoE from fastdeploy.model_executor.ops.gpu import moe_topk_select -from fastdeploy.model_executor.utils import free_tensor, set_weight_attrs +from fastdeploy.model_executor.utils import ( + create_parameter_and_copy, + free_tensor, + set_weight_attrs, +) from .quant_base import QuantConfigBase, QuantMethodBase @@ -177,7 +181,7 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: if isinstance(layer, FusedMoE): # if skip_layer: # return None - return ModelOptNvFp4FusedMoE(self, layer.moe_config, layer) + return ModelOptNvFp4FusedMoE(self) else: # LinearBase # if skip_layer: @@ -241,18 +245,37 @@ def create_weights( input_scale_shape = [1] weight_scale_shape = [layer.weight_shape[::-1][0], layer.weight_shape[::-1][1] // self.quant_config.group_size] weight_scale_2_shape = [1] + + self._create_main_weight(layer, weight_shape, extra_weight_attrs) + self._create_input_scale(layer, input_scale_shape) + self._create_weight_scales(layer, weight_scale_shape, weight_scale_2_shape, extra_weight_attrs) + + def _create_main_weight(self, layer, weight_shape, 
extra_weight_attrs): + """创建主权重参数 + + 参数: + layer: 当前层对象 + weight_shape: 权重形状 + extra_weight_attrs: 额外权重属性 + """ layer.weight = layer.create_parameter( shape=weight_shape, dtype=layer.weight_dtype, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) - set_weight_attrs( layer.weight, extra_weight_attrs, ) - # Input Weight Scale + + def _create_input_scale(self, layer, input_scale_shape): + """创建输入缩放参数 + + 参数: + layer: 当前层对象 + input_scale_shape: 输入缩放形状 + """ layer.input_scale = layer.create_parameter( shape=input_scale_shape, # output_size dtype=paddle.float32, @@ -260,15 +283,21 @@ def create_weights( default_initializer=paddle.nn.initializer.Constant(0), ) - # Global Weight Scale + def _create_weight_scales(self, layer, weight_scale_shape, weight_scale_2_shape, extra_weight_attrs): + """创建权重缩放参数 + + 参数: + layer: 当前层对象 + weight_scale_shape: 权重缩放形状 + weight_scale_2_shape: 权重缩放2形状 + extra_weight_attrs: 额外权重属性 + """ layer.weight_scale_2 = layer.create_parameter( shape=weight_scale_2_shape, # output_size dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), ) - - # Per Block Weight Scale layer.weight_scale = layer.create_parameter( shape=weight_scale_shape, dtype=paddle.float8_e4m3fn, @@ -400,6 +429,12 @@ class ModelOptNvFp4FusedMoE(QuantMethodBase): """ def __init__(self, quant_config: ModelOptNvFp4Config): + self.quant_config = quant_config + self.added_weight_attrs = ["up_gate_proj_weight", "down_proj_weight"] + self.added_scale_attrs = [ + "up_gate_proj_weight_scale", + "down_proj_weight_scale", + ] self.quant_config = quant_config self.backend = "none" @@ -414,8 +449,199 @@ def __init__(self, quant_config: ModelOptNvFp4Config): if self.backend == "none": raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.") - def create_weights(self, layer): - pass + def create_weights(self, layer, **extra_weight_attrs): + """ + Triton MoE create weight process. 
+ """ + self.up_gate_proj_weight_shape = [ + layer.num_local_experts, + layer.moe_intermediate_size * 2, + layer.hidden_size // 2, + ] + self.down_proj_weight_shape = [ + layer.num_local_experts, + layer.hidden_size, + layer.moe_intermediate_size // 2, + ] + self.up_gate_proj_scale_shape = [ + layer.num_local_experts, + layer.moe_intermediate_size * 2, + layer.hidden_size // self.quant_config.group_size, + ] + self.down_proj_scale_shape = [ + layer.num_local_experts, + layer.hidden_size, + layer.moe_intermediate_size // self.quant_config.group_size, + ] + + self.weight_scale_dtype = paddle.float8_e4m3fn + self.weight_dtype = paddle.uint8 + self.added_scale_attrs = ["up_gate_proj_weight_scale", "down_proj_weight_scale"] + # self.added_blockscale_swizzled_attrs = ["up_gate_proj_blockscale_swizzled", "down_proj_blockscale_swizzled"] + up_gate_proj_weight_name = self.added_weight_attrs[0] + down_proj_weight_name = self.added_weight_attrs[1] + up_gate_proj_scale_name = self.added_scale_attrs[0] + down_proj_scale_name = self.added_scale_attrs[1] + # up_gate_proj_blockscale_swizzled_name = self.added_blockscale_swizzled_attrs[0] + # down_proj_blockscale_swizzled_name = self.added_blockscale_swizzled_attrs[1] + setattr( + layer, + up_gate_proj_weight_name, + layer.create_parameter( + shape=self.up_gate_proj_weight_shape, + dtype=self.weight_dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + setattr( + layer, + down_proj_weight_name, + layer.create_parameter( + shape=self.down_proj_weight_shape, + dtype=self.weight_dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + # weight_scale + setattr( + layer, + up_gate_proj_scale_name, + layer.create_parameter( + shape=self.up_gate_proj_scale_shape, + dtype=self.weight_scale_dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + setattr( + layer, + down_proj_scale_name, + layer.create_parameter( + shape=self.down_proj_scale_shape, + dtype=self.weight_scale_dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + # weight_scale_2 + layer.up_gate_proj_weight_scale_2 = layer.create_parameter( + shape=[layer.num_local_experts, 2], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.down_proj_weight_scale_2 = layer.create_parameter( + shape=[layer.num_local_experts], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + # input_scale + layer.up_gate_proj_input_scale = layer.create_parameter( + shape=[layer.num_local_experts, 2], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + layer.down_proj_input_scale = layer.create_parameter( + shape=[layer.num_local_experts], + dtype="float32", + default_initializer=paddle.nn.initializer.Constant(0), + ) + + set_weight_attrs( + getattr(layer, up_gate_proj_weight_name), + {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}, + ) + set_weight_attrs( + getattr(layer, up_gate_proj_scale_name), + {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}, + ) + + set_weight_attrs( + getattr(layer, down_proj_weight_name), + {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}, + ) + set_weight_attrs( + getattr(layer, down_proj_scale_name), + {**extra_weight_attrs, "SHARD_ID_TO_SHARDED_DIM": {"gate": 0, "down": 1, "up": 0}}, + ) + + set_weight_attrs( + layer.up_gate_proj_weight_scale_2, + {**extra_weight_attrs, "weight_type": "weight_scale_2"}, + ) + 
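        # Note: the "weight_type" attribute attached to these per-tensor scales is
        # what FusedMoE._load_expert_weight (see the moe.py hunk in this patch) keys
        # on to route them through _load_per_tensor_weight_scale instead of the
        # sharded gate/up/down loaders, since each expert contributes only a scalar
        # (or a [2] pair for the fused gate/up projection).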
set_weight_attrs(layer.down_proj_weight_scale_2, {**extra_weight_attrs, "weight_type": "weight_scale_2"}) + set_weight_attrs(layer.up_gate_proj_input_scale, {**extra_weight_attrs, "weight_type": "input_scale"}) + set_weight_attrs(layer.down_proj_input_scale, {**extra_weight_attrs, "weight_type": "input_scale"}) + + def swizzle_blockscale(self, scale): + assert scale.dtype == paddle.float8_e4m3fn + # Pad and blockwise interleave weight_scale + scale_dim = len(scale.shape) + if len(scale.shape) == 2: + scale = scale.unsqueeze(0) + assert len(scale.shape) == 3 + B, M, K = scale.shape + round_up_multiple = lambda x, m: (x + m - 1) // m * m + M_padded = round_up_multiple(M, 128) + K_padded = round_up_multiple(K, 4) + padded_scale = paddle.empty([B, M_padded, K_padded], dtype=scale.dtype) + padded_scale[:B, :M, :K].copy_(scale) + batches, rows, cols = padded_scale.shape + assert rows % 128 == 0 + assert cols % 4 == 0 + padded_scale = padded_scale.reshape(batches, rows // 128, 4, 32, cols // 4, 4) + swizzled_scale = padded_scale.permute((0, 1, 4, 3, 2, 5)) + swizzled_scale = swizzled_scale.contiguous().to(paddle.device.get_device()) + return ( + swizzled_scale.reshape(M_padded, K_padded) + if scale_dim == 2 + else swizzled_scale.reshape(B, M_padded, K_padded) + ) + + def process_weights_after_loading(self, layer): + """ """ + up_gate_proj_weight_scale_2 = layer.up_gate_proj_weight_scale_2[:, 0] + free_tensor(layer.up_gate_proj_weight_scale_2) + create_parameter_and_copy(layer, name="up_gate_proj_weight_scale_2", weight=up_gate_proj_weight_scale_2) + # conda1 = self.enable_flashinfer_cutlass_moe or self.enable_flashinfer_trtllm_moe + # conda2 = self.enable_flashinfer_cutedsl_moe + # conda3 only support now + up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale, axis=1).cast("float32") + down_proj_input_scale = layer.down_proj_input_scale + + # Create shared parameters + create_parameter_and_copy( + layer, "g1_alphas", (up_gate_proj_input_scale * up_gate_proj_weight_scale_2).cast("float32") + ) + create_parameter_and_copy( + layer, "g2_alphas", (down_proj_input_scale * layer.down_proj_weight_scale_2).cast("float32") + ) + create_parameter_and_copy( + layer, "up_gate_proj_input_scale_quant", (1 / up_gate_proj_input_scale).cast("float32") + ) + create_parameter_and_copy(layer, "down_proj_input_scale_quant", (1 / down_proj_input_scale).cast("float32")) + + # update input_global_scale ? 
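        # Scale algebra, mirroring the dense path above: the FP8 block scales are
        # passed to the fused kernel directly, so only the two per-tensor scales are
        # left to fold in. g*_alphas = input_scale * weight_scale_2 goes into the
        # epilogue, while *_input_scale_quant = 1 / input_scale is what the
        # activation quantization uses to map inputs into the E2M1 range
        # (e.g. input_scale 0.02 and weight_scale_2 0.005 give alpha 1e-4 and an
        # activation quant scale of 50.0).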
+ # layer.dispatcher.set_quant_config( + # {"input_global_scale": layer.w13_input_scale_quant} + # ) + + for name, weight_scale in [ + ("up_gate", layer.up_gate_proj_weight_scale), + ("down", layer.down_proj_weight_scale), + ]: + assert weight_scale.shape[2] % 16 == 0, f"Expected {name}_weight_scale.dim(2) to be divisible by 16" + assert ( + weight_scale.dtype == paddle.float8_e4m3fn + ), f"{name} Weight Blockscale must be represented as FP8-E4M3" + + # trtllm + # cultass + up_gate_proj_blockscale_swizzled = self.swizzle_blockscale(layer.up_gate_proj_weight_scale) + free_tensor(layer.up_gate_proj_weight_scale) + create_parameter_and_copy( + layer, name="up_gate_proj_blockscale_swizzled", weight=up_gate_proj_blockscale_swizzled + ) + down_proj_blockscale_swizzled = self.swizzle_blockscale(layer.down_proj_weight_scale) + free_tensor(layer.down_proj_weight_scale) + create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) def apply(self, layer, x, gate): """ diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 8e47a919bc4..3e3c2645693 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -16,9 +16,9 @@ from __future__ import annotations -import re from functools import partial +# import re import paddle from paddle import nn from paddleformers.transformers import PretrainedModel @@ -376,9 +376,8 @@ def load_weights(self, weights_iterator) -> None: weights_iterator (Iterator): An iterator yielding (name, weight) pairs. """ - from fastdeploy.model_executor.utils import ( + from fastdeploy.model_executor.utils import ( # process_weights_after_loading, default_weight_loader, - process_weights_after_loading, ) stacked_params_mapping = [ @@ -393,7 +392,7 @@ def load_weights(self, weights_iterator) -> None: ] expert_params_mapping = self.get_expert_mapping() params_dict = dict(self.named_parameters()) - process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) + # process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) for loaded_weight_name, loaded_weight in weights_iterator: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in loaded_weight_name: @@ -427,8 +426,12 @@ def load_weights(self, weights_iterator) -> None: weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) weight_loader(param, loaded_weight) - model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) - process_weights_after_loading_fn(model_sublayer_name, param) + # model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) + # process_weights_after_loading_fn(model_sublayer_name, param) + for name, sublayer in self.named_sublayers(): + quant_method = getattr(sublayer, "quant_method", None) + if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): + quant_method.process_weights_after_loading(sublayer) @paddle.no_grad() def set_state_dict(self, state_dict): diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 15d285212b0..81f076a5b68 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -189,6 +189,19 @@ def free_tensor(tensor): del tensor +def create_parameter_and_copy(layer, name, weight): + setattr( + layer, + name, + 
layer.create_parameter( + shape=weight.shape, + dtype=weight.dtype, + default_initializer=paddle.nn.initializer.Constant(0), + ), + ) + getattr(layer, name).copy_(weight, False) + + def default_weight_loader(fd_config: FDConfig = None) -> None: """Default weight loader""" From 2d2bd069b4578b4ba8221639d28e304110b8efb0 Mon Sep 17 00:00:00 2001 From: bukejiyu <395822456@qq.com> Date: Tue, 18 Nov 2025 12:13:35 +0000 Subject: [PATCH 10/26] update --- fastdeploy/model_executor/layers/moe/moe.py | 4 ++-- .../model_executor/layers/quantization/nvfp4.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/fastdeploy/model_executor/layers/moe/moe.py b/fastdeploy/model_executor/layers/moe/moe.py index ec5d9adc2ff..2f168db2a09 100644 --- a/fastdeploy/model_executor/layers/moe/moe.py +++ b/fastdeploy/model_executor/layers/moe/moe.py @@ -271,10 +271,10 @@ def _load_gate_up_weight(self, param, expert_id, loaded_weight, shard_id, shard_ expert_param = param[expert_id - self.expert_id_offset] dim = -1 if shard_dim else 0 param_shard_size = expert_param.shape[dim] // 2 - if shard_id == "gate": + switch_w13 = getattr(self.quant_method, "load_up_proj_weight_first", False) + if (shard_id == "gate" and not switch_w13) or (shard_id == "up" and switch_w13): param_shard_offset = 0 else: - # shard_id == "up": param_shard_offset = param_shard_size expert_param = slice_fn( expert_param, shard_dim, start=param_shard_offset, end=param_shard_offset + param_shard_size diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index f4f12dff7e5..c4ddba2974a 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -595,16 +595,24 @@ def swizzle_blockscale(self, scale): else swizzled_scale.reshape(B, M_padded, K_padded) ) + @property + def load_up_proj_weight_first(self) -> bool: + # FlashInfer CUTLASS kernel assumes [Up, Gate] Proj as W13 + # 目前默认给True + return True + def process_weights_after_loading(self, layer): """ """ up_gate_proj_weight_scale_2 = layer.up_gate_proj_weight_scale_2[:, 0] free_tensor(layer.up_gate_proj_weight_scale_2) create_parameter_and_copy(layer, name="up_gate_proj_weight_scale_2", weight=up_gate_proj_weight_scale_2) # conda1 = self.enable_flashinfer_cutlass_moe or self.enable_flashinfer_trtllm_moe + up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale).cast("float32") + down_proj_input_scale = paddle.max(layer.down_proj_input_scale).cast("float32") # conda2 = self.enable_flashinfer_cutedsl_moe # conda3 only support now - up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale, axis=1).cast("float32") - down_proj_input_scale = layer.down_proj_input_scale + # up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale, axis=1).cast("float32") + # down_proj_input_scale = layer.down_proj_input_scale # Create shared parameters create_parameter_and_copy( @@ -636,11 +644,13 @@ def process_weights_after_loading(self, layer): # cultass up_gate_proj_blockscale_swizzled = self.swizzle_blockscale(layer.up_gate_proj_weight_scale) free_tensor(layer.up_gate_proj_weight_scale) + layer.up_gate_proj_weight_scale = None create_parameter_and_copy( layer, name="up_gate_proj_blockscale_swizzled", weight=up_gate_proj_blockscale_swizzled ) down_proj_blockscale_swizzled = self.swizzle_blockscale(layer.down_proj_weight_scale) free_tensor(layer.down_proj_weight_scale) + layer.down_proj_weight_scale = None 
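The `load_up_proj_weight_first` property added above, together with the `switch_w13` branch in `_load_gate_up_weight`, only flips which half of the fused gate/up weight each shard is written into. A small sketch of that offset logic (illustrative only, not the FastDeploy loader itself) follows.

```python
# Sketch of the shard-offset flip driven by load_up_proj_weight_first: when the
# FlashInfer CUTLASS MoE kernel expects the fused weight as [up, gate] rather than
# the default [gate, up], the two shards simply swap halves.
def gate_up_offset(shard_id: str, half_size: int, up_first: bool) -> int:
    assert shard_id in ("gate", "up")
    first_half = "up" if up_first else "gate"
    return 0 if shard_id == first_half else half_size


# Default [gate, up] layout vs. the [up, gate] layout assumed by the kernel:
assert gate_up_offset("gate", 4, up_first=False) == 0
assert gate_up_offset("gate", 4, up_first=True) == 4
assert gate_up_offset("up", 4, up_first=True) == 0
```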
create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) def apply(self, layer, x, gate): From c329d921e1b57e7a99f6470622e059e0b5c459fb Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Tue, 18 Nov 2025 20:50:31 +0800 Subject: [PATCH 11/26] support flashinfer-cutlass moe, qwen3-moe-fp4 OK --- fastdeploy/flashinfer.py | 4 ++ .../layers/quantization/nvfp4.py | 67 ++----------------- 2 files changed, 10 insertions(+), 61 deletions(-) diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py index 4bc6aa994f2..f30aa028308 100644 --- a/fastdeploy/flashinfer.py +++ b/fastdeploy/flashinfer.py @@ -17,6 +17,7 @@ import functools import importlib import importlib.util +import os import shutil @@ -25,6 +26,9 @@ def has_flashinfer() -> bool: """Return `True` if FlashInfer is available.""" # Use find_spec to check if the module exists without importing it # This avoids potential CUDA initialization side effects + if os.environ.get("PADDLE_COMPATIBLE_API", "0").lower() not in ["1", "on", "true"]: + # currently must support by Paddle compatible API + return False if importlib.util.find_spec("flashinfer") is None: # logger.debug_once("FlashInfer unavailable since package was not found") return False diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index c4ddba2974a..2a648f673a0 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -42,49 +42,6 @@ logger.warning("FlashInfer is not installed. For nvFp4 inference, please install Flashinfer.") -def swizzle_blockscale(scale: paddle.Tensor) -> paddle.Tensor: - """ - Pad and block-interleave the FP4 block-scales so that they match the data - layout expected by the CUTLASS / FlashInfer kernels. - - Parameters - ---------- - scale: paddle.Tensor - - Returns - ------- - paddle.Tensor - The swizzled tensor with the same logical shape as *scale*. - """ - assert scale.dtype == paddle.float8_e4m3fn, ( - "swizzle_blockscale expects the input tensor to be in " "paddle.float8_e4m3fn format." - ) - - scale_ndim = scale.ndim - if scale_ndim == 2: - scale = scale.unsqueeze(0) # (1, M, K) - assert scale.ndim == 3, "Expected a 2-D or 3-D tensor for block scales." - - B, M, K = scale.shape - - def _round_up(x: int, m: int) -> int: - return (x + m - 1) // m * m - - M_padded = _round_up(M, 128) - K_padded = _round_up(K, 4) - - padded = paddle.zeros((B, M_padded, K_padded), dtype=scale.dtype, device=scale.place) - padded[:B, :M, :K] = scale - - # Reshape / permute to the layout required by the kernel. 
- padded = padded.reshape(B, M_padded // 128, 4, 32, K_padded // 4, 4) - swizzled = padded.permute(0, 1, 4, 3, 2, 5).contiguous().cuda() - - if scale_ndim == 2: - return swizzled.reshape(M_padded, K_padded) - return swizzled.reshape(B, M_padded, K_padded) - - def next_power_of_2(n: int): return 1 << (n - 1).bit_length() if n > 0 else 1 @@ -606,13 +563,8 @@ def process_weights_after_loading(self, layer): up_gate_proj_weight_scale_2 = layer.up_gate_proj_weight_scale_2[:, 0] free_tensor(layer.up_gate_proj_weight_scale_2) create_parameter_and_copy(layer, name="up_gate_proj_weight_scale_2", weight=up_gate_proj_weight_scale_2) - # conda1 = self.enable_flashinfer_cutlass_moe or self.enable_flashinfer_trtllm_moe up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale).cast("float32") down_proj_input_scale = paddle.max(layer.down_proj_input_scale).cast("float32") - # conda2 = self.enable_flashinfer_cutedsl_moe - # conda3 only support now - # up_gate_proj_input_scale = paddle.max(layer.up_gate_proj_input_scale, axis=1).cast("float32") - # down_proj_input_scale = layer.down_proj_input_scale # Create shared parameters create_parameter_and_copy( @@ -626,11 +578,6 @@ def process_weights_after_loading(self, layer): ) create_parameter_and_copy(layer, "down_proj_input_scale_quant", (1 / down_proj_input_scale).cast("float32")) - # update input_global_scale ? - # layer.dispatcher.set_quant_config( - # {"input_global_scale": layer.w13_input_scale_quant} - # ) - for name, weight_scale in [ ("up_gate", layer.up_gate_proj_weight_scale), ("down", layer.down_proj_weight_scale), @@ -640,8 +587,6 @@ def process_weights_after_loading(self, layer): weight_scale.dtype == paddle.float8_e4m3fn ), f"{name} Weight Blockscale must be represented as FP8-E4M3" - # trtllm - # cultass up_gate_proj_blockscale_swizzled = self.swizzle_blockscale(layer.up_gate_proj_weight_scale) free_tensor(layer.up_gate_proj_weight_scale) layer.up_gate_proj_weight_scale = None @@ -676,16 +621,16 @@ def apply(self, layer, x, gate): input=x, token_selected_experts=topk_ids.to(paddle.int), token_final_scales=topk_weights, - fc1_expert_weights=layer.w13_weight.view(paddle.long), - fc2_expert_weights=layer.w2_weight.view(paddle.long), + fc1_expert_weights=getattr(layer, self.added_weight_attrs[0]).view(paddle.long), + fc2_expert_weights=getattr(layer, self.added_weight_attrs[1]).view(paddle.long), output_dtype=output_dtype, input_sf=x_sf, quant_scales=[ - layer.w13_input_scale_quant, - layer.w13_blockscale_swizzled.view(paddle.int32), + layer.up_gate_proj_input_scale_quant, + layer.up_gate_proj_blockscale_swizzled.view(paddle.int32), layer.g1_alphas, - layer.w2_input_scale_quant, - layer.w2_blockscale_swizzled.view(paddle.int32), + layer.down_proj_input_scale_quant, + layer.down_proj_blockscale_swizzled.view(paddle.int32), layer.g2_alphas, ], ep_size=layer.ep_size, From eb089b38b0f342006cf587fa1de611f0b8afa619 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Wed, 19 Nov 2025 18:59:59 +0800 Subject: [PATCH 12/26] support ernie4.5-fp4 --- .../model_executor/layers/quantization/nvfp4.py | 4 ++++ fastdeploy/model_executor/models/ernie4_5_moe.py | 12 ++++-------- fastdeploy/model_executor/models/qwen3moe.py | 3 --- fastdeploy/model_executor/utils.py | 2 ++ 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 2a648f673a0..0efa9204e3c 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ 
b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -57,6 +57,7 @@ def __init__( kv_cache_quant_algo: str | None, exclude_modules: list[str], group_size: int = 16, + is_checkpoint_bf16: bool = False, ) -> None: self.is_checkpoint_nvfp4_serialized = is_checkpoint_nvfp4_serialized if is_checkpoint_nvfp4_serialized: @@ -72,6 +73,7 @@ def __init__( self.quant_max_bound = 6 self.quant_min_bound = -6 self.quant_round_type = 1 + self.is_checkpoint_bf16 = is_checkpoint_bf16 def name(self) -> str: return "modelopt_fp4" @@ -406,6 +408,8 @@ def __init__(self, quant_config: ModelOptNvFp4Config): if self.backend == "none": raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.") + logger.info(f"Using {self.backend} for NVFP4 FusedMoE") + def create_weights(self, layer, **extra_weight_attrs): """ Triton MoE create weight process. diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index c2baeb91049..136b144cf94 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -17,7 +17,6 @@ from __future__ import annotations import inspect -import re from functools import partial from typing import Dict, Union @@ -543,7 +542,6 @@ def load_weights(self, weights_iterator) -> None: from fastdeploy.model_executor.utils import ( default_weight_loader, - process_weights_after_loading, rename_offline_ckpt_suffix_to_fd_suffix, ) @@ -590,8 +588,6 @@ def load_weights(self, weights_iterator) -> None: ) params_dict = dict(self.named_parameters()) - process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) - for loaded_weight_name, loaded_weight in weights_iterator: loaded_weight_name = loaded_weight_name.replace("model", "ernie") for param_name, weight_name, exp_id, shard_id, is_moe in all_param_mapping: @@ -620,10 +616,10 @@ def load_weights(self, weights_iterator) -> None: else: weight_loader(param, loaded_weight, shard_id) - model_sublayer_name = re.sub( - r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name - ) - process_weights_after_loading_fn(model_sublayer_name, param) + for name, sublayer in self.named_sublayers(): + quant_method = getattr(sublayer, "quant_method", None) + if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): + quant_method.process_weights_after_loading(sublayer) if self.tie_word_embeddings: self.lm_head.load_state_dict({self.lm_head.weight_key: self.ernie.embed_tokens.embeddings.weight}) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 3e3c2645693..c9a9d717e6a 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -392,7 +392,6 @@ def load_weights(self, weights_iterator) -> None: ] expert_params_mapping = self.get_expert_mapping() params_dict = dict(self.named_parameters()) - # process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers())) for loaded_weight_name, loaded_weight in weights_iterator: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in loaded_weight_name: @@ -426,8 +425,6 @@ def load_weights(self, weights_iterator) -> None: weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) weight_loader(param, loaded_weight) - # model_sublayer_name = 
re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) - # process_weights_after_loading_fn(model_sublayer_name, param) for name, sublayer in self.named_sublayers(): quant_method = getattr(sublayer, "quant_method", None) if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): diff --git a/fastdeploy/model_executor/utils.py b/fastdeploy/model_executor/utils.py index 81f076a5b68..7949d5e0541 100644 --- a/fastdeploy/model_executor/utils.py +++ b/fastdeploy/model_executor/utils.py @@ -371,6 +371,8 @@ def fn(loaded_weight_name, is_moe): # Can be extended to other offline quantization suffixes if needed. if (is_moe and moe_quant_type == "block_wise_fp8") or (not is_moe and dense_quant_type == "block_wise_fp8"): fd_suffix_map = fp8_suffix_map + else: + fd_suffix_map = {} for ckpt_suffix, fd_suffix in fd_suffix_map.items(): if re.search(rf"{ckpt_suffix}$", loaded_weight_name): loaded_weight_name = loaded_weight_name.replace(ckpt_suffix, fd_suffix) From 03aa695a80927d6bb9f2a2c819b079bb4b264b89 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Thu, 20 Nov 2025 12:10:07 +0800 Subject: [PATCH 13/26] fix load error --- fastdeploy/flashinfer.py | 5 +++-- fastdeploy/model_executor/layers/linear.py | 1 - .../layers/quantization/nvfp4.py | 19 +++++++++---------- .../model_executor/models/ernie4_5_moe.py | 14 ++++++++++---- fastdeploy/model_executor/models/qwen3moe.py | 13 ++++++++----- 5 files changed, 30 insertions(+), 22 deletions(-) diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py index f30aa028308..23634faed5f 100644 --- a/fastdeploy/flashinfer.py +++ b/fastdeploy/flashinfer.py @@ -20,6 +20,8 @@ import os import shutil +from paddleformers.utils.log import logger + @functools.cache def has_flashinfer() -> bool: @@ -28,12 +30,11 @@ def has_flashinfer() -> bool: # This avoids potential CUDA initialization side effects if os.environ.get("PADDLE_COMPATIBLE_API", "0").lower() not in ["1", "on", "true"]: # currently must support by Paddle compatible API + logger.warning("FlashInfer is not supported by Paddle compatible API.") return False if importlib.util.find_spec("flashinfer") is None: - # logger.debug_once("FlashInfer unavailable since package was not found") return False # Also check if nvcc is available since it's required to JIT compile flashinfer if shutil.which("nvcc") is None: - # logger.debug_once("FlashInfer unavailable since nvcc was not found") return False return True diff --git a/fastdeploy/model_executor/layers/linear.py b/fastdeploy/model_executor/layers/linear.py index e7725be6d23..3227edac765 100644 --- a/fastdeploy/model_executor/layers/linear.py +++ b/fastdeploy/model_executor/layers/linear.py @@ -82,7 +82,6 @@ def process_loaded_weights(self, layer, weights) -> None: layer.weight.set_value(weights) def apply(self, layer: nn.Layer, x: paddle.Tensor) -> paddle.Tensor: - linear_out = paddle.matmul(x, layer.weight) if layer.with_bias: linear_out = paddle.add(linear_out, layer.bias) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 8a44a93a199..92b47997470 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -17,15 +17,12 @@ from typing import Optional import paddle - -paddle.compat.enable_torch_proxy() - from paddleformers.utils.log import logger +import fastdeploy from fastdeploy import envs from fastdeploy.flashinfer import has_flashinfer from 
fastdeploy.model_executor.layers.moe import FusedMoE -from fastdeploy.model_executor.ops.gpu import moe_topk_select from fastdeploy.model_executor.utils import ( create_parameter_and_copy, free_tensor, @@ -35,6 +32,7 @@ from .quant_base import QuantConfigBase, QuantMethodBase if has_flashinfer(): + paddle.compat.enable_torch_proxy() from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe @@ -176,7 +174,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config) -> None: assert has_flashinfer(), f"FlashInfer is required for {self.backend}" if self.backend == "none": - raise ValueError("No valid NVFP4 GEMM backend found. " "Please check your platform capability.") + raise ValueError( + "No valid NVFP4 GEMM backend found. Please check your platform capability and installation of FlashInfer." + ) logger.info(f"Using {self.backend} for NVFP4 GEMM") @@ -394,7 +394,9 @@ def __init__(self, quant_config: ModelOptNvFp4Config): assert has_flashinfer(), f"FlashInfer is required for MoE backend {self.backend}" if self.backend == "none": - raise ValueError("No valid NVFP4 flashinfer MoE backend found. " "Please check your platform capability.") + raise ValueError( + "No valid NVFP4 flashinfer MoE backend found. Please check your platform capability and installation of FlashInfer." + ) logger.info(f"Using {self.backend} for NVFP4 FusedMoE") @@ -426,13 +428,10 @@ def create_weights(self, layer, **extra_weight_attrs): self.weight_scale_dtype = paddle.float8_e4m3fn self.weight_dtype = paddle.uint8 self.added_scale_attrs = ["up_gate_proj_weight_scale", "down_proj_weight_scale"] - # self.added_blockscale_swizzled_attrs = ["up_gate_proj_blockscale_swizzled", "down_proj_blockscale_swizzled"] up_gate_proj_weight_name = self.added_weight_attrs[0] down_proj_weight_name = self.added_weight_attrs[1] up_gate_proj_scale_name = self.added_scale_attrs[0] down_proj_scale_name = self.added_scale_attrs[1] - # up_gate_proj_blockscale_swizzled_name = self.added_blockscale_swizzled_attrs[0] - # down_proj_blockscale_swizzled_name = self.added_blockscale_swizzled_attrs[1] setattr( layer, up_gate_proj_weight_name, @@ -595,7 +594,7 @@ def apply(self, layer, x, gate): flashinfer nvfp4 fusedmoe for Model Optimizer """ gate_out = gate(x.cast("float32")) - topk_ids, topk_weights = moe_topk_select( + topk_ids, topk_weights = fastdeploy.model_executor.ops.gpu.moe_topk_select( gate_out, layer.gate_correction_bias, layer.top_k, diff --git a/fastdeploy/model_executor/models/ernie4_5_moe.py b/fastdeploy/model_executor/models/ernie4_5_moe.py index f773afc4c9a..75947590be8 100644 --- a/fastdeploy/model_executor/models/ernie4_5_moe.py +++ b/fastdeploy/model_executor/models/ernie4_5_moe.py @@ -17,6 +17,7 @@ from __future__ import annotations import inspect +import re from functools import partial from typing import Dict, Union @@ -514,6 +515,7 @@ def load_weights(self, weights_iterator) -> None: from fastdeploy.model_executor.utils import ( default_weight_loader, + process_weights_after_loading, rename_offline_ckpt_suffix_to_fd_suffix, ) @@ -560,6 +562,10 @@ def load_weights(self, weights_iterator) -> None: ) params_dict = dict(self.named_parameters()) + process_weights_after_loading_fn = process_weights_after_loading( + dict(self.named_sublayers()), fd_config=self.fd_config + ) + for loaded_weight_name, loaded_weight in weights_iterator: loaded_weight_name = loaded_weight_name.replace("model", "ernie") for param_name, weight_name, exp_id, 
shard_id, is_moe in all_param_mapping: @@ -588,10 +594,10 @@ def load_weights(self, weights_iterator) -> None: else: weight_loader(param, loaded_weight, shard_id) - for name, sublayer in self.named_sublayers(): - quant_method = getattr(sublayer, "quant_method", None) - if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): - quant_method.process_weights_after_loading(sublayer) + model_sublayer_name = re.sub( + r"\.(up_gate_proj_weight|down_proj_weight|weight|cache_k_scale|cache_v_scale)$", "", model_param_name + ) + process_weights_after_loading_fn(model_sublayer_name, param) if self.tie_word_embeddings: self.lm_head.linear.weight.set_value(self.ernie.embed_tokens.embeddings.weight.transpose([1, 0])) diff --git a/fastdeploy/model_executor/models/qwen3moe.py b/fastdeploy/model_executor/models/qwen3moe.py index 662483ea15a..9537b84f22c 100644 --- a/fastdeploy/model_executor/models/qwen3moe.py +++ b/fastdeploy/model_executor/models/qwen3moe.py @@ -16,6 +16,7 @@ from __future__ import annotations +import re from functools import partial import paddle @@ -342,7 +343,10 @@ def load_weights(self, weights_iterator) -> None: weights_iterator (Iterator): An iterator yielding (name, weight) pairs. """ - from fastdeploy.model_executor.utils import default_weight_loader + from fastdeploy.model_executor.utils import ( + default_weight_loader, + process_weights_after_loading, + ) stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -356,6 +360,7 @@ def load_weights(self, weights_iterator) -> None: ] expert_params_mapping = self.get_expert_mapping() params_dict = dict(self.named_parameters()) + process_weights_after_loading_fn = process_weights_after_loading(dict(self.named_sublayers()), self.fd_config) for loaded_weight_name, loaded_weight in weights_iterator: for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in loaded_weight_name: @@ -389,10 +394,8 @@ def load_weights(self, weights_iterator) -> None: weight_loader = getattr(param, "weight_loader", default_weight_loader(self.fd_config)) weight_loader(param, loaded_weight) - for name, sublayer in self.named_sublayers(): - quant_method = getattr(sublayer, "quant_method", None) - if quant_method is not None and hasattr(quant_method, "process_weights_after_loading"): - quant_method.process_weights_after_loading(sublayer) + model_sublayer_name = re.sub(r"\.(up_gate_proj_weight|down_proj_weight|weight)$", "", model_param_name) + process_weights_after_loading_fn(model_sublayer_name, param) @paddle.no_grad() def set_state_dict(self, state_dict): From 5233398f8f1dbf651f0013e62cbef5eeafcfcf33 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Thu, 20 Nov 2025 18:07:58 +0800 Subject: [PATCH 14/26] add some ut --- .../layers/quantization/nvfp4.py | 16 +--- tests/quantization/test_modelopt_nvfp4.py | 96 +++++++++++++++++++ 2 files changed, 99 insertions(+), 13 deletions(-) create mode 100644 tests/quantization/test_modelopt_nvfp4.py diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 92b47997470..49c88be7882 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -134,18 +134,9 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]: """ Get quantization method. 
""" - # skip_layer = self.is_layer_excluded(prefix) if isinstance(layer, FusedMoE): - # if skip_layer: - # return None return ModelOptNvFp4FusedMoE(self) else: - # LinearBase - # if skip_layer: - # return UnquantizedLinearMethod() - # Check if this is a vision model layer that should not be quantized - # if "vision_tower" in prefix or "vision_model" in prefix: - # return UnquantizedLinearMethod() return ModelOptNvFp4LinearMethod(self) return None @@ -224,7 +215,7 @@ def _create_input_scale(self, layer, input_scale_shape): input_scale_shape: 输入缩放形状 """ layer.input_scale = layer.create_parameter( - shape=input_scale_shape, # output_size + shape=input_scale_shape, dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), @@ -240,7 +231,7 @@ def _create_weight_scales(self, layer, weight_scale_shape, weight_scale_2_shape, extra_weight_attrs: 额外权重属性 """ layer.weight_scale_2 = layer.create_parameter( - shape=weight_scale_2_shape, # output_size + shape=weight_scale_2_shape, dtype=paddle.float32, is_bias=False, default_initializer=paddle.nn.initializer.Constant(0), @@ -257,7 +248,6 @@ def _create_weight_scales(self, layer, weight_scale_shape, weight_scale_2_shape, ) def process_weights_after_loading(self, layer) -> None: - # if def _process_scale_interleaved(scales): scale_dim = len(scales.shape) if scale_dim == 2: @@ -386,7 +376,7 @@ def __init__(self, quant_config: ModelOptNvFp4Config): self.backend = "none" if envs.FD_FLASHINFER_MOE_BACKEND is None: - # currently support flashinfer-cutlass and flashinfer-trtllm + # currently support flashinfer-cutlass, flashinfer-trtllm will support in the future if has_flashinfer(): self.backend = "flashinfer-cutlass" elif envs.FD_FLASHINFER_MOE_BACKEND.startswith("flashinfer-"): diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py new file mode 100644 index 00000000000..b3dc1244715 --- /dev/null +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -0,0 +1,96 @@ +""" +# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License" +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" + +import unittest +from unittest import mock + +import paddle + +from fastdeploy.flashinfer import has_flashinfer + +# import fastdeploy +from fastdeploy.model_executor.layers.linear import QKVParallelLinear +from fastdeploy.model_executor.layers.moe import FusedMoE +from fastdeploy.model_executor.layers.quantization.nvfp4 import ( + ModelOptNvFp4Config, + ModelOptNvFp4FusedMoE, + ModelOptNvFp4LinearMethod, +) + + +def get_sm_version(): + prop = paddle.device.cuda.get_device_properties() + cc = prop.major * 10 + prop.minor + return cc + + +@unittest.skipIf( + not paddle.is_compiled_with_cuda() or get_sm_version() < 100, + "Nvfp4 do not support sm < 100.", +) +class TestModelOptNvFp4Config(unittest.TestCase): + def setUp(self): + prop = paddle.device.cuda.get_device_properties() + self.sm_version = prop.major * 10 + prop.minor + + self.raw_config = { + "config_groups": { + "group_0": { + "input_activations": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": 16}, + "weights": {"dynamic": False, "num_bits": 4, "type": "float", "group_size": 16}, + "targets": ["Linear"], + } + }, + "quant_algo": "NVFP4", + "producer": {"name": "modelopt", "version": "0.34.1.dev85+g7a72957d"}, + "quant_method": "modelopt", + } + + self.config = ModelOptNvFp4Config.from_config(self.raw_config) + + def test_name(self): + """Test name() method""" + self.assertEqual(self.config.name(), "modelopt_fp4") + + def test_from_config(self): + """Test from_config with full dict""" + cfg = ModelOptNvFp4Config.from_config(self.raw_config) + self.assertFalse(cfg.is_checkpoint_bf16) + self.assertTrue(cfg.is_checkpoint_nvfp4_serialized) + self.assertEqual(cfg.group_size, 16) + self.assertEqual(cfg.exclude_modules, []) + self.assertEqual(cfg.kv_cache_quant_algo, None) + self.assertEqual(cfg.quant_max_bound, 6) + self.assertEqual(cfg.quant_min_bound, -6) + self.assertEqual(cfg.quant_round_type, 1) + + @unittest.skipIf(not has_flashinfer(), "Skip if no FlashInfer available") + def test_get_quant_method_linear(self): + """Test get_quant_method with a linear layer""" + layer = mock.Mock(spec=QKVParallelLinear) + method = self.config.get_quant_method(layer) + assert isinstance(method, ModelOptNvFp4LinearMethod) + + @unittest.skipIf(not has_flashinfer(), "Skip if no FlashInfer available") + def test_get_quant_method_fused_moe(self): + """Test get_quant_method with a moe layer""" + layer = mock.Mock(spec=FusedMoE) + method = self.config.get_quant_method(layer) + assert isinstance(method, ModelOptNvFp4FusedMoE) + + +if __name__ == "__main__": + unittest.main() From 748e81268bf4f4c128eae7691ff600ea225465d9 Mon Sep 17 00:00:00 2001 From: zoooo0820 Date: Thu, 20 Nov 2025 18:58:38 +0800 Subject: [PATCH 15/26] add docs --- docs/quantization/nvfp4.md | 74 ++++++++++++++++++++++ docs/zh/quantization/nvfp4.md | 75 +++++++++++++++++++++++ tests/quantization/test_modelopt_nvfp4.py | 2 - 3 files changed, 149 insertions(+), 2 deletions(-) create mode 100644 docs/quantization/nvfp4.md create mode 100644 docs/zh/quantization/nvfp4.md diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md new file mode 100644 index 00000000000..c8edd091c79 --- /dev/null +++ b/docs/quantization/nvfp4.md @@ -0,0 +1,74 @@ + +# NVFP4 Quantization +NVFP4 is an innovative 4-bit floating-point format introduced by NVIDIA. For detailed information, please refer to [Introducing NVFP4 for Efficient and Accurate Low-Precision Inference](https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/). 
+ +Based on [FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy supports NVFP4 quantized model inference in the format produced by [Modelopt](https://github.com/NVIDIA/TensorRT-Model-Optimizer). + +- Note: Currently, this feature only supports FP4 quantized models of Ernie/Qwen series. + +## How to Use +### Environment Setup +- **Supported Hardware**: GPU sm >= 100 +- **PaddlePaddle Version**: 3.3.0 or higher +- **Fastdeploy Version**: 2.4.0 or higher + +#### 1. Fastdeploy Installation +First, install the Fastdeploy base environment according to the [Fastdeploy NVIDIA GPU Environment Installation Guide](../../get_started/installation/nvidia_gpu.md). + +#### 2. Flashinfer Installation +```bash +git clone -b support-paddlepaddle-with-compatible-api-and-tvmffi https://github.com/PFCCLab/flashinfer/ --recursive + +cd flashinfer +python -m pip install -v . +``` + +### Running Inference Service +- Note: Need to set environment variable `export PADDLE_COMPATIBLE_API=true` and install the corresponding Flashinfer correctly +```bash +export PADDLE_COMPATIBLE_API=true +python -m fastdeploy.entrypoints.openai.api_server \ + --model nv-community/Qwen3-30B-A3B-FP4 \ + --port 8180 \ + --metrics-port 8181 \ + --engine-worker-queue-port 8182 \ + --cache-queue-port 8183 \ + --tensor-parallel-size 1 \ + --max-model-len 32768 \ + --max-num-seqs 128 +``` + +### API Access +Make service requests using the following command + +```shell +curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "把李白的静夜思改写为现代诗"} + ] +}' +``` + +FastDeploy service interface is compatible with OpenAI protocol. You can make service requests using the following Python code. + +```python +import openai +host = "0.0.0.0" +port = "8180" +client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null") + +response = client.chat.completions.create( + model="null", + messages=[ + {"role": "system", "content": "I'm a helpful AI assistant."}, + {"role": "user", "content": "把李白的静夜思改写为现代诗"}, + ], + stream=True, +) +for chunk in response: + if chunk.choices[0].delta: + print(chunk.choices[0].delta.content, end='') +print('\n') +```. diff --git a/docs/zh/quantization/nvfp4.md b/docs/zh/quantization/nvfp4.md new file mode 100644 index 00000000000..62e6e36aa57 --- /dev/null +++ b/docs/zh/quantization/nvfp4.md @@ -0,0 +1,75 @@ +[English](../../quantization/nvfp4.md) + +# NVFP4量化 +NVFP4 是 NVIDIA 引入的创新 4 位浮点格式,详细介绍请参考[Introducing NVFP4 for Efficient and Accurate Low-Precision Inference](https://developer.nvidia.com/blog/introducing-nvfp4-for-efficient-and-accurate-low-precision-inference/)。 + +基于[FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy 支持[Modelopt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) 产出格式的NVFP4量化模型推理。 + +- 注:目前该功能仅支持Ernie / Qwen系列的FP4量化模型。 + +## 如何使用 +### 环境安装 +- **支持硬件**:GPU sm >= 100 +- **PaddlePaddle 版本**:3.3.0 或更高版本 +- **Fastdeploy 版本**:2.4.0 或更高版本 + +#### 1. Fastdeploy 安装 +首先请根据[Fastdeploy NVIDIA GPU 环境安装指南](../../get_started/installation/nvidia_gpu.md),安装Fastdeploy基础环境。 + +#### 2. Flashinfer 安装 +```bash +git clone -b support-paddlepaddle-with-compatible-api-and-tvmffi https://github.com/PFCCLab/flashinfer/ --recursive + +cd flashinfer +python -m pip install -v . 
+``` + +### 运行推理服务 +- 注意:需要指定环境变量`export PADDLE_COMPATIBLE_API=true`并正确安装对应Flashinfer +```bash +export PADDLE_COMPATIBLE_API=true +python -m fastdeploy.entrypoints.openai.api_server \ + --model nv-community/Qwen3-30B-A3B-FP4 \ + --port 8180 \ + --metrics-port 8181 \ + --engine-worker-queue-port 8182 \ + --cache-queue-port 8183 \ + --tensor-parallel-size 1 \ + --max-model-len 32768 \ + --max-num-seqs 128 +``` + +### 接口访问 +通过如下命令发起服务请求 + +```shell +curl -X POST "http://0.0.0.0:8180/v1/chat/completions" \ +-H "Content-Type: application/json" \ +-d '{ + "messages": [ + {"role": "user", "content": "把李白的静夜思改写为现代诗"} + ] +}' +``` + +FastDeploy服务接口兼容OpenAI协议,可以通过如下Python代码发起服务请求。 + +```python +import openai +host = "0.0.0.0" +port = "8180" +client = openai.Client(base_url=f"http://{host}:{port}/v1", api_key="null") + +response = client.chat.completions.create( + model="null", + messages=[ + {"role": "system", "content": "I'm a helpful AI assistant."}, + {"role": "user", "content": "把李白的静夜思改写为现代诗"}, + ], + stream=True, +) +for chunk in response: + if chunk.choices[0].delta: + print(chunk.choices[0].delta.content, end='') +print('\n') +``` diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py index b3dc1244715..6015a0dff03 100644 --- a/tests/quantization/test_modelopt_nvfp4.py +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -20,8 +20,6 @@ import paddle from fastdeploy.flashinfer import has_flashinfer - -# import fastdeploy from fastdeploy.model_executor.layers.linear import QKVParallelLinear from fastdeploy.model_executor.layers.moe import FusedMoE from fastdeploy.model_executor.layers.quantization.nvfp4 import ( From be11fc33e1b19ed0a194f4dd3121481e9f19788a Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Mon, 12 Jan 2026 11:30:49 +0000 Subject: [PATCH 16/26] fix CLA, test --- build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/build.sh b/build.sh index 5597aec2d0f..b78f5f54e33 100644 --- a/build.sh +++ b/build.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. # From 509fc3314f58fef4d0db39b53b3fc40da806b589 Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Mon, 12 Jan 2026 11:32:05 +0000 Subject: [PATCH 17/26] fix the apply() in ModelOptNvFp4FusedMoE --- build.sh | 2 +- fastdeploy/model_executor/layers/quantization/nvfp4.py | 8 ++++++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/build.sh b/build.sh index b78f5f54e33..5597aec2d0f 100644 --- a/build.sh +++ b/build.sh @@ -1,4 +1,4 @@ -#!/usr/bin/env bash +#!/usr/bin/env bash # Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. 
# diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 49c88be7882..5a5f13297d9 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -32,7 +32,8 @@ from .quant_base import QuantConfigBase, QuantMethodBase if has_flashinfer(): - paddle.compat.enable_torch_proxy() + # 加一个scope + paddle.compat.enable_torch_proxy(scope={"flashinfer"}) from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe @@ -579,7 +580,7 @@ def process_weights_after_loading(self, layer): layer.down_proj_weight_scale = None create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) - def apply(self, layer, x, gate): + def apply(self, layer, x, gate, topk_ids_hookfunc=None,): """ flashinfer nvfp4 fusedmoe for Model Optimizer """ @@ -591,6 +592,9 @@ def apply(self, layer, x, gate): True, # apply_norm_weight, False, ) + + if topk_ids_hookfunc is not None: + topk_ids_hookfunc(topk_ids) output_dtype = x.dtype x_sf = None From 798cb6b36abb09d30338c8dc8a40a1c361b57896 Mon Sep 17 00:00:00 2001 From: nyx-c-language Date: Tue, 13 Jan 2026 00:07:06 +0800 Subject: [PATCH 18/26] fix CodeStyle --- fastdeploy/model_executor/layers/quantization/nvfp4.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index 5a5f13297d9..fa95433f5a3 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -580,7 +580,13 @@ def process_weights_after_loading(self, layer): layer.down_proj_weight_scale = None create_parameter_and_copy(layer, name="down_proj_blockscale_swizzled", weight=down_proj_blockscale_swizzled) - def apply(self, layer, x, gate, topk_ids_hookfunc=None,): + def apply( + self, + layer, + x, + gate, + topk_ids_hookfunc=None, + ): """ flashinfer nvfp4 fusedmoe for Model Optimizer """ @@ -592,7 +598,7 @@ def apply(self, layer, x, gate, topk_ids_hookfunc=None,): True, # apply_norm_weight, False, ) - + if topk_ids_hookfunc is not None: topk_ids_hookfunc(topk_ids) From ca2a6991f92dd788c56392c6c048a6bba9c18b38 Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Tue, 13 Jan 2026 07:28:53 +0000 Subject: [PATCH 19/26] del the PADDLE_COMPATIBLE_API --- docs/quantization/nvfp4.md | 17 +++++------------ docs/zh/quantization/nvfp4.md | 19 +++++-------------- fastdeploy/flashinfer.py | 7 ------- .../layers/quantization/nvfp4.py | 3 +-- tests/quantization/test_modelopt_nvfp4.py | 2 +- 5 files changed, 12 insertions(+), 36 deletions(-) diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md index c8edd091c79..2f1831133b6 100644 --- a/docs/quantization/nvfp4.md +++ b/docs/quantization/nvfp4.md @@ -8,23 +8,16 @@ Based on [FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy s ## How to Use ### Environment Setup +#### Supported Environment - **Supported Hardware**: GPU sm >= 100 - **PaddlePaddle Version**: 3.3.0 or higher -- **Fastdeploy Version**: 2.4.0 or higher +- **Fastdeploy Version**: 2.5.0 or higher -#### 1. Fastdeploy Installation -First, install the Fastdeploy base environment according to the [Fastdeploy NVIDIA GPU Environment Installation Guide](../../get_started/installation/nvidia_gpu.md). - -#### 2. 
Flashinfer Installation -```bash -git clone -b support-paddlepaddle-with-compatible-api-and-tvmffi https://github.com/PFCCLab/flashinfer/ --recursive - -cd flashinfer -python -m pip install -v . -``` +#### FastDeploy Installation +Please ensure that FastDeploy is installed with NVIDIA GPU support. +Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU Environment Installation Guide](../../get_started/installation/nvidia_gpu.md). ### Running Inference Service -- Note: Need to set environment variable `export PADDLE_COMPATIBLE_API=true` and install the corresponding Flashinfer correctly ```bash export PADDLE_COMPATIBLE_API=true python -m fastdeploy.entrypoints.openai.api_server \ diff --git a/docs/zh/quantization/nvfp4.md b/docs/zh/quantization/nvfp4.md index 62e6e36aa57..845a5e6e4a0 100644 --- a/docs/zh/quantization/nvfp4.md +++ b/docs/zh/quantization/nvfp4.md @@ -8,26 +8,17 @@ NVFP4 是 NVIDIA 引入的创新 4 位浮点格式,详细介绍请参考[Intro - 注:目前该功能仅支持Ernie / Qwen系列的FP4量化模型。 ## 如何使用 -### 环境安装 +### 环境准备 +#### 支持环境 - **支持硬件**:GPU sm >= 100 - **PaddlePaddle 版本**:3.3.0 或更高版本 -- **Fastdeploy 版本**:2.4.0 或更高版本 +- **Fastdeploy 版本**:2.5.0 或更高版本 -#### 1. Fastdeploy 安装 -首先请根据[Fastdeploy NVIDIA GPU 环境安装指南](../../get_started/installation/nvidia_gpu.md),安装Fastdeploy基础环境。 - -#### 2. Flashinfer 安装 -```bash -git clone -b support-paddlepaddle-with-compatible-api-and-tvmffi https://github.com/PFCCLab/flashinfer/ --recursive - -cd flashinfer -python -m pip install -v . -``` +#### Fastdeploy 安装 +FastDeploy 需以 NVIDIA GPU 模式安装,具体安装方式请参考官方文档:[Fastdeploy NVIDIA GPU 环境安装指南](../../get_started/installation/nvidia_gpu.md)。 ### 运行推理服务 -- 注意:需要指定环境变量`export PADDLE_COMPATIBLE_API=true`并正确安装对应Flashinfer ```bash -export PADDLE_COMPATIBLE_API=true python -m fastdeploy.entrypoints.openai.api_server \ --model nv-community/Qwen3-30B-A3B-FP4 \ --port 8180 \ diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py index 23634faed5f..2d3d1befce1 100644 --- a/fastdeploy/flashinfer.py +++ b/fastdeploy/flashinfer.py @@ -17,21 +17,14 @@ import functools import importlib import importlib.util -import os import shutil -from paddleformers.utils.log import logger - @functools.cache def has_flashinfer() -> bool: """Return `True` if FlashInfer is available.""" # Use find_spec to check if the module exists without importing it # This avoids potential CUDA initialization side effects - if os.environ.get("PADDLE_COMPATIBLE_API", "0").lower() not in ["1", "on", "true"]: - # currently must support by Paddle compatible API - logger.warning("FlashInfer is not supported by Paddle compatible API.") - return False if importlib.util.find_spec("flashinfer") is None: return False # Also check if nvcc is available since it's required to JIT compile flashinfer diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index fa95433f5a3..d45bf69b99a 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -1,5 +1,5 @@ """ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -32,7 +32,6 @@ from .quant_base import QuantConfigBase, QuantMethodBase if has_flashinfer(): - # 加一个scope paddle.compat.enable_torch_proxy(scope={"flashinfer"}) from flashinfer import fp4_quantize from flashinfer import mm_fp4 as fp4_gemm diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py index 6015a0dff03..609726659b0 100644 --- a/tests/quantization/test_modelopt_nvfp4.py +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -1,5 +1,5 @@ """ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License" # you may not use this file except in compliance with the License. From 14fc296e4810c3054f0ed88ec9f1c4013abce2f1 Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Tue, 13 Jan 2026 08:17:46 +0000 Subject: [PATCH 20/26] fix broken url: nvidia_gpu.md --- docs/quantization/nvfp4.md | 2 +- docs/zh/quantization/nvfp4.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md index 2f1831133b6..241bf931241 100644 --- a/docs/quantization/nvfp4.md +++ b/docs/quantization/nvfp4.md @@ -15,7 +15,7 @@ Based on [FlashInfer](https://github.com/flashinfer-ai/flashinfer), Fastdeploy s #### FastDeploy Installation Please ensure that FastDeploy is installed with NVIDIA GPU support. -Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU Environment Installation Guide](../../get_started/installation/nvidia_gpu.md). +Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU Environment Installation Guide](https://paddlepaddle.github.io/FastDeploy/get_started/installation/nvidia_gpu/). ### Running Inference Service ```bash diff --git a/docs/zh/quantization/nvfp4.md b/docs/zh/quantization/nvfp4.md index 845a5e6e4a0..656cc267af1 100644 --- a/docs/zh/quantization/nvfp4.md +++ b/docs/zh/quantization/nvfp4.md @@ -15,7 +15,7 @@ NVFP4 是 NVIDIA 引入的创新 4 位浮点格式,详细介绍请参考[Intro - **Fastdeploy 版本**:2.5.0 或更高版本 #### Fastdeploy 安装 -FastDeploy 需以 NVIDIA GPU 模式安装,具体安装方式请参考官方文档:[Fastdeploy NVIDIA GPU 环境安装指南](../../get_started/installation/nvidia_gpu.md)。 +FastDeploy 需以 NVIDIA GPU 模式安装,具体安装方式请参考官方文档:[Fastdeploy NVIDIA GPU 环境安装指南](https://paddlepaddle.github.io/FastDeploy/zh/get_started/installation/nvidia_gpu/)。 ### 运行推理服务 ```bash From a25fea0da153eb9ad7b22fd6ba7a8e82e22c8d0e Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Tue, 13 Jan 2026 10:44:32 +0000 Subject: [PATCH 21/26] fix docs --- docs/quantization/nvfp4.md | 1 - fastdeploy/flashinfer.py | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/docs/quantization/nvfp4.md b/docs/quantization/nvfp4.md index 241bf931241..e89b31dcd89 100644 --- a/docs/quantization/nvfp4.md +++ b/docs/quantization/nvfp4.md @@ -19,7 +19,6 @@ Follow the official guide to set up the base environment: [Fastdeploy NVIDIA GPU ### Running Inference Service ```bash -export PADDLE_COMPATIBLE_API=true python -m fastdeploy.entrypoints.openai.api_server \ --model nv-community/Qwen3-30B-A3B-FP4 \ --port 8180 \ diff --git a/fastdeploy/flashinfer.py b/fastdeploy/flashinfer.py index 2d3d1befce1..c76564196a2 100644 --- a/fastdeploy/flashinfer.py +++ b/fastdeploy/flashinfer.py @@ -1,5 +1,5 @@ """ -# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. From b3e600d35fda0392938133c78783c46b3fba6039 Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Mon, 19 Jan 2026 11:22:46 +0000 Subject: [PATCH 22/26] fix token_ids --- .../layers/quantization/__init__.py | 10 ++++++++-- fastdeploy/worker/gpu_model_runner.py | 19 +++++++++++-------- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index 53fdb7ea0dd..da88bc8330a 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -154,7 +154,6 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .block_wise_fp8 import BlockWiseFP8Config from .kv_cache import KvCacheQuantConfig from .mix_quant import MixQuantConfig - from .nvfp4 import ModelOptNvFp4Config from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -163,6 +162,14 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .wfp8afp8 import WFP8AFP8Config from .wint2 import WINT2Config + if quantization == "modelopt_fp4": + try: + from .nvfp4 import ModelOptNvFp4Config + + return ModelOptNvFp4Config + except ImportError as e: + raise ImportError(f"Failed to import ModelOptNvFp4Config. Details: {e}") + method_to_config: Dict[str, Type[QuantConfigBase]] = { "wint2": WINT2Config, "wint4": WINT4Config, @@ -176,7 +183,6 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: "tensor_wise_fp8": TensorWiseFP8Config, "kvcache": KvCacheQuantConfig, "mix_quant": MixQuantConfig, - "modelopt_fp4": ModelOptNvFp4Config, } return method_to_config[quantization] diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 768e59b2460..511dccb2d77 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1274,12 +1274,6 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_think_lens"] = paddle.full(shape=[max_num_seqs, 1], fill_value=-1, dtype="int32") self.share_inputs["limit_think_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - # NOTE(liuzichang): token after \n\n\n must be 100973 or 100975 - # It is a hard code to cover up model's performance - # Detailed notes can be found in FastDeploy/custom_ops/gpu_ops/reasoning_phase_token_constraint.cu - self.share_inputs["reasoning_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") - self.share_inputs["reasoning_allowed_tokens"] = paddle.to_tensor([100973, 100975], dtype="int64") - # Initialize rotary position embedding if not self.enable_mm: self.share_inputs["rope_emb"] = get_rope( @@ -2017,6 +2011,17 @@ def _dummy_run( self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph self.padding_cudagraph_inputs() + # Replace uninitialized tensors with valid random token IDs. + if hasattr(self.forward_meta, "ids_remove_padding") and self.forward_meta.ids_remove_padding is not None: + vocab_size = getattr(self.model_config, "vocab_size", 32000) + + self.forward_meta.ids_remove_padding = paddle.randint( + low=0, + high=vocab_size, + shape=self.forward_meta.ids_remove_padding.shape, + dtype=self.forward_meta.ids_remove_padding.dtype, + ) + # 3. 
Run model if self.enable_mm: model_output = self.model( @@ -2766,8 +2771,6 @@ def clear_requests(self): self.prompt_logprobs_reqs.clear() self.in_progress_prompt_logprobs.clear() self.forward_batch_reqs_list = [None for _ in range(self.scheduler_config.max_num_seqs)] - if self.fd_config.routing_replay_config.enable_routing_replay: - self.routing_replay_manager.put_table_to_store() def update_parameters(self, pid): """Dynamic model loader use to update parameters use for RL""" From ee8f622ba6282ba45f0edf5cb1415aad11470c0b Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Mon, 19 Jan 2026 11:41:08 +0000 Subject: [PATCH 23/26] fix CI in Hopper --- fastdeploy/worker/gpu_model_runner.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index fde07760d46..e979a1ee1ca 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -1274,6 +1274,12 @@ def _init_share_inputs(self, max_num_seqs: int): self.share_inputs["max_think_lens"] = paddle.full(shape=[max_num_seqs, 1], fill_value=-1, dtype="int32") self.share_inputs["limit_think_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + # NOTE(liuzichang): token after \n\n\n must be 100973 or 100975 + # It is a hard code to cover up model's performance + # Detailed notes can be found in FastDeploy/custom_ops/gpu_ops/reasoning_phase_token_constraint.cu + self.share_inputs["reasoning_status"] = paddle.full(shape=[max_num_seqs, 1], fill_value=0, dtype="int32") + self.share_inputs["reasoning_allowed_tokens"] = paddle.to_tensor([100973, 100975], dtype="int64") + # Initialize rotary position embedding if not self.enable_mm: self.share_inputs["rope_emb"] = get_rope( @@ -2015,10 +2021,8 @@ def _dummy_run( self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph self.padding_cudagraph_inputs() - # Replace uninitialized tensors with valid random token IDs. 
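The hunk above (and the block that follows) touches the dummy-run path where the flattened token-id buffer is filled with random but in-vocabulary ids, so that profiling or capture passes never index the embedding table out of range. A minimal sketch of that mechanism, with assumed names rather than the runner's real attributes, is:

```python
import paddle

# Illustrative sketch of the dummy-run token-id fill; "ids" and "vocab_size" are
# assumed names, not the model runner's real attributes.
def fill_dummy_token_ids(ids: paddle.Tensor, vocab_size: int) -> paddle.Tensor:
    return paddle.randint(low=0, high=vocab_size, shape=ids.shape, dtype=ids.dtype)


ids = paddle.zeros([8], dtype="int64")
ids = fill_dummy_token_ids(ids, vocab_size=32000)
```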
if hasattr(self.forward_meta, "ids_remove_padding") and self.forward_meta.ids_remove_padding is not None: vocab_size = getattr(self.model_config, "vocab_size", 32000) - self.forward_meta.ids_remove_padding = paddle.randint( low=0, high=vocab_size, @@ -2775,6 +2779,8 @@ def clear_requests(self): self.prompt_logprobs_reqs.clear() self.in_progress_prompt_logprobs.clear() self.forward_batch_reqs_list = [None for _ in range(self.scheduler_config.max_num_seqs)] + if self.fd_config.routing_replay_config.enable_routing_replay: + self.routing_replay_manager.put_table_to_store() def update_parameters(self, pid): """Dynamic model loader use to update parameters use for RL""" From 4057e1e1ab17eac96962bba23413c3d49428c7bb Mon Sep 17 00:00:00 2001 From: Echo-Nie Date: Tue, 20 Jan 2026 08:26:35 +0000 Subject: [PATCH 24/26] move flashinfer imports inside the function --- .../model_executor/layers/quantization/__init__.py | 10 ++-------- .../model_executor/layers/quantization/nvfp4.py | 11 ++++++++--- 2 files changed, 10 insertions(+), 11 deletions(-) diff --git a/fastdeploy/model_executor/layers/quantization/__init__.py b/fastdeploy/model_executor/layers/quantization/__init__.py index da88bc8330a..53fdb7ea0dd 100644 --- a/fastdeploy/model_executor/layers/quantization/__init__.py +++ b/fastdeploy/model_executor/layers/quantization/__init__.py @@ -154,6 +154,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .block_wise_fp8 import BlockWiseFP8Config from .kv_cache import KvCacheQuantConfig from .mix_quant import MixQuantConfig + from .nvfp4 import ModelOptNvFp4Config from .tensor_wise_fp8 import TensorWiseFP8Config from .w4a8 import W4A8Config from .w4afp8 import W4AFP8Config @@ -162,14 +163,6 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: from .wfp8afp8 import WFP8AFP8Config from .wint2 import WINT2Config - if quantization == "modelopt_fp4": - try: - from .nvfp4 import ModelOptNvFp4Config - - return ModelOptNvFp4Config - except ImportError as e: - raise ImportError(f"Failed to import ModelOptNvFp4Config. Details: {e}") - method_to_config: Dict[str, Type[QuantConfigBase]] = { "wint2": WINT2Config, "wint4": WINT4Config, @@ -183,6 +176,7 @@ def get_quantization_config(quantization: str) -> Type[QuantConfigBase]: "tensor_wise_fp8": TensorWiseFP8Config, "kvcache": KvCacheQuantConfig, "mix_quant": MixQuantConfig, + "modelopt_fp4": ModelOptNvFp4Config, } return method_to_config[quantization] diff --git a/fastdeploy/model_executor/layers/quantization/nvfp4.py b/fastdeploy/model_executor/layers/quantization/nvfp4.py index d45bf69b99a..cd17b53f34b 100644 --- a/fastdeploy/model_executor/layers/quantization/nvfp4.py +++ b/fastdeploy/model_executor/layers/quantization/nvfp4.py @@ -33,9 +33,6 @@ if has_flashinfer(): paddle.compat.enable_torch_proxy(scope={"flashinfer"}) - from flashinfer import fp4_quantize - from flashinfer import mm_fp4 as fp4_gemm - from flashinfer.fused_moe import cutlass_fused_moe as flashinfer_cutlass_fused_moe else: logger.warning("FlashInfer is not installed. 
For nvFp4 inference, please install Flashinfer.") @@ -327,6 +324,8 @@ def apply( output_dtype = x.dtype # Quantize BF16 or FP16 to (FP4 and interleaved block scale) + from flashinfer import fp4_quantize + x_fp4, x_scale_interleaved = fp4_quantize(x, layer.input_scale_inv) assert x_fp4.dtype == paddle.uint8 @@ -345,6 +344,8 @@ def apply( if backend == "cutlass": x_scale_interleaved = x_scale_interleaved.view(paddle.uint8) w_scale_interleaved = w_scale_interleaved.view(paddle.uint8) + from flashinfer import mm_fp4 as fp4_gemm + out = fp4_gemm(x_fp4, w, x_scale_interleaved, w_scale_interleaved, layer.alpha, output_dtype, backend=backend) if layer.with_bias: out = paddle.add(out, layer.bias) @@ -607,6 +608,10 @@ def apply( if self.backend == "flashinfer-cutlass": # flashinfer cutlass + from flashinfer.fused_moe import ( + cutlass_fused_moe as flashinfer_cutlass_fused_moe, + ) + _ = flashinfer_cutlass_fused_moe( input=x, token_selected_experts=topk_ids.to(paddle.int), From f9ec3445ab98f2471918dce11a80baf7fcd5c9b7 Mon Sep 17 00:00:00 2001 From: xxxuan <157974576+Echo-Nie@users.noreply.github.com> Date: Wed, 21 Jan 2026 00:14:01 +0800 Subject: [PATCH 25/26] fix model_runner Removed the logic for generating random padding IDs. --- fastdeploy/worker/gpu_model_runner.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py index 116f6f5270a..26872b4aa06 100644 --- a/fastdeploy/worker/gpu_model_runner.py +++ b/fastdeploy/worker/gpu_model_runner.py @@ -2030,15 +2030,6 @@ def _dummy_run( self.forward_meta.step_use_cudagraph = in_capturing and self.forward_meta.step_use_cudagraph self.padding_cudagraph_inputs() - if hasattr(self.forward_meta, "ids_remove_padding") and self.forward_meta.ids_remove_padding is not None: - vocab_size = getattr(self.model_config, "vocab_size", 32000) - self.forward_meta.ids_remove_padding = paddle.randint( - low=0, - high=vocab_size, - shape=self.forward_meta.ids_remove_padding.shape, - dtype=self.forward_meta.ids_remove_padding.dtype, - ) - # 3. Run model if self.enable_mm: model_output = self.model( From fb71ccaf511dcbac21ca06d770edf1f1c896466f Mon Sep 17 00:00:00 2001 From: xxxuan <157974576+Echo-Nie@users.noreply.github.com> Date: Thu, 22 Jan 2026 17:47:40 +0800 Subject: [PATCH 26/26] Remove skip condition for CUDA version in nvfp4 test --- tests/quantization/test_modelopt_nvfp4.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/quantization/test_modelopt_nvfp4.py b/tests/quantization/test_modelopt_nvfp4.py index 609726659b0..577864adc15 100644 --- a/tests/quantization/test_modelopt_nvfp4.py +++ b/tests/quantization/test_modelopt_nvfp4.py @@ -35,10 +35,6 @@ def get_sm_version(): return cc -@unittest.skipIf( - not paddle.is_compiled_with_cuda() or get_sm_version() < 100, - "Nvfp4 do not support sm < 100.", -) class TestModelOptNvFp4Config(unittest.TestCase): def setUp(self): prop = paddle.device.cuda.get_device_properties()
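One pattern worth calling out from the later patches is the guarded, function-local import of the optional flashinfer dependency: importing the quantization module never pulls in flashinfer (or triggers its JIT/CUDA setup) on machines that do not have it. A hedged sketch of the idea, using illustrative helper names rather than FastDeploy's real ones, is:

```python
import importlib.util


def _has_flashinfer() -> bool:
    # Cheap availability check that avoids importing (and initializing) the package.
    return importlib.util.find_spec("flashinfer") is not None


def quantize_to_fp4(x, global_scale):
    if not _has_flashinfer():
        raise RuntimeError("flashinfer is required for NVFP4 inference but is not installed")
    # Imported lazily, only on the hot path that actually needs it.
    from flashinfer import fp4_quantize

    return fp4_quantize(x, global_scale)
```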