diff --git a/MANIFEST.in b/MANIFEST.in index 7f0629d20..8dd1afe0e 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ recursive-include gptqmodel_ext/awq *.h *.cuh *.cu *.cpp -recursive-include gptqmodel_ext/exllama *.h *.cuh *.cu *.cpp recursive-include gptqmodel_ext/exllamav2 *.h *.cuh *.cu *.cpp recursive-include gptqmodel_ext/exllama_eora/eora *.h *.cuh *.cu *.cpp *.py recursive-include gptqmodel_ext/marlin *.h *.cuh *.cu *.cpp *.hpp diff --git a/examples/README.md b/examples/README.md index e42d5b3d4..29b841be3 100644 --- a/examples/README.md +++ b/examples/README.md @@ -27,7 +27,7 @@ python basic_usage_bitblas.py To Execute `basic_usage_exllama.py`, using command like this: ```shell -python basic_usage_exllama.py --backend EXLLAMA/EXLLAMA_V2 +python basic_usage_exllama.py --backend EXLLAMA_V2 ``` To Execute `basic_usage_marlin.py`, using command like this: @@ -104,4 +104,3 @@ CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_id_or_path PATH/TO/MOD ``` Use `--help` flag to see detailed descriptions for more command arguments. - diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py index 4c0978702..80f0f004d 100644 --- a/examples/benchmark/perplexity.py +++ b/examples/benchmark/perplexity.py @@ -42,7 +42,7 @@ parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?") parser.add_argument("--use_fast_tokenizer", action="store_true", help="Whether to use fast tokenizer") parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code") - parser.add_argument("--backend", choices=['auto', 'marlin', 'exllama_v1', 'exllama_v2', 'triton', 'cuda', 'torch', 'ipex', 'bitblas'], default='auto', help="Whether to use BACKEND format") + parser.add_argument("--backend", choices=['auto', 'marlin', 'exllama_v2', 'triton', 'cuda', 'torch', 'ipex', 'bitblas'], default='auto', help="Whether to use BACKEND format") args = parser.parse_args() tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=args.use_fast_tokenizer) diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py index 0da3fa820..ff85a5a3d 100644 --- a/gptqmodel/__init__.py +++ b/gptqmodel/__init__.py @@ -52,7 +52,6 @@ from .models.auto import ASCII_LOGO from .quantization import BaseQuantizeConfig, GPTAQConfig, QuantizeConfig from .utils import BACKEND -from .utils.exllama import exllama_set_max_input_length from .version import __version__ diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py index 2085d44bb..d0f0c4f06 100644 --- a/gptqmodel/models/_const.py +++ b/gptqmodel/models/_const.py @@ -128,6 +128,4 @@ def get_best_device(backend: BACKEND = BACKEND.AUTO) -> torch.device: else: return CPU -EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048 - EXPERT_INDEX_PLACEHOLDER = "{expert_index}" diff --git a/gptqmodel/nn_modules/qlinear/exllama.py b/gptqmodel/nn_modules/qlinear/exllama.py deleted file mode 100644 index 8f677dbaa..000000000 --- a/gptqmodel/nn_modules/qlinear/exllama.py +++ /dev/null @@ -1,170 +0,0 @@ -# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai -# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai -# SPDX-License-Identifier: Apache-2.0 -# Contact: qubitium@modelcloud.ai, x.com/qubitium - -# Adapted from turboderp exllama: https://github.com/turboderp/exllama - -from typing import List, Optional, Tuple - -import torch - -from ...adapter.adapter import Adapter, Lora -from ...models._const import DEVICE, PLATFORM -from ...quantization import FORMAT, METHOD -from ...utils.backend import BACKEND -from ...utils.logger import setup_logger -from . import BaseQuantLinear - - -log = setup_logger() - -# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension -NONE_TENSOR = torch.empty((1, 1), device="meta") - -class ExllamaQuantLinear(BaseQuantLinear): - SUPPORTS_BACKENDS = [BACKEND.EXLLAMA_V1] - SUPPORTS_METHODS = [METHOD.GPTQ] - SUPPORTS_FORMATS = {FORMAT.GPTQ: 70, FORMAT.GPTQ_V2: 70} - SUPPORTS_BITS = [4] - SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128] - SUPPORTS_DESC_ACT = [True, False] - SUPPORTS_SYM = [True, False] - SUPPORTS_SHARDS = True - SUPPORTS_TRAINING = False - SUPPORTS_AUTO_PADDING = False - SUPPORTS_IN_FEATURES_DIVISIBLE_BY = [32] - SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32] - - SUPPORTS_DEVICES = [DEVICE.CUDA, DEVICE.ROCM] - SUPPORTS_PLATFORM = [PLATFORM.LINUX] - SUPPORTS_PACK_DTYPES = [torch.int32] - SUPPORTS_ADAPTERS = [Lora] - - SUPPORTS_DTYPES = [torch.float16, torch.bfloat16] - - REQUIRES_FORMAT_V2 = True - - # for transformers/optimum tests compat - QUANT_TYPE = "exllama" - - gptqmodel_exllama_kernels = None - - """Linear layer implementation with per-group 4-bit quantization of the weights""" - - def __init__( - self, - bits: int, - group_size: int, - desc_act: bool, - sym: bool, - in_features: int, - out_features: int, - bias: bool = False, - pack_dtype: torch.dtype = torch.int32, - adapter: Adapter = None, - register_buffers: bool = True, - **kwargs, - ): - # backup original values - # self.original_out_features = out_features - # self.original_in_features = in_features - # - # # auto pad - # group_size = group_size if group_size != -1 else in_features - # out_features = out_features + (-out_features % 32) - # in_features = in_features + (-in_features % group_size) - # self.in_features_padding_size = in_features - self.original_in_features - # self.in_features_padding_shape = (0, self.in_features_padding_size) - - super().__init__( - bits=bits, - group_size=group_size, - sym=sym, desc_act=desc_act, - in_features=in_features, - out_features=out_features, - bias=bias, - pack_dtype=pack_dtype, - backend=kwargs.pop("backend", BACKEND.EXLLAMA_V1), - adapter=adapter, - register_buffers=register_buffers, - register_buffers_in_features=in_features, - register_buffers_out_feature=out_features, - **kwargs) - - @classmethod - def validate_once(cls) -> Tuple[bool, Optional[Exception]]: - try: - import gptqmodel_exllama_kernels - cls.gptqmodel_exllama_kernels = gptqmodel_exllama_kernels - return True, None - except ImportError as e: - return False, e - - def post_init(self): - # resize due to padding after model weights have been loaded - # if self.out_features != self.original_out_features or self.in_features != self.original_in_features: - # self.qweight.resize_(self.in_features // self.pack_dtype_bits * self.bits, self.out_features) - # self.qzeros.resize_( - # math.ceil(self.in_features / self.group_size), - # self.out_features // self.pack_dtype_bits * self.bits - # ) - # self.scales.resize_((math.ceil(self.in_features / self.group_size), self.out_features), ) - # self.g_idx = torch.tensor([i // self.group_size for i in range(self.in_features)], dtype=torch.int32, device=self.g_idx.device) - # if self.bias is not None: - # self.bias.resize_(self.out_features) - - # ext_make_q4 only accept float16 scales - self.scales = self.scales.to(dtype=torch.float16) - - self.width = self.qweight.shape[1] - - # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx. - self.q4 = self.gptqmodel_exllama_kernels.make_q4(self.qweight, self.qzeros, self.scales, self.g_idx.to("cpu") if self._use_act_order else NONE_TENSOR, self.qweight.device.index) - - super().post_init() - - def list_buffers(self) -> List: - buf = super().list_buffers() - if hasattr(self, "q4") and self.q4 is not None: - buf.append(self.q4) - return buf - - def ext_q4_matmul(self, x, q4, q4_width): - """Matrix multiplication, returns x @ q4""" - outshape = x.shape[:-1] + (q4_width,) - x = x.view(-1, x.shape[-1]) - - output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device) - self.gptqmodel_exllama_kernels.q4_matmul(x, q4, output) - - if self.bias is not None: - output.add_(self.bias) - - if self.adapter: - output = self.adapter.apply(x=x, out=output) - - return output.view(outshape) - - def forward(self, x: torch.Tensor): - # TODO FIXME: parent should never call us if there is no data to process - # check: https://github.com/ModelCloud/GPTQModel/issues/1361 - if x.shape[0] == 0: - return torch.empty((0, self.out_features), dtype=x.dtype, device=x.device) - - x_dtype = x.dtype - if x_dtype != torch.float16: - #log.warn.once( - # f"Exllama kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model." - #) - - x = x.to(dtype=torch.float16) - - # TODO: need to run checks to make sure there is no performance regression padding with F.pad - # if in_features is padded, we need to pad the input as well - # if x.size(-1) != self.in_features: - # x = F.pad(x, self.in_features_padding_shape) - - out = self.ext_q4_matmul(x, self.q4, self.width) - - return out.to(x_dtype) diff --git a/gptqmodel/nn_modules/qlinear/exllama_awq.py b/gptqmodel/nn_modules/qlinear/exllama_awq.py deleted file mode 100644 index a875388ac..000000000 --- a/gptqmodel/nn_modules/qlinear/exllama_awq.py +++ /dev/null @@ -1,146 +0,0 @@ -# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai -# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai -# SPDX-License-Identifier: Apache-2.0 -# Contact: qubitium@modelcloud.ai, x.com/qubitium - -import torch - -from ...adapter.adapter import Adapter, Lora -from ...models._const import DEVICE, PLATFORM -from ...nn_modules.qlinear import AWQuantLinear -from ...quantization import FORMAT, METHOD -from ...quantization.awq.utils.module import try_import -from ...quantization.awq.utils.packing_utils import unpack_reorder_pack -from ...utils.backend import BACKEND -from ...utils.logger import setup_logger - - -log = setup_logger() - -exl_ext, msg = try_import("gptqmodel_exllama_kernels") - -# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension -none_tensor = torch.empty((1, 1), device="meta") - - -class AwqExllamaQuantLinear(AWQuantLinear): - SUPPORTS_BACKENDS = [BACKEND.EXLLAMA_V1] - SUPPORTS_METHODS = [METHOD.AWQ] - SUPPORTS_FORMATS = {FORMAT.GEMM: 70} - SUPPORTS_BITS = [4] - SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128] - SUPPORTS_DESC_ACT = [True, False] - SUPPORTS_SYM = [True, False] - SUPPORTS_SHARDS = True - SUPPORTS_TRAINING = False - SUPPORTS_AUTO_PADDING = False - SUPPORTS_IN_FEATURES_DIVISIBLE_BY = [32] - SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32] - - SUPPORTS_DEVICES = [DEVICE.CUDA, DEVICE.ROCM] - SUPPORTS_PLATFORM = [PLATFORM.LINUX] - SUPPORTS_PACK_DTYPES = [torch.int32] - SUPPORTS_ADAPTERS = [Lora] - - SUPPORTS_DTYPES = [torch.float16, torch.bfloat16] - - REQUIRES_FORMAT_V2 = True - - # for transformers/optimum tests compat - QUANT_TYPE = "awq_exllama" - - def __init__( - self, - bits: int, - group_size: int, - sym: bool, - desc_act: bool, - in_features: int, - out_features: int, - bias: bool = False, - pack_dtype: torch.dtype = torch.int32, - adapter: Adapter = None, - **kwargs, - ): - super().__init__( - bits=bits, - group_size=group_size, - sym=sym, - desc_act=desc_act, - in_features=in_features, - out_features=out_features, - bias=bias, - pack_dtype=pack_dtype, - backend=kwargs.pop("backend", BACKEND.EXLLAMA_V1), - adapter=adapter, - **kwargs) - - def post_init(self): - # if self.padded_infeatures != self.in_features: - # self.qweight.resize_(self.padded_infeatures // self.pack_dtype_bits * self.bits, self.out_features) - # self.qzeros.resize_( - # math.ceil(self.padded_infeatures / self.group_size), - # self.out_features // self.pack_dtype_bits * self.bits - # ) - # self.scales.resize_((math.ceil(self.padded_infeatures / self.group_size), self.out_features), ) - # self.g_idx = torch.tensor([i // self.group_size for i in range(self.padded_infeatures)], dtype=torch.int32, - # device=self.g_idx.device) - - if exl_ext is None: - raise ModuleNotFoundError("External ExLlama kernels are not properly installed." + msg) - - # awq only accepts float16 - self.scales = self.scales.to(dtype=torch.float16) - - assert self.qweight.device.type == "cuda" - assert self.qweight.device.index is not None - - self.qweight, self.qzeros = unpack_reorder_pack( - self.qweight, self.qzeros, self.bits - ) - self.q4 = exl_ext.make_q4( - self.qweight, - self.qzeros, - self.scales, - none_tensor, # g_idx - self.qweight.device.index, # device index - ) - - super().post_init() - - def forward(self, x: torch.Tensor): - assert self.q4 is not None, ( - "module.post_init() must be called before module.forward(). " - "Use exllama_post_init() on the whole model." - ) - if exl_ext is None: - raise ModuleNotFoundError("External ExLlama kernels are not properly installed." + msg) - - input_dtype = x.dtype - out_shape = x.shape[:-1] + (self.out_features,) - - if input_dtype != torch.float16: - x = x.to(dtype=torch.float16) - - x = x.view(-1, x.shape[-1]) - - out = torch.empty( - (x.shape[0], self.out_features), - dtype=torch.float16, - device=x.device, - ) - exl_ext.q4_matmul(x, self.q4, out) - - if input_dtype != torch.float16: - out = out.to(dtype=input_dtype) - - if self.bias is not None: - out.add_(self.bias) - - if self.adapter: - out = self.adapter.apply(x=x, out=out) - - return out.view(out_shape) - - -__all__ = ["AwqExllamaQuantLinear"] diff --git a/gptqmodel/utils/backend.py b/gptqmodel/utils/backend.py index b715e035e..5912dfd04 100644 --- a/gptqmodel/utils/backend.py +++ b/gptqmodel/utils/backend.py @@ -15,7 +15,6 @@ class BACKEND(str, Enum): TORCH_INT8 = "torch_int8" # optimized CPU int8 fused kernel TORCH = "torch" # GOOD: about 80% of triton TRITON = "triton" # VERY GOOD: all-around kernel - EXLLAMA_V1 = "exllama_v1" # FAST: optimized for batching == 1 EXLLAMA_V2 = "exllama_v2" # FASTER: optimized for batching > 1 EXLLAMA_EORA = "exllama_eora" MACHETE = "machete" # CUTLASS-based kernel optimized for Hopper (SM90+) diff --git a/gptqmodel/utils/exllama.py b/gptqmodel/utils/exllama.py deleted file mode 100644 index 053bbc894..000000000 --- a/gptqmodel/utils/exllama.py +++ /dev/null @@ -1,83 +0,0 @@ -# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai -# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai -# SPDX-License-Identifier: Apache-2.0 -# Contact: qubitium@modelcloud.ai, x.com/qubitium - -import torch - -from ..nn_modules.qlinear.exllama import ExllamaQuantLinear -from .torch import torch_empty_cache - - -def exllama_set_max_input_length(model, max_input_length: int): - """ - This method does not necessarily require `model` to inherit from BaseGPTQForCausalLM. - - When using the exllama backend with act-order, it is necessary to initialize a buffer that depends on the maximum expected input length. In case the - default used (EXLLAMA_DEFAULT_MAX_INPUT_LENGTH) is too short, this method can be called to extend the buffer size without reloading the whole model. - """ - - # The import is set here to avoid a global import. Arguably this is quite ugly, it would be better to have lazy loading. - from gptqmodel_exllama_kernels import cleanup_buffers_cuda, prepare_buffers - - if not model.quantize_config.desc_act: - raise ValueError( - "The method exllama_set_max_input_length should be called only when using the exllama backend **with act-order**." - ) - - uses_exllama_v1 = False - for name, submodule in model.named_modules(): - if isinstance(submodule, ExllamaQuantLinear): - uses_exllama_v1 = True - break - - if not uses_exllama_v1: - raise ValueError( - f"The function exllama_set_max_input_length was called, but the model (instance of {model.__class__.__name__}) does not use the exllama backend for GPTQ. An other implementation is used (exllamav2, triton) and that the call to exllama_set_max_input_length is unnecessary. Please remove the call to exllama_set_max_input_length or use the exllama v1 backend." - ) - - device_to_buffers_size = {} - for device, buffers in model.device_to_buffers.items(): - device_to_buffers_size[device] = { - "max_dq_buffer_size": buffers["max_dq_buffer_size"], - "max_inner_outer_dim": buffers["max_inner_outer_dim"], - } - - # For an unknown reason calling just `del model.device_to_buffers` raises an AttributeError. - for key in list(model.device_to_buffers.keys()): - del model.device_to_buffers[key] - model.device_to_buffers = None - del model.device_to_buffers - - torch_empty_cache() - cleanup_buffers_cuda() - - device_to_buffers = {} - for device, buffers_size in device_to_buffers_size.items(): - # The temp_state buffer is required to reorder X in the act-order case. - # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. - device_to_buffers[device] = { - "temp_state": torch.zeros( - (max_input_length, buffers_size["max_inner_outer_dim"]), - dtype=torch.float16, - device=device, - ), - "temp_dq": torch.zeros( - (1, buffers_size["max_dq_buffer_size"]), - dtype=torch.float16, - device=device, - ), - "max_dq_buffer_size": buffers_size["max_dq_buffer_size"], - "max_inner_outer_dim": buffers_size["max_inner_outer_dim"], - } - - prepare_buffers( - device, - device_to_buffers[device]["temp_state"], - device_to_buffers[device]["temp_dq"], - ) - - # Buffers need to be persistent to avoid any bug. - model.device_to_buffers = device_to_buffers - - return model diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py index 3ba42dde7..072216a16 100644 --- a/gptqmodel/utils/model.py +++ b/gptqmodel/utils/model.py @@ -41,12 +41,10 @@ from ..models._const import ( CPU, DEVICE, - EXLLAMA_DEFAULT_MAX_INPUT_LENGTH, EXPERT_INDEX_PLACEHOLDER, SUPPORTS_MODULE_TYPES, ) from ..nn_modules.qlinear import BaseQuantLinear -from ..nn_modules.qlinear.exllama import ExllamaQuantLinear from ..nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear from ..nn_modules.qlinear.exllamav2_awq import AwqExllamaV2QuantLinear from ..quantization import FORMAT, QuantizeConfig @@ -975,14 +973,8 @@ def hf_gptqmodel_post_init(model, use_act_order: bool, quantize_config: Quantize def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeConfig = None, max_input_length: Optional[int] = None): """ - The max_input_length argument is specific to the exllama backend, that requires to initialize a buffer temp_state. + Initialize model-persistent backend scratch buffers after quantized weights are loaded. """ - # post init for bitblas backend. - device_to_buffers_size = {} - # exllama - model_uses_exllama = False - - # exllamav2 fixed_bytes = {} model_uses_exllamav2 = False @@ -1000,86 +992,6 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon max_batch_size=int(os.getenv("AWQ_BATCH_SIZE", 1)) ) fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0)) - elif isinstance(submodule, ExllamaQuantLinear): - model_uses_exllama = True - device = submodule.qweight.device - if device not in device_to_buffers_size: - device_to_buffers_size[device] = { - "max_dq_buffer_size": 1, - "max_inner_outer_dim": 1, - } - submodule._use_act_order = True if use_act_order else False - - # Disable this heuristic for detecting act_order, but it could be used instead of the config. - """ - if submodule.g_idx is None: - submodule.act_order = False - elif submodule.g_idx is not None and ((submodule.g_idx == 0).all() or torch.equal(submodule.g_idx.cpu(), torch.tensor([i // submodule.group_size for i in range(submodule.g_idx.shape[0])], dtype=torch.int32))): - submodule.g_idx = None - submodule.act_order = False - else: - submodule.act_order = True - """ - - device_to_buffers_size[device]["max_dq_buffer_size"] = max( - device_to_buffers_size[device]["max_dq_buffer_size"], - submodule.qweight.numel() * 8, - ) - - if use_act_order: - device_to_buffers_size[device]["max_inner_outer_dim"] = max( - device_to_buffers_size[device]["max_inner_outer_dim"], - submodule.in_features, - submodule.out_features, - ) - - if model_uses_exllama: - # To be honest this is quite ugly, not proud of this. - from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params - - device_to_buffers = {} - - if use_act_order: - if max_input_length is None: - max_input_len = EXLLAMA_DEFAULT_MAX_INPUT_LENGTH - else: - max_input_len = max_input_length - else: - if max_input_length is not None: - log.info( - "Using exllama backend without act-order, the parameter max_input_length was set although not needed, it will be ignored." - ) - max_input_len = 1 - - for device, buffers_size in device_to_buffers_size.items(): - # The temp_state buffer is required to reorder X in the act-order case. - # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill. - device_to_buffers[device] = { - "temp_state": torch.zeros( - (max_input_len, buffers_size["max_inner_outer_dim"]), - dtype=torch.float16, - device=device, - ), - "temp_dq": torch.zeros( - (1, buffers_size["max_dq_buffer_size"]), - dtype=torch.float16, - device=device, - ), - "max_dq_buffer_size": buffers_size["max_dq_buffer_size"], - "max_inner_outer_dim": buffers_size["max_inner_outer_dim"], - } - - # Buffers need to be persistent to avoid any bug. - model.device_to_buffers = device_to_buffers - - for device, buffers in model.device_to_buffers.items(): - prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"]) - - # Using the default from exllama repo here. - matmul_recons_thd = 16 - matmul_fused_remap = False - matmul_no_half2 = False - set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) if model_uses_exllamav2: from ..utils.exllamav2 import ScratchSpace @@ -1102,9 +1014,6 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon torch_empty_cache() - # if use_act_order and max_input_length and isinstance(submodule, ExllamaQuantLinear): - # model = exllama_set_max_input_length(model, max_input_length) - return model diff --git a/gptqmodel_ext/exllama/cu_compat.cuh b/gptqmodel_ext/exllama/cu_compat.cuh deleted file mode 100644 index c5258813e..000000000 --- a/gptqmodel_ext/exllama/cu_compat.cuh +++ /dev/null @@ -1,58 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _cuda_compat_cuh -#define _cuda_compat_cuh - -// atomicAdd for half types, to support CC < 7.x - -__device__ __forceinline__ void atomicAdd_half(half* address, half val) -{ - unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2)); - unsigned int old = *address_as_ui; - unsigned int assumed; - - do - { - assumed = old; - __half_raw hsum; - hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff); - half tmpres = __hadd(hsum, val); - hsum = __half_raw(tmpres); - old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x; - old = atomicCAS(address_as_ui, assumed, old); - } - while (assumed != old); -} - -// atomicAdd for half2 types - -__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val) -{ - unsigned int* address_as_ui = (unsigned int*)address; - unsigned int old = *address_as_ui; - unsigned int assumed; - do - { - assumed = old; - half2 old_val = *((half2*)&old); - half2 new_val = __hadd2(old_val, val); - old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val)); - } - while (assumed != old); -} - -// - -#if defined(__CUDA_ARCH__) || defined(USE_ROCM) -#if __CUDA_ARCH__ < 700 || defined(USE_ROCM) - -__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); } - -#if __CUDA_ARCH__ < 600 || defined(USE_ROCM) -__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); } -#endif - -#endif -#endif - -#endif diff --git a/gptqmodel_ext/exllama/cuda_buffers.cu b/gptqmodel_ext/exllama/cuda_buffers.cu deleted file mode 100644 index 4416027c8..000000000 --- a/gptqmodel_ext/exllama/cuda_buffers.cu +++ /dev/null @@ -1,75 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#define _cuda_buffers_cu -#include "cuda_buffers.cuh" - -CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL}; -// __constant__ half2 q4_table[16][256]; -// half2 q4_table_host[16][256]; -// bool q4_table_init = false; - -CudaBuffers::CudaBuffers -( - int _device, - int _temp_state_size, - half* _temp_state, - half* _temp_dq -) : - device(_device), - temp_state_size(_temp_state_size), - temp_state(_temp_state), - temp_dq(_temp_dq) -{ - cudaSetDevice(_device); - - cudaStreamCreate(&alt_stream_1); - cudaStreamCreate(&alt_stream_2); - cudaStreamCreate(&alt_stream_3); - cudaEventCreate(&alt_stream_1_done); - cudaEventCreate(&alt_stream_2_done); - cudaEventCreate(&alt_stream_3_done); -} - -CudaBuffers::~CudaBuffers() -{ - cudaStreamDestroy(alt_stream_1); - cudaStreamDestroy(alt_stream_2); - cudaStreamDestroy(alt_stream_3); - cudaEventDestroy(alt_stream_1_done); - cudaEventDestroy(alt_stream_2_done); - cudaEventDestroy(alt_stream_3_done); -} - -CudaBuffers* get_buffers(const int device_index) -{ - return g_buffers[device_index]; -} - -void prepare_buffers_cuda -( - int _device, - int _temp_state_size, - half* _temp_state, - half* _temp_dq -) -{ - CudaBuffers* buffers = new CudaBuffers - ( - _device, - _temp_state_size, - _temp_state, - _temp_dq - ); - - g_buffers[_device] = buffers; -} - -void cleanup_buffers_cuda() -{ - for (int i = 0; i < CUDA_MAX_DEVICES; i++) - { - if (!g_buffers[i]) continue; - delete g_buffers[i]; - g_buffers[i] = NULL; - } -} diff --git a/gptqmodel_ext/exllama/cuda_buffers.cuh b/gptqmodel_ext/exllama/cuda_buffers.cuh deleted file mode 100644 index 0bf2057c6..000000000 --- a/gptqmodel_ext/exllama/cuda_buffers.cuh +++ /dev/null @@ -1,55 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _cuda_buffers_cuh -#define _cuda_buffers_cuh - -#include -#include -#include -#include - -const int CUDA_MAX_DEVICES = 16; - -// #ifndef _cuda_buffers_cu -// extern __constant__ half2 q4_table[16][256]; -// #endif - -class CudaBuffers -{ -public: - int device; - - half* temp_state; // [max_hidden_rows * intermediate_size] - int temp_state_size; - half* temp_dq; // size of largest quant tensor * 8 - - cudaStream_t alt_stream_1; - cudaStream_t alt_stream_2; - cudaStream_t alt_stream_3; - cudaEvent_t alt_stream_1_done; - cudaEvent_t alt_stream_2_done; - cudaEvent_t alt_stream_3_done; - - CudaBuffers - ( - int _device, - int _temp_state_size, - half* _temp_state, - half* _temp_dq - ); - ~CudaBuffers(); -}; - -CudaBuffers* get_buffers(const int device_index); - -void prepare_buffers_cuda -( - int _device, - int _temp_state_size, - half* _temp_state, - half* _temp_dq -); - -void cleanup_buffers_cuda(); - -#endif diff --git a/gptqmodel_ext/exllama/cuda_func/column_remap.cu b/gptqmodel_ext/exllama/cuda_func/column_remap.cu deleted file mode 100644 index 30e4039dd..000000000 --- a/gptqmodel_ext/exllama/cuda_func/column_remap.cu +++ /dev/null @@ -1,63 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#include "column_remap.cuh" -#include "../util.cuh" - -const int SHUF_BLOCKSIZE_X = 256; -const int SHUF_BLOCKSIZE_Y = 16; - -__global__ void column_remap_kernel -( - const half* __restrict__ x, - half* __restrict__ x_new, - const int x_width, - const int x_height, - const uint32_t* x_map -) -{ - int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; - int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y; - if (x_column >= x_width) return; - //if (x_row >= x_height) return; - - int x_stride = x_width; - int x_idx = x_row * x_stride + x_column; - - int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height); - int x_idx_end = x_row_end * x_stride + x_column; - - int s_column = x_map[x_column]; - int s_idx = x_row * x_stride + s_column; - - while (x_idx < x_idx_end) - { - x_new[x_idx] = x[s_idx]; - x_idx += x_stride; - s_idx += x_stride; - } -} - -// Remap columns in x to correspond to sequential group index before matmul -// -// perform x -> seq_x such that seq_x @ seq_w == x @ w - -void column_remap_cuda -( - const half* x, - half* x_new, - const int x_height, - const int x_width, - const uint32_t* x_map -) -{ - dim3 threads(SHUF_BLOCKSIZE_X, 1, 1); - - dim3 blocks - ( - (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X, - (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y, - 1 - ); - - column_remap_kernel<<>>(x, x_new, x_width, x_height, x_map); -} diff --git a/gptqmodel_ext/exllama/cuda_func/column_remap.cuh b/gptqmodel_ext/exllama/cuda_func/column_remap.cuh deleted file mode 100644 index 6571c17d6..000000000 --- a/gptqmodel_ext/exllama/cuda_func/column_remap.cuh +++ /dev/null @@ -1,19 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _column_remap_cuh -#define _column_remap_cuh - -#include -#include -#include - -void column_remap_cuda -( - const half* x, - half* x_new, - const int x_height, - const int x_width, - const uint32_t* x_map -); - -#endif \ No newline at end of file diff --git a/gptqmodel_ext/exllama/cuda_func/q4_matmul.cu b/gptqmodel_ext/exllama/cuda_func/q4_matmul.cu deleted file mode 100644 index 7136a1a85..000000000 --- a/gptqmodel_ext/exllama/cuda_func/q4_matmul.cu +++ /dev/null @@ -1,260 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#include "q4_matmul.cuh" -#include "column_remap.cuh" -#include "../util.cuh" -#include "../matrix.cuh" -#include "../cu_compat.cuh" -#include "../cuda_buffers.cuh" -#if defined(USE_ROCM) -#include "../hip_compat.cuh" -#endif - -const int THREADS_X = 32; // Block size and thread count along columns in w and out -const int THREADS_Y = 1; // Block size and thread count along rows in x and out - -typedef void (*fp_q4_matmul_kernel) -( - const half*, - const uint32_t*, - half*, - const half*, - const uint32_t*, - const int, - const int, - const int, - const int, - const int, - const uint32_t*, - bool -); - -template -__global__ void q4_matmul_kernel -( - const half* __restrict__ x, - const uint32_t* __restrict__ w, - half* __restrict__ out, - const half* __restrict__ w_scales, - const uint32_t* __restrict__ w_zeros, - const int height, - const int dim, - const int width, - const int groupsize, - const int block_size_z, - const uint32_t* __restrict__ x_map, - bool no_zero -) -{ - // Start of block - - int x_column = block_size_z * blockIdx.z; - int x_column_end = min(dim, block_size_z * (blockIdx.z + 1)); - - int w_column = THREADS_X * blockIdx.x + threadIdx.x; - int x_row = THREADS_Y * blockIdx.y + threadIdx.y; - - int iterations = (x_column_end - x_column) / 8; - - // Views - - MatrixView_half x_(x, height, dim); - MatrixView_half w_scales_(w_scales, dim / groupsize, width); - MatrixView_q4_row w_zeros_(w_zeros, dim / groupsize, width); - MatrixView_q4_column w_(w, dim, width); - MatrixView_half_rw out_(out, height, width); - - // Zero output - - if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0) - { - *((uint32_t*) out_.item_ptr(x_row, w_column)) = 0; - __syncthreads(); - } - - // Loop over part of x row (and w column) - - half2 acc = {}; - half acc_h = {}; - - if constexpr (use_groupsize) - { - // For quant matrices where groupsize divides BLOCK_SIZE_Z we always start on a group boundary, so this - // could be slightly faster - - for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize) - { - if constexpr (use_half2) - { - half2 w_scale = w_scales_.item_half2half2(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) & 0x0f; // Avoid overflows. - - if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); - else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8); - } - else - { - half w_scale = w_scales_.item(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) & 0x0f; // Avoid overflows. - - if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map); - else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8); - } - } - } - else - { - // Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache - - for (int k = x_column; k < x_column + iterations * 8; k += 8) - { - if constexpr (use_half2) - { - int group = k / groupsize; - half2 w_scale = w_scales_.item_half2half2(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) & 0x0f; // Avoid overflows. - - if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map); - else acc = dot_product_8 (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1); - } - else - { - int group = k / groupsize; - half w_scale = w_scales_.item(group, w_column); - uint32_t w_zero = w_zeros_.item(group, w_column) & 0x0f; // Avoid overflows. - - if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map); - else acc_h = dot_product_8_h (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1); - } - } - } - - // Add to block result - - if constexpr (use_half2) - { - half result = __hadd(__low2half(acc), __high2half(acc)); - atomicAdd(out_.item_ptr(x_row, w_column), result); - } - else - { - atomicAdd(out_.item_ptr(x_row, w_column), acc_h); - } -} - -fp_q4_matmul_kernel q4_matmul_kernel_pick(ExLlamaTuning* tuningParams, int block_size_z, int groupsize, uint32_t* x_map) -{ - // - if (tuningParams->matmul_no_half2) { - if (block_size_z % groupsize == 0) { - if (x_map) return q4_matmul_kernel; - else return q4_matmul_kernel; - } else { - if (x_map) return q4_matmul_kernel; - else return q4_matmul_kernel; - } - } else { - if (block_size_z % groupsize == 0) - { - if (x_map) return q4_matmul_kernel; - else return q4_matmul_kernel; - } else { - if (x_map) return q4_matmul_kernel; - else return q4_matmul_kernel; - } - } -}; - -// Compute y = x @ w - -void q4_matmul_cuda -( - ExLlamaTuning* tuningParams, - const half* x, - const int x_height, - const Q4Matrix* w, - half* out, - bool no_zero, - cudaStream_t alt_stream -) -{ - int height = x_height; - int dim = w->height; - int width = w->width; - - cudaSetDevice(w->device); - - uint32_t* x_map = w->cuda_x_map; - const half* x_mapped = x; - if (x_map && !tuningParams->matmul_fused_remap && !alt_stream) - { - CudaBuffers* buffers = get_buffers(w->device); - column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map); - x_mapped = buffers->temp_state; - x_map = NULL; - } - - int block_size_z; - if (w->width == 4096) block_size_z = 384; // 7B - else if (w->width == 11008) block_size_z = 256; - else if (w->width == 5120) block_size_z = 384; // 13B - else if (w->width == 13824) block_size_z = 256; - else if (w->width == 6656) block_size_z = 256; // 33B - else if (w->width == 17920) block_size_z = 128; - else block_size_z = 256; - - //if (!no_zero) cudaMemsetAsync(out, 0, x_height * w->width * sizeof(half)); - - dim3 threads(THREADS_X, THREADS_Y, 1); - - dim3 blocks - ( - (width + threads.x - 1) / threads.x, - (height + threads.y - 1) / threads.y, - (dim + block_size_z - 1) / block_size_z - ); - - fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map); - - kernel<<>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero); -} - -void q4_matmul_recons_cuda -( - ExLlamaTuning* tuningParams, - const half* x, - const int x_height, - Q4Matrix* w, - half* out, - const cublasHandle_t handle, - bool no_zero -) -{ - int height = x_height; - int dim = w->height; - int width = w->width; - - cudaSetDevice(w->device); - CudaBuffers* buffers = get_buffers(w->device); - - const half* x_mapped = x; - if (w->cuda_x_map) - { - TORCH_CHECK(buffers->temp_state_size >= x_height * dim, "The temp_state buffer is too small in the exllama backend for GPTQ with act-order. Please call the exllama_set_max_input_length function to increase the buffer size for a sequence length >=", x_height, ":\nfrom gptqmodel import exllama_set_max_input_length\nmodel = exllama_set_max_input_length(model, max_input_length=", x_height, ")"); - column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map); - x_mapped = buffers->temp_state; - } - - w->reconstruct(buffers->temp_dq); - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700 - const float alpha = 1.0f; - const float beta = no_zero ? 1.0f : 0.0f; - cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width, - x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width); -#else - const half alpha = __float2half(1.0f); - const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f); - cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width); -#endif -} diff --git a/gptqmodel_ext/exllama/cuda_func/q4_matmul.cuh b/gptqmodel_ext/exllama/cuda_func/q4_matmul.cuh deleted file mode 100644 index 49967648f..000000000 --- a/gptqmodel_ext/exllama/cuda_func/q4_matmul.cuh +++ /dev/null @@ -1,43 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _q4_matmul_cuh -#define _q4_matmul_cuh - -#include -#include -#include -#include -#include - -#include "q4_matrix.cuh" -#include "../tuning.h" - -// Workaround for hipify_python using rocblas instead of hipblas. -#if defined(USE_ROCM) -#include -#define rocblas_handle hipblasHandle_t -#endif - -void q4_matmul_cuda -( - ExLlamaTuning* tuningParams, - const half* x, - const int x_height, - const Q4Matrix* w, - half* out, - bool no_zero = false, - cudaStream_t alt_stream = NULL -); - -void q4_matmul_recons_cuda -( - ExLlamaTuning* tuningParams, - const half* x, - const int x_height, - Q4Matrix* w, - half* out, - const cublasHandle_t handle, - bool no_zero = false -); - -#endif diff --git a/gptqmodel_ext/exllama/cuda_func/q4_matrix.cu b/gptqmodel_ext/exllama/cuda_func/q4_matrix.cu deleted file mode 100644 index d6e441ae6..000000000 --- a/gptqmodel_ext/exllama/cuda_func/q4_matrix.cu +++ /dev/null @@ -1,225 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#include "q4_matrix.cuh" -#include -#include "../util.cuh" -#include "../matrix.cuh" - -using namespace std; - -const int UNSHUF_BLOCKSIZE_X = 64; - -const int RECONS_THREADS_X = 64; // Block size and thread count along columns in out, each thread converts 1 column -const int RECONS_THREADS_Y = 1; // Block size and thread count along rows in x and out, each thread converts 8 rows - -vector g_q4_matrices; - -void g_q4_keep_matrix(Q4Matrix* m) -{ - g_q4_matrices.push_back(m); -} - -void g_q4_free_matrices() -{ - for (const auto& m : g_q4_matrices) delete m; - g_q4_matrices.clear(); -} - -Q4Matrix::Q4Matrix -( - const int _height, - const int _width, - const int _groups, - - uint32_t* _qweight, - uint32_t* _qzeros, - half* _scales, - uint32_t* _g_idx, - - const int _device -) : - height(_height), - width(_width), - groups(_groups), - device(_device) -{ - cudaSetDevice(device); - - cuda_qweight = _qweight; - cuda_qzeros = _qzeros; - cuda_scales = _scales; - - groupsize = height / groups; - - if (_g_idx) make_sequential(_g_idx); -} - -Q4Matrix::~Q4Matrix() -{ -} - -// Make sequential - -__global__ void make_sequential_kernel -( - const uint32_t* __restrict__ w, - uint32_t* __restrict__ w_new, - const uint32_t* __restrict__ x_map, - const int w_height, - const int w_width -) -{ - const uint64_t* w2 = (uint64_t*) w; - uint64_t* w_new2 = (uint64_t*) w_new; - int w2_stride = w_width >> 1; - - int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x; - if (w2_column >= w2_stride) return; - - int w_new2_row = blockIdx.y; - - int x_map_idx = w_new2_row << 3; - - uint64_t dst = 0; - - #pragma unroll - for (int i = 0; i < 8; i++) - { - int source_row = x_map[x_map_idx++]; - - int w2_row = source_row >> 3; - int w2_subrow = source_row & 0x07; - int w2_row_shift = w2_subrow << 2; - int wnew2_row_shift = i << 2; - - uint64_t src = w2[w2_row * w2_stride + w2_column]; - src >>= w2_row_shift; - src &= 0x0000000f0000000f; - src <<= wnew2_row_shift; - dst |= src; - } - - w_new2[w_new2_row * w2_stride + w2_column] = dst; -} - -void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx) -{ - uint32_t* cuda_new_qweight = NULL; - cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t)); - cudaMalloc(&cuda_x_map, height * sizeof(uint32_t)); // TODO: Should probably be allocated in PyTorch - - uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t)); - uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t)); - uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t)); - - // Group histogram - - for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++; - - // Group map - - for (int i = 0, acc = 0; i < groups; i++) - { - short tmp = cpu_g_idx_map[i]; - cpu_g_idx_map[i] = acc; - acc += tmp; - } - - // X map (inverse) - - for (int row = 0; row < height; row++) - { - uint32_t target_group = cpu_g_idx[row]; - uint32_t target_row = cpu_g_idx_map[target_group]; - cpu_g_idx_map[target_group]++; - cpu_x_map_inv[row] = target_row; - } - - // X map - - for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row; - - // Move to CUDA - - cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice); - - // Rearrange rows in w - - dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1); - dim3 blocks - ( - (width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2), - height / 8, - 1 - ); - - make_sequential_kernel<<>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width); - - // Replace qweights - - cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice); - - // Cleanup - - cudaDeviceSynchronize(); - cudaFree(cuda_new_qweight); - free(cpu_g_idx_map); - free(cpu_x_map); - free(cpu_x_map_inv); -} - -__global__ void reconstruct_kernel -( - const uint32_t* __restrict__ w, - half* __restrict__ out, // (y) - const half* __restrict__ w_scales, - const uint32_t* __restrict__ w_zeros, - const int height, - const int width, - const int groupsize -) -{ - // Start of block - - int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x; - int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8; - if (column >= width) return; - - // Views - - MatrixView_q4_column w_(w, height, width); - MatrixView_half_rw out_(out, height, width); - MatrixView_half w_scales_(w_scales, height / groupsize, width); - MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width); - - // Groupsize version - - int group = row / groupsize; - - half w_scale = w_scales_.item(group, column); - uint32_t w_zero = w_zeros_.item(group, column) & 0x0f; - - uint32_t w_read = w_.item_uint32_t(row, column); - half* out_ptr = out_.item_ptr(row, column); - - #pragma unroll - for (int s = 0; s < 32; s += 4) - { - half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale); - *out_ptr = w_item; out_ptr += out_.width; - } -} - -void Q4Matrix::reconstruct(half* out) -{ - dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1); - - dim3 blocks - ( - (width + threads.x - 1) / threads.x, - (height / 8 + threads.y - 1) / threads.y, - 1 - ); - - reconstruct_kernel<<>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize); -} diff --git a/gptqmodel_ext/exllama/cuda_func/q4_matrix.cuh b/gptqmodel_ext/exllama/cuda_func/q4_matrix.cuh deleted file mode 100644 index 50cb72a41..000000000 --- a/gptqmodel_ext/exllama/cuda_func/q4_matrix.cuh +++ /dev/null @@ -1,53 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _q4_matrix_cuh -#define _q4_matrix_cuh - -#include -#include -#include - -class Q4Matrix -{ -public: - - int device; - - int height; - int width; - int groups; - int groupsize; - - uint32_t* cuda_qweight = NULL; - uint32_t* cuda_qzeros = NULL; - half* cuda_scales = NULL; - uint32_t* cuda_x_map = NULL; - - Q4Matrix - ( - const int _height, - const int _width, - const int _groups, - - uint32_t* _qweight, - uint32_t* _qzeros, - half* _scales, - uint32_t* _g_idx, - - const int _device - ); - - ~Q4Matrix(); - - void reconstruct(half* out); - -private: - - void make_sequential(const uint32_t* cpu_g_idx); - -}; - -void g_q4_keep_matrix(Q4Matrix* m); -void g_q4_free_matrices(); - -#endif \ No newline at end of file diff --git a/gptqmodel_ext/exllama/exllama_ext.cpp b/gptqmodel_ext/exllama/exllama_ext.cpp deleted file mode 100644 index 020fee4e1..000000000 --- a/gptqmodel_ext/exllama/exllama_ext.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#include -#include -#include -#include -#include -#include -#include -#include "util.cuh" -#include "tuning.h" -#include "cuda_buffers.cuh" -#include "cuda_func/q4_matrix.cuh" -#include "cuda_func/q4_matmul.cuh" -#include "cuda_func/column_remap.cuh" - -#include -#include -#include - -// Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a -// minute to the compile time on a 12900K. Also passing exceptions back to Python is super tricky, so in place of -// exceptions, CUDA functions return with a cudaError_t which we can parse and dump to the console. - -void check_cuda(cudaError_t ret) -{ - switch (ret) - { - case cudaSuccess: - break; - - case cudaUnspecified: - printf(" **** Unspecified error\n"); - TORCH_CHECK(false, "CUDA error"); - break; - - default: - printf(" **** CUDA error\n"); \ - printf(" **** %s\n", cudaGetErrorString(ret)); \ - TORCH_CHECK(false, "CUDA error"); \ - break; - } -} - -// Some decluttering macros - -#define STRINGIFY_(__x) #__x -#define STRINGIFY(__x) STRINGIFY_(__x) -#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) -#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype) -#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes") -#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes") -#define TORCH_CHECK_SHAPE_MOD(__x, __dim_x, __mod) TORCH_CHECK((__x).size(__dim_x) % __mod == 0, #__x ".shape[" STRINGIFY(__dim_x) "] must be a multiple of " STRINGIFY(__mod)) -#define TORCH_CHECK_BUFFER_SIZE(__buffer, __minimum_size) TORCH_CHECK((__buffer).numel() >= __minimum_size, #__buffer " is too small") - -#define TORCH_CHECK_DEVICE_INDEX(__index) \ -do { \ - TORCH_CHECK(__index >= 0, "no device index"); \ - TORCH_CHECK(__index < CUDA_MAX_DEVICES, "invalid device index"); \ -} while(0) - -#define TORCH_CHECK_QUANT(__w, __w_scales, __w_zeros, __seq_g_idx, __x_map) \ -do { \ - TORCH_CHECK_DTYPE(__w, kInt); \ - TORCH_CHECK_DTYPE(__w_scales, kHalf); \ - TORCH_CHECK_DTYPE(__w_zeros, kInt); \ - TORCH_CHECK_DTYPE_OPT(__seq_g_idx, kShort); \ - TORCH_CHECK_DTYPE_OPT(__x_map, kInt); \ - TORCH_CHECK_SHAPES_OPT(__seq_g_idx, 0, __w, 0, 2 * 8); \ - TORCH_CHECK_SHAPES_OPT(__x_map, 0, __w, 0, 8); \ -} while(0) - -int get_groupsize(torch::Tensor w, torch::Tensor w_zeros) -{ - int groupsize = w.size(0) * 8 / w_zeros.size(0); - TORCH_CHECK(groupsize * w_zeros.size(0) == w.size(0) * 8, "w.shape[-2] must be a multiple of zeros.shape[-2]") - return groupsize; -} - - -// Tuning parameters - -ExLlamaTuning tuningParams; - -void set_tuning_params -( - int matmul_recons_thd, - bool matmul_fused_remap, - bool matmul_no_half2 -) -{ - tuningParams.matmul_recons_thd = matmul_recons_thd; - tuningParams.matmul_fused_remap = matmul_fused_remap; - tuningParams.matmul_no_half2 = matmul_no_half2; -} - - -// Release all unmanaged objects allocated by the extension - -void cleanup() -{ - cleanup_buffers_cuda(); - g_q4_free_matrices(); -} - - -// Prepare buffers for forward pass - -void prepare_buffers -( - torch::Device device, - torch::Tensor temp_state, - torch::Tensor temp_dq -) -{ - int device_index = device.index(); - TORCH_CHECK_DEVICE_INDEX(device_index); - const at::cuda::OptionalCUDAGuard device_guard(device); - const long max_int = std::numeric_limits::max(); - - prepare_buffers_cuda - ( - device_index, - // buffer size used for sanity checks - std::clamp((long)temp_state.numel(), (long)0, max_int), - (half*) temp_state.data_ptr(), - (half*) temp_dq.data_ptr() - ); -} - - -// Create Q4Matrix, return handle - -uintptr_t make_q4 -( - torch::Tensor qweight, - torch::Tensor qzeros, - torch::Tensor scales, - torch::Tensor g_idx, - int device -) -{ - TORCH_CHECK_DTYPE(qweight, kInt); - TORCH_CHECK_DTYPE(qzeros, kInt); - TORCH_CHECK_DTYPE(scales, kHalf); - TORCH_CHECK_DTYPE_OPT(g_idx, kInt); - TORCH_CHECK_SHAPES(qweight, 1, qzeros, 1, 8); - TORCH_CHECK_SHAPES(scales, 1, qweight, 1, 1); - TORCH_CHECK_SHAPES(qzeros, 0, scales, 0, 1); - - int width = qweight.size(1); - int height = qweight.size(0) * 8; - int groups = qzeros.size(0); - - Q4Matrix* m = new Q4Matrix - ( - height, - width, - groups, - - (uint32_t*) qweight.data_ptr(), - (uint32_t*) qzeros.data_ptr(), - (half*) scales.data_ptr(), - g_idx.device().is_meta() ? NULL : (uint32_t*) g_idx.data_ptr(), - - device - ); - - g_q4_keep_matrix(m); - return reinterpret_cast (m); -} - - -// Matmul half @ quant -> half - -void q4_matmul -( - torch::Tensor x, - uintptr_t w, - torch::Tensor out -) -{ - Q4Matrix* wm = reinterpret_cast (w); - - TORCH_CHECK_DTYPE(x, kHalf); - TORCH_CHECK_DTYPE(out, kHalf); - TORCH_CHECK_SHAPES(x, 0, out, 0, 1); - TORCH_CHECK(wm->height == x.size(-1), "x and w have incompatible shapes") - - const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); - - int x_height = x.size(0); - - if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd) - { - q4_matmul_cuda - ( - &tuningParams, - (half*) x.data_ptr(), - x_height, - wm, - (half*) out.data_ptr() - ); - } - else - { - q4_matmul_recons_cuda - ( - &tuningParams, - (half*) x.data_ptr(), - x_height, - wm, - (half*) out.data_ptr(), - at::cuda::getCurrentCUDABlasHandle() - ); - } -} - - -// Remap columns in half tensor - -void column_remap -( - torch::Tensor x, - torch::Tensor x_new, - torch::Tensor x_map -) -{ - TORCH_CHECK_DTYPE(x, kHalf); - TORCH_CHECK_DTYPE(x_new, kHalf); - TORCH_CHECK_DTYPE(x_map, kInt); - TORCH_CHECK_SHAPES(x_map, 0, x, 1, 1); - - int height = x.size(0); - int width = x.size(1); - - TORCH_CHECK_BUFFER_SIZE(x_new, height * width); - - const at::cuda::OptionalCUDAGuard device_guard(device_of(x)); - - column_remap_cuda - ( - (half*) x.data_ptr(), - (half*) x_new.data_ptr(), - height, - width, - (uint32_t*) x_map.data_ptr() - ); -} - - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) -{ - m.def("set_tuning_params", &set_tuning_params, "set_tuning_params"); - m.def("prepare_buffers", &prepare_buffers, "prepare_buffers"); - m.def("cleanup", &cleanup, "cleanup"); - m.def("make_q4", &make_q4, "make_q4"); - m.def("q4_matmul", &q4_matmul, "q4_matmul"); - m.def("cleanup_buffers_cuda", &cleanup_buffers_cuda, "cleanup_buffers_cuda"); -} diff --git a/gptqmodel_ext/exllama/hip_compat.cuh b/gptqmodel_ext/exllama/hip_compat.cuh deleted file mode 100644 index 680274787..000000000 --- a/gptqmodel_ext/exllama/hip_compat.cuh +++ /dev/null @@ -1,53 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _hip_compat_cuh -#define _hip_compat_cuh - -// Workaround for a bug in hipamd, backported from upstream, this is fixed in ROCm 5.6. -__device__ __forceinline__ __half __compat_hrcp(__half x) { - return __half_raw{ - static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))}; -} - -// ROCm 6.0 compatible from: /opt/rocm-6.0.0/include/hip/amd_detail/amd_hip_fp16.h:1708 -__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) { - return _Float16_2{ - _Float16_2{static_cast<_Float16>(1.0f), - static_cast<_Float16>(1.0f)} / x.data}; -} - -#define hrcp __compat_hrcp -#define h2rcp __compat_h2rcp - -// Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf. -__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t handle, - hipblasOperation_t transA, - hipblasOperation_t transB, - int m, - int n, - int k, - const half* alpha, - const half* AP, - int lda, - const half* BP, - int ldb, - const half* beta, - half* CP, - int ldc) { - return hipblasHgemm(handle, transA, transB, m, n, k, - reinterpret_cast(alpha), - reinterpret_cast(AP), lda, - reinterpret_cast(BP), ldb, - reinterpret_cast(beta), - reinterpret_cast(CP), ldc); -} -#define hipblasHgemm __compat_hipblasHgemm - -// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS. -#define rocblas_handle hipblasHandle_t -#define rocblas_operation_none HIPBLAS_OP_N -#define rocblas_get_stream hipblasGetStream -#define rocblas_set_stream hipblasSetStream -#define rocblas_hgemm __compat_hipblasHgemm - -#endif diff --git a/gptqmodel_ext/exllama/matrix.cuh b/gptqmodel_ext/exllama/matrix.cuh deleted file mode 100644 index 2fd5ab0b3..000000000 --- a/gptqmodel_ext/exllama/matrix.cuh +++ /dev/null @@ -1,294 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _matrix_cuh -#define _matrix_cuh - -#include -#include - -class MatrixView_half -{ -public: - const half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); } - __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; } -}; - -class MatrixView_half_rw -{ -public: - half* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; } - __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; } - __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; } - __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; } - __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; } -}; - -class MatrixView_q4_row -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (column & 0x07) * 4; - return (data[row * width / 8 + column / 8] >> shift) & 0x0f; - } -}; - -class MatrixView_q4_column -{ -public: - const uint32_t* data; - const int height; - const int width; - - __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width) - : data(data), height(height), width(width) - { } - - __device__ __forceinline__ int item(int row, int column) const - { - int shift = (row & 0x07) * 4; - return (data[row / 8 * width + column] >> shift) & 0x0f; - } - - __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; } - __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; } -}; - -// TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu - -// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale - -__device__ __forceinline__ half2 dot_product_8 -( - const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half2 v_scale_2, - const uint32_t v_zero, // + 1 (!!) - const int count -) -{ - const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column); - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half2 result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half2 v_01 = __halves2half2(v_0, v_1); - half2 v_23 = __halves2half2(v_2, v_3); - half2 v_45 = __halves2half2(v_4, v_5); - half2 v_67 = __halves2half2(v_6, v_7); - -// half2 v_01 = q4_table[v_zero - 1][(v_read ) & 0xff]; // (constant memory is too slow apparently) -// half2 v_23 = q4_table[v_zero - 1][(v_read >> 8) & 0xff]; -// half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff]; -// half2 v_67 = q4_table[v_zero - 1][(v_read >> 24) ]; - - half2 tmp = __hmul2(*h_ptr++, v_01); - tmp = __hfma2(*h_ptr++, v_23, tmp); - tmp = __hfma2(*h_ptr++, v_45, tmp); - tmp = __hfma2(*h_ptr++, v_67, tmp); - result = __hfma2(v_scale_2, tmp, result); - } - - return result; -} - -__device__ __forceinline__ half dot_product_8_h -( - const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half v_scale, - const uint32_t v_zero, // + 1 (!!) - const int count -) -{ - const half* h_ptr = h_.item_ptr(h_row, h_column); - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half tmp = __hmul(*h_ptr++, v_0); - tmp = __hfma(*h_ptr++, v_1, tmp); - tmp = __hfma(*h_ptr++, v_2, tmp); - tmp = __hfma(*h_ptr++, v_3, tmp); - tmp = __hfma(*h_ptr++, v_4, tmp); - tmp = __hfma(*h_ptr++, v_5, tmp); - tmp = __hfma(*h_ptr++, v_6, tmp); - tmp = __hfma(*h_ptr++, v_7, tmp); - result = __hfma(v_scale, tmp, result); - } - - return result; -} - -// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map - -__device__ __forceinline__ half2 dot_product_8_x_map -( - const half2 acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half2 v_scale_2, - const uint32_t v_zero, // + 1 (!!) - const int count, - const uint32_t* x_map -) -{ - const half* h_ptr = h_.item_ptr(h_row, 0); - const uint32_t* x_map_ptr = x_map + h_column; - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half2 result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half2 v_01 = __halves2half2(v_0, v_1); - half2 v_23 = __halves2half2(v_2, v_3); - half2 v_45 = __halves2half2(v_4, v_5); - half2 v_67 = __halves2half2(v_6, v_7); - - half h_0 = h_ptr[*x_map_ptr++]; - half h_1 = h_ptr[*x_map_ptr++]; - half h_2 = h_ptr[*x_map_ptr++]; - half h_3 = h_ptr[*x_map_ptr++]; - half h_4 = h_ptr[*x_map_ptr++]; - half h_5 = h_ptr[*x_map_ptr++]; - half h_6 = h_ptr[*x_map_ptr++]; - half h_7 = h_ptr[*x_map_ptr++]; - - half2 h_01 = __halves2half2(h_0, h_1); - half2 h_23 = __halves2half2(h_2, h_3); - half2 h_45 = __halves2half2(h_4, h_5); - half2 h_67 = __halves2half2(h_6, h_7); - - half2 tmp = __hmul2(h_01, v_01); - tmp = __hfma2(h_23, v_23, tmp); - tmp = __hfma2(h_45, v_45, tmp); - tmp = __hfma2(h_67, v_67, tmp); - result = __hfma2(v_scale_2, tmp, result); - } - - return result; -} - -__device__ __forceinline__ half dot_product_8_x_map_h -( - const half acc, - MatrixView_half& h_, - const int h_row, - const int h_column, // divisible by 8 - MatrixView_q4_column& v_, - const int v_row, // divisible by 8 - const int v_column, - const half v_scale, - const uint32_t v_zero, // + 1 (!!) - const int count, - const uint32_t* x_map -) -{ - const half* h_ptr = h_.item_ptr(h_row, 0); - const uint32_t* x_map_ptr = x_map + h_column; - const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column); - half result = acc; - - for (int i = 0; i < count; i++) - { - uint32_t v_read = *v_ptr; v_ptr += v_.width; - - half v_0 = __int2half_rn((int)((v_read ) & 0x0f) - v_zero); - half v_1 = __int2half_rn((int)((v_read >> 4) & 0x0f) - v_zero); - half v_2 = __int2half_rn((int)((v_read >> 8) & 0x0f) - v_zero); - half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero); - half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero); - half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero); - half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero); - half v_7 = __int2half_rn((int)((v_read >> 28) ) - v_zero); - - half tmp = __hmul(h_ptr[*x_map_ptr++], v_0); - tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp); - tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp); - result = __hfma(v_scale, tmp, result); - } - - return result; -} - -#endif diff --git a/gptqmodel_ext/exllama/tuning.h b/gptqmodel_ext/exllama/tuning.h deleted file mode 100644 index 770ca46aa..000000000 --- a/gptqmodel_ext/exllama/tuning.h +++ /dev/null @@ -1,13 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _tuning_h -#define _tuning_h - -struct ExLlamaTuning -{ - int matmul_recons_thd; - bool matmul_fused_remap; - bool matmul_no_half2; -}; - -#endif diff --git a/gptqmodel_ext/exllama/util.cuh b/gptqmodel_ext/exllama/util.cuh deleted file mode 100644 index 7b3975732..000000000 --- a/gptqmodel_ext/exllama/util.cuh +++ /dev/null @@ -1,33 +0,0 @@ -// Adapted from turboderp exllama: https://github.com/turboderp/exllama - -#ifndef _util_cuh -#define _util_cuh - -#include -#include -#include -#include - -#if defined(USE_ROCM) -#define cudaUnspecified hipErrorUnknown -#else -#define cudaUnspecified cudaErrorApiFailureBase -#endif - -// React to failure on return code != cudaSuccess - -#define _cuda_check(fn) \ -do { \ - {_cuda_err = fn;} \ - if (_cuda_err != cudaSuccess) goto _cuda_fail; \ -} while(false) - -// React to failure on return code == 0 - -#define _alloc_check(fn) \ -do { \ - if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \ - else _cuda_err = cudaSuccess; \ -} while(false) - -#endif diff --git a/gptqmodel_ext/exllama_eora/test_actual_value.py b/gptqmodel_ext/exllama_eora/test_actual_value.py deleted file mode 100644 index 8d2291df9..000000000 --- a/gptqmodel_ext/exllama_eora/test_actual_value.py +++ /dev/null @@ -1,276 +0,0 @@ -# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai -# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai -# SPDX-License-Identifier: Apache-2.0 -# Contact: qubitium@modelcloud.ai, x.com/qubitium - -import math - -import gptqmodel_exllama_eora -import torch -# from eora_test import fused_concurrent, fused_sequential, cublas_reference, gptq_gemm_eora, gptq_gemm -from gptqmodel_exllama_eora import gptq_gemm, gptq_gemm_eora -from gptqmodel_exllama_kernels import make_q4, q4_matmul -from safetensors import safe_open - -# model_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/" -# lora_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/llama3.2-1b-4bit-group128-eora-rank128-arc/adapter_model.safetensors" - -target = 'model.layers.6.self_attn.q_proj' -eora_tensors = {} -with safe_open("/mnt/home/shihyangl/llama3.2-1b-4bit-group128-eora-rank128-arc/adapter_model.safetensors", framework="pt", device=0) as f: - for k in f.keys(): - if target in k: - eora_tensors[k] = f.get_tensor(k) -# print(eora_tensors) - -qw_tensors = {} -with safe_open("/mnt/home/shihyangl/llama3.2-1b-4bit-group128/model.safetensors", framework="pt", device=0) as f: - for k in f.keys(): - if target in k: - qw_tensors[k] = f.get_tensor(k) - - - - -m = 1 -k = eora_tensors[f'{target}.lora_A.weight'].shape[1] -n = eora_tensors[f'{target}.lora_B.weight'].shape[0] -r = 128 - - - -bit = 4 -use_exllama = True - -x = torch.rand((m, k), device='cuda', dtype=torch.float16) -eora_a = eora_tensors[f'{target}.lora_A.weight'].to('cuda:0').T -eora_b = torch.clone(eora_tensors[f'{target}.lora_B.weight'].T, memory_format=torch.contiguous_format) -# torch.zeros((r, n),device='cuda' -# eora_b.data = torch.transpose(eora_tensors[f'{target}.lora_B.weight'], 0, 1) -# eora_b = eora_tensors[f'{target}.lora_B.weight'].to('cuda:0').T - - - -# print(eora_b) -# print(eora_b) - -# eora_b = torch.rand((r, n), device='cuda', dtype=torch.float16) / (100 * 4) -# eora_b = torch.normal(-2.7120113372802734e-05, 0.0248565673828125, size=(r, n), device='cuda', dtype=torch.float16) - -# eora_b = torch.normal(-2.7120113372802734e-05, 0.0248565673828125, size=(n, r), device='cuda', dtype=torch.float16).T - -# eora_b.data = torch.normal(-2.7120113372802734e-05, 0.0248565673828125, size=(n, r), device='cuda', dtype=torch.float16).T - - -# eora_b[[0,2,4,6,8],:] = 0 -# eora_b_mean = eora_b.mean() -# eora_b_std = eora_b.std() -# sample_range = 2040 -# sample_idx = torch.randint(0,2048,(1,sample_range)).flatten() -# eora_b[:,sample_idx] = 0 - -# list_range = list(range(1,128)) -# eora_b[list_range,:] = 0 -# print(eora_b) - - -# print(eora_b.shape) -# print(eora_b) -# print(eora_b) -# print(f"eora_b max {eora_b.max()}") -# print(f"eora_b max {eora_b.min()}") -# print(f"eora_b mean {eora_b.mean()}") -# print(f"eora_b std {eora_b.std()}") - -# real_eora_a = eora_tensors[f'{target}.lora_A.weight'].to('cuda:0').T -# real_eora_b = eora_tensors[f'{target}.lora_B.weight'].to('cuda:0').T -# print(f"real eora_a max {real_eora_a.max()}") -# print(f"real eora_a min {real_eora_a.min()}") -# print(f"real eora_a mean {real_eora_a.mean()}") -# print(f"real eora_a std {real_eora_a.std()}") -# print(f"real eora_b max {real_eora_b.max()}") -# print(f"real eora_b min {real_eora_b.min()}") -# print(f"real eora_b mean {real_eora_b.mean()}") -# print(f"real eora_b std {real_eora_b.std()}") - -# eora_a = torch.randn((k, r), device='cuda', dtype=torch.float16) / (100 * 4) -# eora_b = torch.randn((r, n), device='cuda', dtype=torch.float16) / (5) - -# eora_a = torch.rand((k, r), device='cuda', dtype=torch.float16) / (100 * 4) -# eora_b = torch.rand((r, n), device='cuda', dtype=torch.float16) / (100 * 4) -# print(f"dummy eora_a max {eora_a.max()}") -# print(f"dummy eora_a max {eora_a.min()}") -# print(f"dummy eora_a mean {eora_a.mean()}") -# print(f"dummy eora_a std {eora_a.std()}") - -# print(f"dummy eora_b max {eora_b.max()}") -# print(f"dummy eora_b max {eora_b.min()}") -# print(f"dummy eora_b mean {eora_b.mean()}") -# print(f"dummy eora_b std {eora_b.std()}") - -gptq_groups = 128 -# weight = qw_tensors[f'{target}.qweight'].to('cuda:0') -# zeros = qw_tensors[f'{target}.qzeros'].to('cuda:0') -# scales = qw_tensors[f'{target}.scales'].to('cuda:0') -# idx = qw_tensors[f'{target}.g_idx'].to('cuda:0') - -weight = torch.zeros_like(qw_tensors[f'{target}.qweight'], device='cuda', dtype=torch.int32) -zeros = torch.zeros_like(qw_tensors[f'{target}.qzeros'], device='cuda', dtype=torch.int32) -scales = torch.zeros_like(qw_tensors[f'{target}.scales'], device='cuda', dtype=torch.float16) -idx = qw_tensors[f'{target}.g_idx'].to('cuda:0') - - -pack_dtype_bits = 32 -bits = 4 - -pack_factor = pack_dtype_bits // bits -wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0).to("cuda:0") -maxq = 2 ** bits - 1 -num_itr = idx.shape[0] // 2048 -def dequantize_weight(bits, wf, qweight, qzeros, maxq, scales, g_idx, dequant_dtype = torch.int8, num_itr: int=1): - if bits in [2, 4, 8]: - zeros = torch.bitwise_right_shift( - torch.unsqueeze(qzeros, 2).expand(-1, -1, pack_factor), - wf.unsqueeze(0), - ).to(dequant_dtype) - zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape) - - weight = torch.bitwise_and( - torch.bitwise_right_shift( - torch.unsqueeze(qweight, 1).expand(-1, pack_factor, -1), - wf.unsqueeze(-1), - ).to(dequant_dtype), - maxq - ) - elif bits == 3: - zeros = qzeros.reshape(qzeros.shape[0], qzeros.shape[1] // 3, 3, 1).expand( - -1, -1, -1, 12 - ) - zeros = zeros >> wf.unsqueeze(0) - zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4) - zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6) - zeros = zeros & 0x7 - zeros = torch.cat( - [zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]], - dim=2, - ).reshape(scales.shape) - - weight = qweight.reshape(qweight.shape[0] // 3, 3, 1, qweight.shape[1]).expand( - -1, -1, 12, -1 - ) - weight = (weight >> wf.unsqueeze(-1)) & 0x7 - weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4) - weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6) - weight = weight & 0x7 - weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1) - weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2]) - - if num_itr == 1: - weights = scales[g_idx.long()] * (weight - zeros[g_idx.long()]) - else: - num_dim = g_idx.shape[0] // num_itr - weights = [] - for i in range(num_itr): - scale_i = scales[:, i * num_dim: (i + 1) * num_dim] - weight_i = weight[:, i * num_dim: (i + 1) * num_dim] - zeros_i = zeros[:, i * num_dim: (i + 1) * num_dim] - g_idx_i = g_idx[i * num_dim: (i + 1) * num_dim].long() - weights.append(scale_i[g_idx_i] * (weight_i - zeros_i[g_idx_i])) - weights = torch.cat(weights, dim=1) - - return weights - -def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor, - bit: int) -> None: - gptqmodel_exllama_eora.gptq_shuffle(q_weight, q_perm, bit) - -## exllama GPTQModel -def exllama_output( x , pack_dtype_bits, bits, group_size, qweight, qzeros, scales,g_idx, out_features, in_features ): - NON_TENSOR = torch.empty((1, 1), device="meta") - def ext_make_q4(qweight, qzeros, scales, g_idx, device): - """Construct Q4Matrix, return handle""" - return make_q4(qweight, qzeros, scales, g_idx if g_idx is not None else NON_TENSOR, device) - - - def ext_q4_matmul(x, q4, q4_width): - """Matrix multiplication, returns x @ q4""" - outshape = x.shape[:-1] + (q4_width,) - x = x.view(-1, x.shape[-1]) - output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device) - - q4_matmul(x, q4, output) - - return output.view(outshape) - - original_out_features = out_features - original_in_features = in_features - - # auto pad - group_size = group_size if group_size != -1 else in_features - out_features = out_features + (-out_features % 32) - in_features = in_features + (-in_features % group_size) - in_features_padding_size = in_features - original_in_features - in_features_padding_shape = (0, in_features_padding_size) - - if out_features != original_out_features or in_features != original_in_features: - qweight.resize_(in_features // pack_dtype_bits * bits, out_features) - qzeros.resize_( - math.ceil(in_features / group_size), - out_features // pack_dtype_bits * bits - ) - scales.resize_((math.ceil(in_features / group_size), out_features), ) - g_idx = torch.tensor([i // group_size for i in range(in_features)], dtype=torch.int32, device=g_idx.device) - - width = qweight.shape[1] - - # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx. - q4 = ext_make_q4( - qweight, - qzeros, - scales, - None, - qweight.device.index, - ) - - out = ext_q4_matmul(x, q4, width) - - return out - - - -ax = x @ eora_a -def test_eora_kernel(): - - # zeros_copy = zeros.clone() + 0b00010001000100010001000100010001 - # exllama_out = exllama_output( x , pack_dtype_bits, bits, group_size=gptq_groups, qweight = weight, qzeros = zeros_copy, scales = scales,g_idx =idx , out_features = n, in_features = k) - # exllama_out = exllama_out + (ax @ eora_b) - - # deq_weight = dequantize_weight(bits=4, wf = wf, qweight=weight, qzeros=zeros_copy, maxq=maxq, scales=scales, g_idx=idx, dequant_dtype=torch.int8, num_itr=num_itr) - # torch_kernel_out = torch.matmul(x, deq_weight).reshape(m,512) + (ax @ eora_b) - - idx.data = torch.argsort(idx).to(torch.int32) - gptq_shuffle(weight, idx, bits) - - ## I confirmed this part to be identical to that of test_gptq.py - - - out_shape = x.shape[:-1] + (weight.shape[-1],) - reshaped_x = x.reshape(-1, x.shape[-1]) - - gptq_pytorch_out = gptq_gemm(reshaped_x, weight, zeros, scales, idx, use_exllama, bit) + (ax @ eora_b) - - gptq_eora_fused_out = gptq_gemm_eora(reshaped_x, weight, zeros, scales, idx, use_exllama, bit, ax, eora_b) - torch.set_printoptions(precision=6) - # print("gptq exllama kernel out: ") - # print(exllama_out[0][:10]) - # print("gptq torch kernel out: ") - # print(torch_kernel_out[0][:10]) - # I want this to match the above two output - print("vllm exllama_pytorch_out: ") - print(gptq_pytorch_out[0][:10]) - - print("vllm exllama_eora_fused_out: ") - print(gptq_eora_fused_out[0][:10]) - torch.testing.assert_close(gptq_pytorch_out, gptq_eora_fused_out, rtol=0.05, atol=0.5) # 5 % relative tolerance, 0.5 absolute tolerance - -test_eora_kernel() \ No newline at end of file diff --git a/setup.py b/setup.py index 0ecdb7e88..92acb90ca 100644 --- a/setup.py +++ b/setup.py @@ -568,7 +568,6 @@ def _env_enabled_any(names, default="1") -> bool: # Optional kernels and not build by default. Enable compile with env flags BUILD_EORA = _env_enabled(os.environ.get("GPTQMODEL_BUILD_EORA", "0")) -BUILD_EXLLAMA_V1 = _env_enabled(os.environ.get("GPTQMODEL_BUILD_EXLLAMA_V1", "0")) if BUILD_CUDA_EXT == "1": # Import torch's cpp_extension only if we're truly building GPU extensions @@ -816,23 +815,6 @@ def _hipify_compile_flags(flags): ) ] - # both CUDA and ROCm compatible - if BUILD_EXLLAMA_V1: - extensions += [ - cpp_ext.CUDAExtension( - "gptqmodel_exllama_kernels", - [ - "gptqmodel_ext/exllama/exllama_ext.cpp", - "gptqmodel_ext/exllama/cuda_buffers.cu", - "gptqmodel_ext/exllama/cuda_func/column_remap.cu", - "gptqmodel_ext/exllama/cuda_func/q4_matmul.cu", - "gptqmodel_ext/exllama/cuda_func/q4_matrix.cu", - ], - extra_link_args=extra_link_args, - extra_compile_args=extra_compile_args, - ) - ] - if BUILD_AWQ: if ROCM_VERSION: print("Skipping AWQ kernels on ROCm: inline PTX is CUDA-only.") diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py index 8e671f427..212e0e20a 100644 --- a/tests/kernels/test_awq.py +++ b/tests/kernels/test_awq.py @@ -34,7 +34,6 @@ AwqGEMMTritonQuantLinear = None # type: ignore[assignment] awq_triton_import_exception = exc -from gptqmodel.nn_modules.qlinear.exllama_awq import AwqExllamaQuantLinear from gptqmodel.nn_modules.qlinear.exllamav2_awq import AwqExllamaV2QuantLinear from gptqmodel.utils.exllamav2 import ScratchSpace @@ -72,7 +71,6 @@ class TestAwqKernelOutput(unittest.TestCase): (BACKEND.MARLIN, torch.float16, 0.006), (BACKEND.TORCH_FUSED_AWQ, torch.float16, 0.004), # (BACKEND.MARLIN, torch.bfloat16, 0.05), - (BACKEND.EXLLAMA_V1, torch.float16, 0.006), (BACKEND.EXLLAMA_V2, torch.float16, 0.0068), ] @@ -87,7 +85,6 @@ def setUpClass(cls) -> None: cls.backend_skip_reason[BACKEND.GEMM] = "CUDA is required for GEMM backend." cls.backend_skip_reason[BACKEND.TRITON] = "CUDA is required for AWQ Triton backend." cls.backend_skip_reason[BACKEND.MARLIN] = "CUDA is required for AWQ Marlin kernel." - cls.backend_skip_reason[BACKEND.EXLLAMA_V1] = "CUDA is required for ExLlama v1 AWQ kernel." cls.backend_skip_reason[BACKEND.EXLLAMA_V2] = "CUDA is required for ExLlama v2 AWQ kernel." if awq_triton_import_exception is not None: cls.backend_skip_reason[BACKEND.TRITON] = ( @@ -141,12 +138,6 @@ def setUpClass(cls) -> None: else None ) - cls.modules[BACKEND.EXLLAMA_V1] = ( - cls._build_exllama_v1_module(qweight_cpu, qzeros_cpu, scales_cpu, bias_cpu) - if cls.cuda_available - else None - ) - cls.modules[BACKEND.EXLLAMA_V2] = ( cls._build_exllama_v2_module(qweight_cpu, qzeros_cpu, scales_cpu, bias_cpu) if cls.cuda_available @@ -350,42 +341,6 @@ def _build_torch_awq_module( module.post_init() return module - @classmethod - def _build_exllama_v1_module( - cls, - qweight_cpu: torch.Tensor, - qzeros_cpu: torch.Tensor, - scales_cpu: torch.Tensor, - bias_cpu: torch.Tensor, - ) -> Optional[AwqExllamaQuantLinear]: - try: - module = AwqExllamaQuantLinear( - bits=cls.BITS, - group_size=cls.GROUP_SIZE, - sym=True, - desc_act=False, - in_features=cls.in_features, - out_features=cls.out_features, - bias=True, - adapter=None, - register_buffers=True, - ).to(cls.device) - - module.qweight.copy_(qweight_cpu.to(cls.device)) - module.qzeros.copy_(qzeros_cpu.to(cls.device)) - module.scales.copy_(scales_cpu.to(torch.float16).to(cls.device)) - module.bias.copy_(bias_cpu.to(torch.float16).to(cls.device)) - - module.eval() - module.post_init() - return module - except Exception as exc: - cls.backend_skip_reason[BACKEND.EXLLAMA_V1] = ( - f"ExLlama v1 AWQ kernel unavailable: {exc}" - ) - return None - - @classmethod def _build_exllama_v2_module( cls, qweight_cpu: torch.Tensor, diff --git a/tests/test_bits.py b/tests/test_bits.py index 8f1ae4d5f..20ababc53 100644 --- a/tests/test_bits.py +++ b/tests/test_bits.py @@ -20,7 +20,6 @@ from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 -from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 @@ -35,7 +34,6 @@ class TestBits(unittest.TestCase): QLINEAR_DICT = { - BACKEND.EXLLAMA_V1: ExllamaQuantLinear, BACKEND.EXLLAMA_V2: ExllamaV2QuantLinear, BACKEND.TRITON: TritonV2QuantLinear, BACKEND.TORCH: TorchQuantLinear, @@ -69,11 +67,6 @@ def check_results(self, bits: int, task_results): @classmethod def setUpClass(cls): - # cls.pack_backends = [BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.TORCH, BACKEND.BITBLAS, - # BACKEND.IPEX] - # cls.backends = list(cls.pack_backends) - # cls.backends.extend([BACKEND.EXLLAMA_V2, BACKEND.MARLIN, ]) - # TODO Only CUDA Quant Linear is tested for now cls.pack_backends = [BACKEND.TRITON] cls.backends = [BACKEND.MARLIN] diff --git a/tests/test_bits_new.py b/tests/test_bits_new.py index b9951f383..fe4a50520 100644 --- a/tests/test_bits_new.py +++ b/tests/test_bits_new.py @@ -167,7 +167,6 @@ def test_quant_and_eora(self): del model torch_empty_cache() - # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, for backend in [ BACKEND.TORCH ]: # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN base_bench = bench(path=save_path, backend=backend, adapter=None) # inference using qweights only # eora_bench = bench(path=tmpdir, backend=backend, adapter=eora) # inference using eora (lora) diff --git a/tests/test_group_size.py b/tests/test_group_size.py index 32380ece0..4f96bb97c 100644 --- a/tests/test_group_size.py +++ b/tests/test_group_size.py @@ -19,7 +19,6 @@ from gptqmodel import BACKEND, GPTQModel, QuantizeConfig # noqa: E402 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear # noqa: E402 -from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear # noqa: E402 @@ -34,7 +33,6 @@ class TestGroupSize(unittest.TestCase): QLINEAR_DICT = { - BACKEND.EXLLAMA_V1: ExllamaQuantLinear, BACKEND.EXLLAMA_V2: ExllamaV2QuantLinear, BACKEND.TRITON: TritonV2QuantLinear, BACKEND.TORCH: TorchQuantLinear, @@ -45,7 +43,7 @@ class TestGroupSize(unittest.TestCase): @classmethod def setUpClass(cls): - cls.pack_backends = [BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.TORCH, BACKEND.BITBLAS] + cls.pack_backends = [BACKEND.TRITON, BACKEND.TORCH, BACKEND.BITBLAS] cls.backends = list(cls.pack_backends) cls.backends.extend([BACKEND.EXLLAMA_V2, BACKEND.MARLIN, ]) diff --git a/tests/test_inference_speed.py b/tests/test_inference_speed.py index f0128098f..8731381ae 100644 --- a/tests/test_inference_speed.py +++ b/tests/test_inference_speed.py @@ -23,7 +23,6 @@ (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.MARLIN, 748), (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.CUDA, 493), -(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V1, 717), (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V2, 775), (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TRITON, 296), (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 295), @@ -41,7 +40,6 @@ class TestInferenceSpeed(InferenceSpeed): (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.MARLIN, 286.74, False, False), (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 176.00, False, False), # (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 53, False, False), - (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V1, 282.64, False, False), (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V2, 290.60, False, False), (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TRITON, 239.58, False, False), (InferenceSpeed.BITBLAS_NATIVE_MODEL_ID, BACKEND.BITBLAS, 2167.38, False, False), # Second time running bitblas, there is cache diff --git a/tests/test_lora.py b/tests/test_lora.py index 270403508..dd4b546a1 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -48,7 +48,6 @@ def setUpClass(cls): #BACKEND.TORCH, # BACKEND.CUDA, # BACKEND.TRITON, - # BACKEND.EXLLAMA_V1, # BACKEND.EXLLAMA_V2, BACKEND.MARLIN, # # (BACKEND.IPEX), <-- not tested yet diff --git a/tests/test_packable.py b/tests/test_packable.py index 6061e357e..c84f4a2c8 100644 --- a/tests/test_packable.py +++ b/tests/test_packable.py @@ -13,7 +13,6 @@ from safetensors.torch import load_file from gptqmodel import BACKEND, GPTQModel -from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear # noqa: E402 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear # noqa: E402 @@ -28,7 +27,6 @@ class TestPackable(unittest.TestCase): QLINEAR_DICT = { BACKEND.EXLLAMA_EORA: ExllamaEoraQuantLinear, - BACKEND.EXLLAMA_V1: ExllamaQuantLinear, BACKEND.EXLLAMA_V2: ExllamaV2QuantLinear, BACKEND.TRITON: TritonV2QuantLinear, BACKEND.TORCH: TorchQuantLinear, @@ -54,7 +52,6 @@ def setUpClass(cls): @parameterized.expand( [ (BACKEND.EXLLAMA_EORA, {"qweight": False, "qzeros": True, "scales": True, "g_idx": False}), - (BACKEND.EXLLAMA_V1, {"qweight": True, "qzeros": True, "scales": True, "g_idx": True}), (BACKEND.EXLLAMA_V2, {"qweight": False, "qzeros": True, "scales": True, "g_idx": True}), (BACKEND.TRITON, {"qweight": True, "qzeros": True, "scales": True, "g_idx": True}), (BACKEND.TORCH, {"qweight": True, "qzeros": True, "scales": True, "g_idx": True}), diff --git a/tests/test_packing_speed.py b/tests/test_packing_speed.py index 57715e597..7d1d3318e 100644 --- a/tests/test_packing_speed.py +++ b/tests/test_packing_speed.py @@ -159,7 +159,6 @@ def _time_pack_impl(self, qlinearCls, backend, impl: str, repeats: int, threads: @parameterized.expand( [ - # [ExllamaQuantLinear, BACKEND.EXLLAMA, 9.63], # [TritonV2QuantLinear, BACKEND.TRITON, 9.67], [TorchQuantLinear, BACKEND.TORCH, 21.05], # A100 Z3 33.56 # 4090? 27.0297 ] @@ -178,7 +177,6 @@ def test_pack_speed_single_thread(self, qlinearCls, backend, expect_time): @parameterized.expand( [ - # [ExllamaQuantLinear, BACKEND.EXLLAMA, 9.63], # [TritonV2QuantLinear, BACKEND.TRITON, 9.67], [TorchQuantLinear, BACKEND.TORCH, 14.71], # A100 Z3 33.56 # 4090? 27.0297 ] diff --git a/tests/test_post_quant_eora.py b/tests/test_post_quant_eora.py index aa6d8c73e..7a1d039c4 100644 --- a/tests/test_post_quant_eora.py +++ b/tests/test_post_quant_eora.py @@ -107,7 +107,6 @@ def test_post_quant_eora(self): calibration_dataset_concat_size=calibration_dataset_concat_size, ) - # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, # for backend in [BACKEND.MARLIN]: # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN # base_bench = bench(path=self.QUANTIZED_MODEL_PATH, backend=backend, adapter=None) # inference using qweights only # eora_bench = bench(path=self.QUANTIZED_MODEL_PATH, backend=backend, adapter=eora) # inference using eora (lora) diff --git a/tests/test_q4_exllama_v1.py b/tests/test_q4_exllama_v1.py deleted file mode 100644 index d1570c76e..000000000 --- a/tests/test_q4_exllama_v1.py +++ /dev/null @@ -1,1208 +0,0 @@ -# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai -# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai -# SPDX-License-Identifier: Apache-2.0 -# Contact: qubitium@modelcloud.ai, x.com/qubitium - -# -- do not touch -import os - - -os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" -# -- end do not touch - - -import torch # noqa: E402 -from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params # noqa: E402 -from models.model_test import ModelTest # noqa: E402 -from transformers import AutoTokenizer # noqa: E402 - -from gptqmodel import BACKEND, GPTQModel, exllama_set_max_input_length # noqa: E402 -from gptqmodel.models._const import DEVICE, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH # noqa: E402 -from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear # noqa: E402 -from gptqmodel.quantization import FORMAT, METHOD # noqa: E402 -from gptqmodel.utils.importer import select_quant_linear # noqa: E402 -from gptqmodel.utils.model import gptqmodel_post_init # noqa: E402 - - -REFERENCE = torch.Tensor( - [ - 5.8398, - 6.8555, - 7.2734, - 6.4219, - 6.2070, - 5.8203, - 6.5664, - 6.4219, - 6.2148, - 5.3281, - 5.7578, - 7.5312, - 8.1016, - 6.1133, - 7.2031, - 6.6484, - 6.5156, - 6.0117, - 6.0312, - 6.1914, - 6.2109, - 6.8125, - 5.8125, - 7.1172, - 7.3125, - 6.7305, - 5.9961, - 6.5117, - 6.1914, - 5.9648, - 7.1680, - 6.4766, - 7.2070, - 6.5469, - 6.7734, - 6.4219, - 6.8086, - 7.0469, - 5.9297, - 6.4727, - 6.2539, - 5.9570, - 7.2383, - 5.8945, - 6.0820, - 5.7969, - 7.1094, - 6.2188, - 6.7500, - 7.3555, - 6.2930, - 6.7734, - 5.9219, - 7.4805, - 6.8750, - 6.4102, - 6.5898, - 6.5469, - 7.6016, - 6.7461, - 5.9492, - 7.2227, - 5.8164, - 5.4570, - 6.2930, - 7.3984, - 6.0938, - 7.3984, - 5.9609, - 6.3516, - 6.5664, - 5.7969, - 7.1250, - 6.0781, - 6.7930, - 5.9492, - 6.1641, - 6.5898, - 6.0586, - 6.3359, - 6.7930, - 7.0469, - 6.0664, - 6.3320, - 5.4414, - 6.7617, - 5.1641, - 7.2891, - 6.8516, - 6.5312, - 5.6914, - 7.3711, - 6.8203, - 5.9492, - 7.0781, - 6.3164, - 7.1992, - 7.1133, - 7.4219, - 7.5586, - 7.1836, - 6.9102, - 6.4844, - 6.9805, - 6.1953, - 6.5156, - 5.4844, - 6.6602, - 6.6719, - 7.9844, - 6.4727, - 6.6367, - 6.2227, - 6.4531, - 5.0625, - 6.4609, - 6.7031, - 6.6445, - 6.5234, - 6.8633, - 6.6055, - 5.6055, - 6.4453, - 7.2617, - 6.3945, - 6.6367, - 6.1055, - 7.0664, - 6.0820, - 6.6875, - 6.1445, - 6.8672, - 6.2070, - 6.8828, - 6.1484, - 6.7070, - 6.8516, - 6.2734, - 7.1055, - 7.0586, - 6.9648, - 5.9727, - 6.1016, - 6.8750, - 7.0078, - 7.1523, - 5.7383, - 5.9531, - 6.5508, - 7.5352, - 6.1602, - 6.2578, - 6.3906, - 5.7383, - 6.7031, - 5.7344, - 6.3516, - 5.2852, - 7.5312, - 6.4531, - 6.6406, - 6.2266, - 6.1094, - 5.9102, - 5.7617, - 6.3789, - 7.0508, - 6.3750, - 6.3320, - 6.8555, - 6.7266, - 7.0352, - 7.7695, - 6.3984, - 6.5039, - 6.8320, - 6.1602, - 6.0312, - 6.3828, - 6.9023, - 7.4336, - 7.3711, - 6.1016, - 7.0703, - 6.3281, - 6.8281, - 6.4922, - 5.9453, - 5.1016, - 6.7188, - 6.1406, - 6.6289, - 7.2695, - 6.2070, - 6.7070, - 7.2930, - 7.1836, - 6.3828, - 6.1992, - 6.7070, - 7.8008, - 7.7773, - 5.6602, - 7.0273, - 6.6172, - 6.0898, - 5.3516, - 7.3359, - 5.9727, - 6.0078, - 7.0586, - 6.3086, - 6.8555, - 7.2617, - 7.3477, - 6.3828, - 7.1133, - 6.6328, - 7.3516, - 6.9141, - 7.2031, - 6.9805, - 6.1719, - 6.7812, - 8.3047, - 6.5898, - 6.3633, - 6.2539, - 7.2773, - 6.5938, - 6.4141, - 6.8203, - 6.8906, - 7.8828, - 5.9609, - 6.4180, - 7.3984, - 5.7539, - 7.1758, - 6.6641, - 6.9062, - 6.2578, - 7.5508, - 6.1719, - 6.5742, - 5.9375, - 6.7891, - 6.2109, - 6.5039, - 6.8750, - 6.2031, - 6.8828, - 7.1094, - 5.9570, - 7.2969, - 6.6797, - 6.8828, - 5.5430, - 6.9648, - 5.8398, - 6.5430, - 6.3945, - 6.5664, - 5.8086, - 6.6172, - 7.0586, - 6.8867, - 6.0820, - 5.8125, - 6.7070, - 7.5742, - 6.2578, - 6.1328, - 6.5391, - 5.4531, - 6.8242, - 6.6953, - 6.8008, - 6.3398, - 6.4805, - 7.2266, - 6.3281, - 6.6875, - 6.4688, - 5.9414, - 7.4297, - 5.8711, - 6.0625, - 5.8750, - 6.5664, - 5.8867, - 6.3477, - 6.1133, - 6.9453, - 5.0547, - 6.7812, - 6.4922, - 7.2422, - 5.4688, - 6.2109, - 7.2148, - 6.1758, - 5.9297, - 7.1953, - 5.5195, - 6.3203, - 5.9961, - 7.9297, - 6.2695, - 6.4414, - 6.7266, - 7.1875, - 7.3203, - 5.4062, - 6.0625, - 7.0898, - 5.3828, - 5.6133, - 6.0742, - 6.6836, - 5.7109, - 7.2852, - 7.7539, - 7.5820, - 6.4258, - 5.9336, - 6.3750, - 6.3555, - 7.5469, - 6.2539, - 6.5898, - 6.4102, - 7.0469, - 5.7344, - 7.2031, - 6.7969, - 5.6836, - 7.6523, - 6.9297, - 7.8672, - 6.4766, - 6.3008, - 7.0977, - 6.5430, - 7.0938, - 5.8398, - 6.9883, - 6.5312, - 6.3203, - 6.3594, - 5.4062, - 6.9688, - 5.7930, - 6.3164, - 6.5547, - 7.1992, - 5.8750, - 6.3008, - 6.7930, - 6.0391, - 7.4766, - 6.6094, - 6.5625, - 5.9805, - 6.2422, - 7.2109, - 6.6875, - 5.3047, - 7.6211, - 5.9453, - 6.5625, - 6.1641, - 6.1250, - 6.5977, - 7.7422, - 7.0742, - 5.6875, - 6.2656, - 6.6250, - 6.8945, - 5.7070, - 6.3203, - 5.7500, - 6.2695, - 6.2773, - 6.8516, - 6.4883, - 7.0000, - 6.7578, - 6.1875, - 5.9844, - 5.5703, - 6.7188, - 5.5273, - 5.3438, - 7.2500, - 6.7852, - 6.5195, - 6.8125, - 6.0664, - 6.7852, - 7.0000, - 7.0781, - 6.8477, - 7.2930, - 6.3438, - 7.1523, - 6.3281, - 6.8047, - 7.3203, - 5.3359, - 6.1484, - 6.5586, - 7.3828, - 6.2344, - 7.1523, - 6.4102, - 5.5898, - 7.0195, - 7.1172, - 5.8008, - 6.5742, - 6.2891, - 8.0312, - 6.9023, - 6.5898, - 7.1953, - 6.7266, - 6.0078, - 5.5430, - 6.4766, - 6.4258, - 5.9648, - 8.0859, - 5.0547, - 7.2188, - 7.4375, - 6.5156, - 5.9922, - 6.3281, - 6.2852, - 6.7734, - 6.2461, - 6.9805, - 5.4648, - 5.8867, - 6.8242, - 6.3008, - 6.3281, - 7.3047, - 7.1836, - 6.5195, - 6.6328, - 6.7188, - 5.4336, - 6.5078, - 5.3477, - 5.5508, - 7.3125, - 5.8750, - 6.5195, - 6.2383, - 6.3594, - 6.0898, - 6.4141, - 5.9844, - 6.6250, - 7.7109, - 6.0391, - 7.2344, - 5.9453, - 5.9453, - 7.0586, - 5.6641, - 7.2773, - 6.5195, - 7.2227, - 6.3359, - 5.3203, - 6.4375, - 7.2383, - 6.4023, - 6.2148, - 7.3750, - 5.8164, - 6.2109, - 6.5430, - 5.8164, - 6.1680, - 6.7656, - 6.0820, - 6.1094, - 6.5312, - 6.8906, - 6.8320, - 6.1289, - 6.3125, - 7.6797, - 6.3008, - 6.0000, - 7.3320, - 6.7852, - 6.9297, - 6.6328, - 6.2266, - 5.1602, - 6.2031, - 7.0547, - 5.9492, - 6.0703, - 6.0977, - 6.8086, - 6.0742, - 6.0195, - 7.0625, - 6.5781, - 5.7461, - 6.1562, - 7.0430, - 6.7148, - 6.5312, - 6.5820, - 6.4570, - 7.5508, - 5.6289, - 6.0547, - 6.5000, - 7.3125, - 5.8477, - 5.9297, - 6.2578, - 6.0078, - 5.9922, - 7.3398, - 7.4922, - 7.8906, - 7.5547, - 5.4648, - 6.5156, - 6.3242, - 6.1094, - 6.9219, - 6.7227, - 6.6836, - 7.4023, - 5.9648, - 7.2383, - 6.7695, - 6.6797, - 7.0547, - 6.3047, - 6.4688, - 6.9961, - 6.0391, - 5.9727, - 6.8398, - 6.7422, - 5.7656, - 5.4766, - 6.7852, - 7.0820, - 5.3516, - 7.6523, - 5.1562, - 6.6445, - 6.1211, - 6.2695, - 6.0703, - 6.3594, - 6.4062, - 6.3398, - 5.7578, - 6.5391, - 6.2500, - 6.5742, - 6.5000, - 7.5625, - 7.0117, - 6.5547, - 7.1250, - 6.4453, - 6.6094, - 6.1875, - 6.4219, - 6.6172, - 6.4336, - 6.5703, - 6.1758, - 6.4219, - 6.6016, - 6.7383, - 6.7070, - 6.1328, - 5.5586, - 6.6367, - 6.3789, - 6.2578, - 5.5039, - 6.6172, - 6.4648, - 5.8086, - 7.2031, - 5.8125, - 6.3711, - 7.6758, - 7.1289, - 5.8086, - 6.3008, - 6.2109, - 6.1602, - 6.1797, - 7.2305, - 6.7266, - 6.2422, - 5.6719, - 6.7070, - 6.9414, - 6.8594, - 7.4023, - 7.2109, - 6.0156, - 6.6680, - 6.6172, - 7.1250, - 6.6523, - 6.9531, - 6.7617, - 6.4961, - 6.9414, - 5.7188, - 7.6367, - 6.5469, - 6.2305, - 6.4414, - 7.4648, - 5.9102, - 6.2461, - 6.1367, - 6.8203, - 6.5703, - 6.8867, - 7.0000, - 6.7539, - 6.1719, - 6.5469, - 6.2422, - 5.4297, - 5.7305, - 5.1641, - 6.1875, - 7.0312, - 6.6484, - 6.0234, - 7.4102, - 6.8711, - 6.3086, - 6.3711, - 6.7344, - 6.6992, - 5.9766, - 7.3906, - 7.1875, - 6.4883, - 6.3984, - 7.3438, - 6.9688, - 6.9062, - 6.4375, - 6.7891, - 7.0117, - 6.4883, - 5.7500, - 7.0898, - 7.0742, - 6.7070, - 5.8750, - 6.0469, - 6.6445, - 5.2773, - 6.8984, - 6.1641, - 7.0508, - 7.4609, - 5.0273, - 6.7734, - 6.4531, - 5.7656, - 6.5312, - 7.4648, - 6.1250, - 6.5625, - 7.1367, - 6.0625, - 6.1211, - 6.9766, - 6.6758, - 6.3164, - 6.8828, - 6.8203, - 6.7500, - 6.5352, - 7.3008, - 6.7852, - 6.1914, - 5.0508, - 6.7188, - 7.1172, - 6.8008, - 6.8086, - 5.4883, - 6.9180, - 6.5742, - 6.1719, - 7.0469, - 7.1523, - 5.9492, - 5.8594, - 6.8320, - 6.1719, - 6.2031, - 6.8398, - 7.3008, - 6.6289, - 6.4922, - 6.0000, - 5.4766, - 6.3320, - 6.5117, - 6.2812, - 7.5742, - 6.3516, - 7.0039, - 6.4570, - 7.1523, - 7.6289, - 6.2578, - 7.1875, - 6.4844, - 5.7930, - 6.7070, - 7.5508, - 7.1797, - 6.0430, - 6.8711, - 6.5742, - 7.5781, - 6.4766, - 6.5391, - 6.9453, - 6.1992, - 6.6367, - 6.2812, - 6.0234, - 6.6953, - 7.0312, - 6.2031, - 6.5625, - 6.6719, - 6.1719, - 6.5586, - 5.7031, - 7.4609, - 6.6211, - 7.7227, - 6.9141, - 6.0469, - 6.2500, - 5.3828, - 6.0078, - 5.8164, - 5.8867, - 6.1523, - 6.6523, - 6.6953, - 7.3125, - 6.4844, - 5.9570, - 5.9531, - 6.2109, - 5.5039, - 6.5117, - 6.8203, - 6.6133, - 6.4766, - 5.9297, - 7.1445, - 7.1914, - 6.0117, - 6.8281, - 6.7422, - 6.1328, - 6.9805, - 6.5625, - 6.9180, - 7.1133, - 7.3359, - 5.7617, - 5.8711, - 6.4961, - 6.5859, - 6.2422, - 6.5273, - 6.7461, - 6.6992, - 6.7695, - 6.6289, - 5.9453, - 5.9805, - 7.1172, - 6.6719, - 6.0039, - 7.6875, - 6.7812, - 7.8359, - 6.9531, - 7.4336, - 7.6602, - 6.8164, - 7.3945, - 7.1602, - 6.8789, - 5.0078, - 6.0547, - 6.8086, - 6.7070, - 6.4688, - 6.4492, - 6.6172, - 5.5625, - 6.6914, - 6.4297, - 5.7461, - 5.3359, - 6.8750, - 6.4609, - 7.4062, - 5.2070, - 6.0820, - 6.7383, - 6.5703, - 6.1797, - 6.7070, - 6.5977, - 5.9961, - 6.6328, - 6.9375, - 6.3906, - 6.6484, - 4.9609, - 6.6445, - 6.5898, - 7.1875, - 7.5195, - 6.7969, - 6.1367, - 6.8906, - 7.4297, - 6.3633, - 6.0508, - 6.5000, - 6.4648, - 6.7539, - 6.7109, - 5.8086, - 6.6016, - 7.1133, - 4.8672, - 6.6367, - 6.1641, - 5.1758, - 6.9453, - 6.3242, - 7.0664, - 6.4805, - 6.3516, - 6.7383, - 8.4688, - 6.7305, - 5.9844, - 6.5938, - 7.2969, - 6.5977, - 7.5898, - 6.2969, - 6.8672, - 6.6680, - 7.1289, - 6.6875, - 5.4258, - 8.1875, - 8.0391, - 7.7969, - 6.6445, - 7.0703, - 7.3359, - 6.9805, - 6.6328, - 6.5352, - 6.2422, - 5.5820, - 6.8633, - 6.8047, - 6.5703, - 6.0117, - 6.7539, - 7.1719, - 6.8438, - 7.3633, - 6.6016, - 7.2070, - 6.4727, - 5.8008, - 7.4062, - 7.4805, - 6.6445, - 5.9023, - 6.3984, - 6.9961, - 6.6680, - 6.8242, - 6.7148, - 6.6172, - 6.9727, - 6.8320, - 5.9766, - 6.6133, - 5.5977, - 6.7773, - 7.3906, - 6.9219, - 7.0781, - 6.6914, - 5.7539, - 6.7969, - 6.8008, - 5.8047, - 7.1055, - 6.4961, - 6.0352, - 5.6211, - 7.4414, - 7.0703, - 6.1172, - 6.7461, - 6.4492, - 7.7148, - 6.4258, - 6.0039, - 6.5156, - 7.2188, - 7.4531, - 7.4844, - 7.5938, - 7.4023, - 6.7617, - 6.0078, - 6.3320, - 5.8906, - 7.5977, - 5.6523, - 6.7734, - 6.3008, - 5.2227, - 7.1719, - 7.1289, - 6.6602, - 5.4609, - 7.0312, - 6.0820, - 6.1719, - 6.0000, - 6.5547, - 6.6328, - 7.0547, - 7.0859, - 6.2656, - 5.5234, - 6.0273, - 6.7891, - 7.1875, - 6.9531, - 6.8203, - 6.3516, - 6.1172, - 6.4648, - 6.9180, - 7.3906, - 6.2812, - 5.7109, - 6.1484, - 6.9102, - 6.8711, - 7.0156, - 6.1445, - 5.8867, - 6.3828, - 5.9961, - 6.6914, - 6.7891, - 7.0820, - 6.6719, - 6.9297, - 6.3750, - 6.7578, - 6.4883, - 6.2227, - 6.2305, - 6.0508, - 6.6484, - 5.7578, - 7.2070, - 7.2383, - 6.9375, - 7.2578, - 6.5312, - 6.0312, - 6.7930, - 6.2578, - 7.0625, - 7.2148, - 6.4961, - 7.0703, - 6.4727, - 7.3906, - ] -).to(torch.float16) - -MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit" - -def get_diff(a, ref): - eps = 1e-6 - return f"Maxdiff: {(a - ref).abs().max()}, Mean relative diff: {((a - ref).abs() / (ref.abs() + eps)).mean()}" - -class TestsQ4ExllamaV1(ModelTest): - def test_exllama(self): - group_size = 128 - - m = 1 - k = 1024 - n = 1024 - device = torch.device("cuda:0") - - pack_dtype = torch.int32 - - linear_class = select_quant_linear( - bits=4, - group_size=group_size, - desc_act=False, - sym=True, - backend=BACKEND.EXLLAMA_V1, - format=FORMAT.GPTQ, - quant_method=METHOD.GPTQ, - pack_dtype=pack_dtype, - device=DEVICE.CUDA, - ) - - linear = linear_class( - bits=4, - group_size=group_size, - desc_act=False, - sym=True, - in_features=k, - out_features=n, - bias=False, - pack_dtype=pack_dtype, - backend=BACKEND.EXLLAMA_V1, - ) - self.assertTrue(isinstance(linear, ExllamaQuantLinear)) - - torch.manual_seed(42) - - linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32) - linear.scales = linear.scales + 0.002 - linear.qzeros += 0b00010001000100010001000100010001 # for new weight format - - linear = linear.eval() - linear = linear.to(device) - - linear = gptqmodel_post_init(linear, use_act_order=False) - - max_inner_outer_dim = max(k, n) - max_dq_buffer_size = linear.in_features * linear.out_features - max_input_len = 2048 - buffers = { - "temp_state": torch.zeros((max_input_len, max_inner_outer_dim), dtype=torch.float16, device=device), - "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device), - } - - prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"]) - - # Using the default from exllama repo here. - matmul_recons_thd = 8 - matmul_fused_remap = False - matmul_no_half2 = False - set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2) - - inp = torch.rand(1, m, k, dtype=torch.float16).to(device) - - with torch.inference_mode(): - res = linear(inp)[0][0] - - reference = REFERENCE.to(device) - - self.assertTrue( - torch.allclose(res, reference, rtol=3e-5, atol=2e-2), - get_diff(res, reference), - ) - - def test_exllama_buffer_size(self): - model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-sharded/" - prompt = "I am in Paris and" * 450 - device = torch.device("cuda:0") - - revision = "desc_act_true" - - model_q = GPTQModel.from_quantized( - model_id, - revision=revision, - device="cuda:0", - backend=BACKEND.EXLLAMA_V1, - ) - tokenizer = AutoTokenizer.from_pretrained(model_id) - - inp = tokenizer(prompt, return_tensors="pt").to(device) - - self.assertTrue( - inp["input_ids"].shape[1] > EXLLAMA_DEFAULT_MAX_INPUT_LENGTH - ) # 2048 is the default max_input_length - - model_q = exllama_set_max_input_length(model_q, 4096) - - _ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) - - model_q = exllama_set_max_input_length(model_q, 1034) - - with self.assertRaises(RuntimeError) as cm: - _ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3) - self.assertIn("temp_state buffer is too small", str(cm.exception)) - - def test_generation_desc_act_false(self): - prompt = "The capital of France is" - device = torch.device("cuda:0") - - model_q = GPTQModel.from_quantized( - MODEL_ID, - device="cuda:0", - backend=BACKEND.EXLLAMA_V1, - ) - tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - - predicted_text = ModelTest.generate_stable_with_limit( - model_q, - tokenizer, - prompt, - min_new_tokens=60, - max_new_tokens=60, - skip_special_tokens=False, - ) - - print("predicted_text", predicted_text) - assert "paris" in predicted_text.lower() or "city" in predicted_text.lower() or "country" in predicted_text.lower() - - def test_generation_desc_act_true(self): - revision = "desc_act_true" - - model_q = GPTQModel.from_quantized( - MODEL_ID, - revision=revision, - device="cuda:0", - backend=BACKEND.EXLLAMA_V1, - ) - - self.NATIVE_ARC_CHALLENGE_ACC = 0.285 - self.NATIVE_ARC_CHALLENGE_ACC_NORM = 0.314 - task_results = self.lm_eval(model=model_q, delete_quantized_model=False) - self.check_results(task_results) - - def test_multigpu(self): - # TODO - pass diff --git a/tests/test_q4_exllama_v2.py b/tests/test_q4_exllama_v2.py index 4f973c39c..c02d6d34c 100644 --- a/tests/test_q4_exllama_v2.py +++ b/tests/test_q4_exllama_v2.py @@ -16,7 +16,7 @@ import torch # noqa: E402 from models.model_test import ModelTest # noqa: E402 -from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402 +from test_q4_reference import REFERENCE, get_diff # noqa: E402 from transformers import AutoTokenizer # noqa: E402 from gptqmodel import BACKEND, GPTQModel # noqa: E402 diff --git a/tests/test_quant_and_eora.py b/tests/test_quant_and_eora.py index 812929eea..d2f1d2372 100644 --- a/tests/test_quant_and_eora.py +++ b/tests/test_quant_and_eora.py @@ -123,7 +123,6 @@ def test_quant_and_eora(self, quant_method: METHOD, format: FORMAT): del model torch_empty_cache() - # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, for backend in [BACKEND.MARLIN]: # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN base_bench = self.bench(path=tmpdir, backend=backend, adapter=None) # inference using qweights only eora_bench = self.bench(path=tmpdir, backend=backend, adapter=eora) # inference using eora (lora) @@ -266,7 +265,6 @@ def test_quant_and_eora(self): del model torch_empty_cache() - # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, for backend in [BACKEND.MARLIN]: # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN eora_bench = self.bench(path=tmpdir, backend=backend, adapter=eora) # inference using eora (lora) base_bench = self.bench(path=tmpdir, backend=backend, adapter=None) # inference using qweights only diff --git a/tests/test_save_loaded_quantized_model.py b/tests/test_save_loaded_quantized_model.py index 88c1c4bdd..1b828d652 100644 --- a/tests/test_save_loaded_quantized_model.py +++ b/tests/test_save_loaded_quantized_model.py @@ -48,7 +48,6 @@ def _generate_or_skip(self, model, backend: BACKEND, tokenizer, prompt, **kwargs [ (BACKEND.AUTO), (BACKEND.EXLLAMA_V2), - # (BACKEND.EXLLAMA_V1), (BACKEND.TRITON), (BACKEND.BITBLAS), (BACKEND.MARLIN),