diff --git a/MANIFEST.in b/MANIFEST.in
index 7f0629d20..8dd1afe0e 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,4 @@
 recursive-include gptqmodel_ext/awq *.h *.cuh *.cu *.cpp
-recursive-include gptqmodel_ext/exllama *.h *.cuh *.cu *.cpp
 recursive-include gptqmodel_ext/exllamav2 *.h *.cuh *.cu *.cpp
 recursive-include gptqmodel_ext/exllama_eora/eora *.h *.cuh *.cu *.cpp *.py
 recursive-include gptqmodel_ext/marlin *.h *.cuh *.cu *.cpp *.hpp
diff --git a/examples/README.md b/examples/README.md
index e42d5b3d4..29b841be3 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -27,7 +27,7 @@ python basic_usage_bitblas.py
 
 To Execute `basic_usage_exllama.py`, using command like this:
 ```shell
-python basic_usage_exllama.py --backend EXLLAMA/EXLLAMA_V2
+python basic_usage_exllama.py --backend EXLLAMA_V2
 ```
 
 To Execute `basic_usage_marlin.py`, using command like this:
@@ -104,4 +104,3 @@ CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_id_or_path PATH/TO/MOD
 ```
 
 Use `--help` flag to see detailed descriptions for more command arguments.
-
diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py
index 4c0978702..80f0f004d 100644
--- a/examples/benchmark/perplexity.py
+++ b/examples/benchmark/perplexity.py
@@ -42,7 +42,7 @@
     parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?")
     parser.add_argument("--use_fast_tokenizer", action="store_true", help="Whether to use fast tokenizer")
     parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code")
-    parser.add_argument("--backend", choices=['auto', 'marlin', 'exllama_v1', 'exllama_v2', 'triton', 'cuda', 'torch', 'ipex', 'bitblas'], default='auto', help="Whether to use BACKEND format")
+    parser.add_argument("--backend", choices=['auto', 'marlin', 'exllama_v2', 'triton', 'cuda', 'torch', 'ipex', 'bitblas'], default='auto', help="Whether to use BACKEND format")
     args = parser.parse_args()
 
     tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=args.use_fast_tokenizer)
diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py
index 0da3fa820..ff85a5a3d 100644
--- a/gptqmodel/__init__.py
+++ b/gptqmodel/__init__.py
@@ -52,7 +52,6 @@
 from .models.auto import ASCII_LOGO
 from .quantization import BaseQuantizeConfig, GPTAQConfig, QuantizeConfig
 from .utils import BACKEND
-from .utils.exllama import exllama_set_max_input_length
 from .version import __version__
 
 
diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py
index 2085d44bb..d0f0c4f06 100644
--- a/gptqmodel/models/_const.py
+++ b/gptqmodel/models/_const.py
@@ -128,6 +128,4 @@ def get_best_device(backend: BACKEND = BACKEND.AUTO) -> torch.device:
     else:
         return CPU
 
-EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048
-
 EXPERT_INDEX_PLACEHOLDER = "{expert_index}"
diff --git a/gptqmodel/nn_modules/qlinear/exllama.py b/gptqmodel/nn_modules/qlinear/exllama.py
deleted file mode 100644
index 8f677dbaa..000000000
--- a/gptqmodel/nn_modules/qlinear/exllama.py
+++ /dev/null
@@ -1,170 +0,0 @@
-# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
-# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
-# SPDX-License-Identifier: Apache-2.0
-# Contact: qubitium@modelcloud.ai, x.com/qubitium
-
-# Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-from typing import List, Optional, Tuple
-
-import torch
-
-from ...adapter.adapter import Adapter, Lora
-from ...models._const import DEVICE, PLATFORM
-from ...quantization import FORMAT, METHOD
-from ...utils.backend import BACKEND
-from ...utils.logger import setup_logger
-from . import BaseQuantLinear
-
-
-log = setup_logger()
-
-# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
-NONE_TENSOR = torch.empty((1, 1), device="meta")
-
-class ExllamaQuantLinear(BaseQuantLinear):
-    SUPPORTS_BACKENDS = [BACKEND.EXLLAMA_V1]
-    SUPPORTS_METHODS = [METHOD.GPTQ]
-    SUPPORTS_FORMATS = {FORMAT.GPTQ: 70, FORMAT.GPTQ_V2: 70}
-    SUPPORTS_BITS = [4]
-    SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128]
-    SUPPORTS_DESC_ACT = [True, False]
-    SUPPORTS_SYM = [True, False]
-    SUPPORTS_SHARDS = True
-    SUPPORTS_TRAINING = False
-    SUPPORTS_AUTO_PADDING = False
-    SUPPORTS_IN_FEATURES_DIVISIBLE_BY = [32]
-    SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32]
-
-    SUPPORTS_DEVICES = [DEVICE.CUDA, DEVICE.ROCM]
-    SUPPORTS_PLATFORM = [PLATFORM.LINUX]
-    SUPPORTS_PACK_DTYPES = [torch.int32]
-    SUPPORTS_ADAPTERS = [Lora]
-
-    SUPPORTS_DTYPES = [torch.float16, torch.bfloat16]
-
-    REQUIRES_FORMAT_V2 = True
-
-    # for transformers/optimum tests compat
-    QUANT_TYPE = "exllama"
-
-    gptqmodel_exllama_kernels = None
-
-    """Linear layer implementation with per-group 4-bit quantization of the weights"""
-
-    def __init__(
-        self,
-        bits: int,
-        group_size: int,
-        desc_act: bool,
-        sym: bool,
-        in_features: int,
-        out_features: int,
-        bias: bool = False,
-        pack_dtype: torch.dtype = torch.int32,
-        adapter: Adapter = None,
-        register_buffers: bool = True,
-        **kwargs,
-    ):
-        # backup original values
-        # self.original_out_features = out_features
-        # self.original_in_features = in_features
-        #
-        # # auto pad
-        # group_size = group_size if group_size != -1 else in_features
-        # out_features = out_features + (-out_features % 32)
-        # in_features = in_features + (-in_features % group_size)
-        # self.in_features_padding_size = in_features - self.original_in_features
-        # self.in_features_padding_shape = (0, self.in_features_padding_size)
-
-        super().__init__(
-            bits=bits,
-            group_size=group_size,
-            sym=sym, desc_act=desc_act,
-            in_features=in_features,
-            out_features=out_features,
-            bias=bias,
-            pack_dtype=pack_dtype,
-            backend=kwargs.pop("backend", BACKEND.EXLLAMA_V1),
-            adapter=adapter,
-            register_buffers=register_buffers,
-            register_buffers_in_features=in_features,
-            register_buffers_out_feature=out_features,
-            **kwargs)
-
-    @classmethod
-    def validate_once(cls) -> Tuple[bool, Optional[Exception]]:
-        try:
-            import gptqmodel_exllama_kernels
-            cls.gptqmodel_exllama_kernels = gptqmodel_exllama_kernels
-            return True, None
-        except ImportError as e:
-            return False, e
-
-    def post_init(self):
-        # resize due to padding after model weights have been loaded
-        # if self.out_features != self.original_out_features or self.in_features != self.original_in_features:
-        #     self.qweight.resize_(self.in_features // self.pack_dtype_bits * self.bits, self.out_features)
-        #     self.qzeros.resize_(
-        #         math.ceil(self.in_features / self.group_size),
-        #         self.out_features // self.pack_dtype_bits * self.bits
-        #     )
-        #     self.scales.resize_((math.ceil(self.in_features / self.group_size), self.out_features), )
-        #     self.g_idx = torch.tensor([i // self.group_size for i in range(self.in_features)], dtype=torch.int32, device=self.g_idx.device)
-        #     if self.bias is not None:
-        #         self.bias.resize_(self.out_features)
-
-        # ext_make_q4 only accept float16 scales
-        self.scales = self.scales.to(dtype=torch.float16)
-
-        self.width = self.qweight.shape[1]
-
-        # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
-        self.q4 = self.gptqmodel_exllama_kernels.make_q4(self.qweight, self.qzeros, self.scales, self.g_idx.to("cpu") if self._use_act_order else NONE_TENSOR, self.qweight.device.index)
-
-        super().post_init()
-
-    def list_buffers(self) -> List:
-        buf = super().list_buffers()
-        if hasattr(self, "q4") and self.q4 is not None:
-            buf.append(self.q4)
-        return buf
-
-    def ext_q4_matmul(self, x, q4, q4_width):
-        """Matrix multiplication, returns x @ q4"""
-        outshape = x.shape[:-1] + (q4_width,)
-        x = x.view(-1, x.shape[-1])
-
-        output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)
-        self.gptqmodel_exllama_kernels.q4_matmul(x, q4, output)
-
-        if self.bias is not None:
-            output.add_(self.bias)
-
-        if self.adapter:
-            output = self.adapter.apply(x=x, out=output)
-
-        return output.view(outshape)
-
-    def forward(self, x: torch.Tensor):
-        # TODO FIXME: parent should never call us if there is no data to process
-        # check: https://github.com/ModelCloud/GPTQModel/issues/1361
-        if x.shape[0] == 0:
-            return torch.empty((0, self.out_features), dtype=x.dtype, device=x.device)
-
-        x_dtype = x.dtype
-        if x_dtype != torch.float16:
-            #log.warn.once(
-            #    f"Exllama kernel requires a float16 input activation, while {x.dtype} was passed. Casting to float16.\nMake sure you loaded your model with torch_dtype=torch.float16, that the model definition does not inadvertently cast to float32, or disable AMP Autocast that may produce float32 intermediate activations in the model."
-            #)
-
-            x = x.to(dtype=torch.float16)
-
-        # TODO: need to run checks to make sure there is no performance regression padding with F.pad
-        # if in_features is padded, we need to pad the input as well
-        # if x.size(-1) != self.in_features:
-        #     x = F.pad(x, self.in_features_padding_shape)
-
-        out = self.ext_q4_matmul(x, self.q4, self.width)
-
-        return out.to(x_dtype)
diff --git a/gptqmodel/nn_modules/qlinear/exllama_awq.py b/gptqmodel/nn_modules/qlinear/exllama_awq.py
deleted file mode 100644
index a875388ac..000000000
--- a/gptqmodel/nn_modules/qlinear/exllama_awq.py
+++ /dev/null
@@ -1,146 +0,0 @@
-# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
-# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
-# SPDX-License-Identifier: Apache-2.0
-# Contact: qubitium@modelcloud.ai, x.com/qubitium
-
-import torch
-
-from ...adapter.adapter import Adapter, Lora
-from ...models._const import DEVICE, PLATFORM
-from ...nn_modules.qlinear import AWQuantLinear
-from ...quantization import FORMAT, METHOD
-from ...quantization.awq.utils.module import try_import
-from ...quantization.awq.utils.packing_utils import unpack_reorder_pack
-from ...utils.backend import BACKEND
-from ...utils.logger import setup_logger
-
-
-log = setup_logger()
-
-exl_ext, msg = try_import("gptqmodel_exllama_kernels")
-
-# Dummy tensor to pass instead of g_idx since there is no way to pass "None" to a C++ extension
-none_tensor = torch.empty((1, 1), device="meta")
-
-
-class AwqExllamaQuantLinear(AWQuantLinear):
-    SUPPORTS_BACKENDS = [BACKEND.EXLLAMA_V1]
-    SUPPORTS_METHODS = [METHOD.AWQ]
-    SUPPORTS_FORMATS = {FORMAT.GEMM: 70}
-    SUPPORTS_BITS = [4]
-    SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128]
-    SUPPORTS_DESC_ACT = [True, False]
-    SUPPORTS_SYM = [True, False]
-    SUPPORTS_SHARDS = True
-    SUPPORTS_TRAINING = False
-    SUPPORTS_AUTO_PADDING = False
-    SUPPORTS_IN_FEATURES_DIVISIBLE_BY = [32]
-    SUPPORTS_OUT_FEATURES_DIVISIBLE_BY = [32]
-
-    SUPPORTS_DEVICES = [DEVICE.CUDA, DEVICE.ROCM]
-    SUPPORTS_PLATFORM = [PLATFORM.LINUX]
-    SUPPORTS_PACK_DTYPES = [torch.int32]
-    SUPPORTS_ADAPTERS = [Lora]
-
-    SUPPORTS_DTYPES = [torch.float16, torch.bfloat16]
-
-    REQUIRES_FORMAT_V2 = True
-
-    # for transformers/optimum tests compat
-    QUANT_TYPE = "awq_exllama"
-
-    def __init__(
-            self,
-            bits: int,
-            group_size: int,
-            sym: bool,
-            desc_act: bool,
-            in_features: int,
-            out_features: int,
-            bias: bool = False,
-            pack_dtype: torch.dtype = torch.int32,
-            adapter: Adapter = None,
-            **kwargs,
-    ):
-        super().__init__(
-            bits=bits,
-            group_size=group_size,
-            sym=sym,
-            desc_act=desc_act,
-            in_features=in_features,
-            out_features=out_features,
-            bias=bias,
-            pack_dtype=pack_dtype,
-            backend=kwargs.pop("backend", BACKEND.EXLLAMA_V1),
-            adapter=adapter,
-            **kwargs)
-
-    def post_init(self):
-        # if self.padded_infeatures != self.in_features:
-        #     self.qweight.resize_(self.padded_infeatures // self.pack_dtype_bits * self.bits, self.out_features)
-        #     self.qzeros.resize_(
-        #         math.ceil(self.padded_infeatures / self.group_size),
-        #         self.out_features // self.pack_dtype_bits * self.bits
-        #     )
-        #     self.scales.resize_((math.ceil(self.padded_infeatures / self.group_size), self.out_features), )
-        #     self.g_idx = torch.tensor([i // self.group_size for i in range(self.padded_infeatures)], dtype=torch.int32,
-        #                               device=self.g_idx.device)
-
-        if exl_ext is None:
-            raise ModuleNotFoundError("External ExLlama kernels are not properly installed." + msg)
-
-        # awq only accepts float16
-        self.scales = self.scales.to(dtype=torch.float16)
-
-        assert self.qweight.device.type == "cuda"
-        assert self.qweight.device.index is not None
-
-        self.qweight, self.qzeros = unpack_reorder_pack(
-            self.qweight, self.qzeros, self.bits
-        )
-        self.q4 = exl_ext.make_q4(
-            self.qweight,
-            self.qzeros,
-            self.scales,
-            none_tensor,  # g_idx
-            self.qweight.device.index,  # device index
-        )
-
-        super().post_init()
-
-    def forward(self, x: torch.Tensor):
-        assert self.q4 is not None, (
-            "module.post_init() must be called before module.forward(). "
-            "Use exllama_post_init() on the whole model."
-        )
-        if exl_ext is None:
-            raise ModuleNotFoundError("External ExLlama kernels are not properly installed." + msg)
-
-        input_dtype = x.dtype
-        out_shape = x.shape[:-1] + (self.out_features,)
-
-        if input_dtype != torch.float16:
-            x = x.to(dtype=torch.float16)
-
-        x = x.view(-1, x.shape[-1])
-
-        out = torch.empty(
-            (x.shape[0], self.out_features),
-            dtype=torch.float16,
-            device=x.device,
-        )
-        exl_ext.q4_matmul(x, self.q4, out)
-
-        if input_dtype != torch.float16:
-            out = out.to(dtype=input_dtype)
-
-        if self.bias is not None:
-            out.add_(self.bias)
-
-        if self.adapter:
-            out = self.adapter.apply(x=x, out=out)
-
-        return out.view(out_shape)
-
-
-__all__ = ["AwqExllamaQuantLinear"]
diff --git a/gptqmodel/utils/backend.py b/gptqmodel/utils/backend.py
index b715e035e..5912dfd04 100644
--- a/gptqmodel/utils/backend.py
+++ b/gptqmodel/utils/backend.py
@@ -15,7 +15,6 @@ class BACKEND(str, Enum):
     TORCH_INT8 = "torch_int8" # optimized CPU int8 fused kernel
     TORCH = "torch" # GOOD: about 80% of triton
     TRITON = "triton" # VERY GOOD: all-around kernel
-    EXLLAMA_V1 = "exllama_v1" # FAST: optimized for batching == 1
     EXLLAMA_V2 = "exllama_v2" # FASTER: optimized for batching > 1
     EXLLAMA_EORA = "exllama_eora"
     MACHETE = "machete" # CUTLASS-based kernel optimized for Hopper (SM90+)
diff --git a/gptqmodel/utils/exllama.py b/gptqmodel/utils/exllama.py
deleted file mode 100644
index 053bbc894..000000000
--- a/gptqmodel/utils/exllama.py
+++ /dev/null
@@ -1,83 +0,0 @@
-# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
-# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
-# SPDX-License-Identifier: Apache-2.0
-# Contact: qubitium@modelcloud.ai, x.com/qubitium
-
-import torch
-
-from ..nn_modules.qlinear.exllama import ExllamaQuantLinear
-from .torch import torch_empty_cache
-
-
-def exllama_set_max_input_length(model, max_input_length: int):
-    """
-    This method does not necessarily require `model` to inherit from BaseGPTQForCausalLM.
-
-    When using the exllama backend with act-order, it is necessary to initialize a buffer that depends on the maximum expected input length. In case the
-    default used (EXLLAMA_DEFAULT_MAX_INPUT_LENGTH) is too short, this method can be called to extend the buffer size without reloading the whole model.
-    """
-
-    # The import is set here to avoid a global import. Arguably this is quite ugly, it would be better to have lazy loading.
-    from gptqmodel_exllama_kernels import cleanup_buffers_cuda, prepare_buffers
-
-    if not model.quantize_config.desc_act:
-        raise ValueError(
-            "The method exllama_set_max_input_length should be called only when using the exllama backend **with act-order**."
-        )
-
-    uses_exllama_v1 = False
-    for name, submodule in model.named_modules():
-        if isinstance(submodule, ExllamaQuantLinear):
-            uses_exllama_v1 = True
-            break
-
-    if not uses_exllama_v1:
-        raise ValueError(
-            f"The function exllama_set_max_input_length was called, but the model (instance of {model.__class__.__name__}) does not use the exllama backend for GPTQ. An other implementation is used (exllamav2, triton) and that the call to exllama_set_max_input_length is unnecessary. Please remove the call to exllama_set_max_input_length or use the exllama v1 backend."
-        )
-
-    device_to_buffers_size = {}
-    for device, buffers in model.device_to_buffers.items():
-        device_to_buffers_size[device] = {
-            "max_dq_buffer_size": buffers["max_dq_buffer_size"],
-            "max_inner_outer_dim": buffers["max_inner_outer_dim"],
-        }
-
-    # For an unknown reason calling just `del model.device_to_buffers` raises an AttributeError.
-    for key in list(model.device_to_buffers.keys()):
-        del model.device_to_buffers[key]
-    model.device_to_buffers = None
-    del model.device_to_buffers
-
-    torch_empty_cache()
-    cleanup_buffers_cuda()
-
-    device_to_buffers = {}
-    for device, buffers_size in device_to_buffers_size.items():
-        # The temp_state buffer is required to reorder X in the act-order case.
-        # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
-        device_to_buffers[device] = {
-            "temp_state": torch.zeros(
-                (max_input_length, buffers_size["max_inner_outer_dim"]),
-                dtype=torch.float16,
-                device=device,
-            ),
-            "temp_dq": torch.zeros(
-                (1, buffers_size["max_dq_buffer_size"]),
-                dtype=torch.float16,
-                device=device,
-            ),
-            "max_dq_buffer_size": buffers_size["max_dq_buffer_size"],
-            "max_inner_outer_dim": buffers_size["max_inner_outer_dim"],
-        }
-
-        prepare_buffers(
-            device,
-            device_to_buffers[device]["temp_state"],
-            device_to_buffers[device]["temp_dq"],
-        )
-
-    # Buffers need to be persistent to avoid any bug.
-    model.device_to_buffers = device_to_buffers
-
-    return model
diff --git a/gptqmodel/utils/model.py b/gptqmodel/utils/model.py
index 3ba42dde7..072216a16 100644
--- a/gptqmodel/utils/model.py
+++ b/gptqmodel/utils/model.py
@@ -41,12 +41,10 @@
 from ..models._const import (
     CPU,
     DEVICE,
-    EXLLAMA_DEFAULT_MAX_INPUT_LENGTH,
     EXPERT_INDEX_PLACEHOLDER,
     SUPPORTS_MODULE_TYPES,
 )
 from ..nn_modules.qlinear import BaseQuantLinear
-from ..nn_modules.qlinear.exllama import ExllamaQuantLinear
 from ..nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear
 from ..nn_modules.qlinear.exllamav2_awq import AwqExllamaV2QuantLinear
 from ..quantization import FORMAT, QuantizeConfig
@@ -975,14 +973,8 @@ def hf_gptqmodel_post_init(model, use_act_order: bool, quantize_config: Quantize
 def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeConfig = None,
                         max_input_length: Optional[int] = None):
     """
-    The max_input_length argument is specific to the exllama backend, that requires to initialize a buffer temp_state.
+    Initialize model-persistent backend scratch buffers after quantized weights are loaded.
     """
-    # post init for bitblas backend.
-    device_to_buffers_size = {}
-    # exllama
-    model_uses_exllama = False
-
-    # exllamav2
     fixed_bytes = {}
     model_uses_exllamav2 = False
 
@@ -1000,86 +992,6 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon
                 max_batch_size=int(os.getenv("AWQ_BATCH_SIZE", 1))
             )
             fixed_bytes[device] = max(scratch_fixed, fixed_bytes.get(device, 0))
-        elif isinstance(submodule, ExllamaQuantLinear):
-            model_uses_exllama = True
-            device = submodule.qweight.device
-            if device not in device_to_buffers_size:
-                device_to_buffers_size[device] = {
-                    "max_dq_buffer_size": 1,
-                    "max_inner_outer_dim": 1,
-                }
-            submodule._use_act_order = True if use_act_order else False
-
-            # Disable this heuristic for detecting act_order, but it could be used instead of the config.
-            """
-            if submodule.g_idx is None:
-                submodule.act_order = False
-            elif submodule.g_idx is not None and ((submodule.g_idx == 0).all() or torch.equal(submodule.g_idx.cpu(), torch.tensor([i // submodule.group_size for i in range(submodule.g_idx.shape[0])], dtype=torch.int32))):
-                submodule.g_idx = None
-                submodule.act_order = False
-            else:
-                submodule.act_order = True
-            """
-
-            device_to_buffers_size[device]["max_dq_buffer_size"] = max(
-                device_to_buffers_size[device]["max_dq_buffer_size"],
-                submodule.qweight.numel() * 8,
-                )
-
-            if use_act_order:
-                device_to_buffers_size[device]["max_inner_outer_dim"] = max(
-                    device_to_buffers_size[device]["max_inner_outer_dim"],
-                    submodule.in_features,
-                    submodule.out_features,
-                )
-
-    if model_uses_exllama:
-        # To be honest this is quite ugly, not proud of this.
-        from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params
-
-        device_to_buffers = {}
-
-        if use_act_order:
-            if max_input_length is None:
-                max_input_len = EXLLAMA_DEFAULT_MAX_INPUT_LENGTH
-            else:
-                max_input_len = max_input_length
-        else:
-            if max_input_length is not None:
-                log.info(
-                    "Using exllama backend without act-order, the parameter max_input_length was set although not needed, it will be ignored."
-                )
-            max_input_len = 1
-
-        for device, buffers_size in device_to_buffers_size.items():
-            # The temp_state buffer is required to reorder X in the act-order case.
-            # The temp_dq buffer is required to dequantize weights when using cuBLAS, typically for the prefill.
-            device_to_buffers[device] = {
-                "temp_state": torch.zeros(
-                    (max_input_len, buffers_size["max_inner_outer_dim"]),
-                    dtype=torch.float16,
-                    device=device,
-                ),
-                "temp_dq": torch.zeros(
-                    (1, buffers_size["max_dq_buffer_size"]),
-                    dtype=torch.float16,
-                    device=device,
-                ),
-                "max_dq_buffer_size": buffers_size["max_dq_buffer_size"],
-                "max_inner_outer_dim": buffers_size["max_inner_outer_dim"],
-            }
-
-        # Buffers need to be persistent to avoid any bug.
-        model.device_to_buffers = device_to_buffers
-
-        for device, buffers in model.device_to_buffers.items():
-            prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"])
-
-        # Using the default from exllama repo here.
-        matmul_recons_thd = 16
-        matmul_fused_remap = False
-        matmul_no_half2 = False
-        set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
 
     if model_uses_exllamav2:
         from ..utils.exllamav2 import ScratchSpace
@@ -1102,9 +1014,6 @@ def gptqmodel_post_init(model, use_act_order: bool, quantize_config: QuantizeCon
 
     torch_empty_cache()
 
-    # if use_act_order and max_input_length and isinstance(submodule, ExllamaQuantLinear):
-    #     model = exllama_set_max_input_length(model, max_input_length)
-
     return model
 
 
diff --git a/gptqmodel_ext/exllama/cu_compat.cuh b/gptqmodel_ext/exllama/cu_compat.cuh
deleted file mode 100644
index c5258813e..000000000
--- a/gptqmodel_ext/exllama/cu_compat.cuh
+++ /dev/null
@@ -1,58 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _cuda_compat_cuh
-#define _cuda_compat_cuh
-
-// atomicAdd for half types, to support CC < 7.x
-
-__device__ __forceinline__ void atomicAdd_half(half* address, half val)
-{
-    unsigned int * address_as_ui = (unsigned int *) ((char *)address - ((size_t)address & 2));
-    unsigned int old = *address_as_ui;
-    unsigned int assumed;
-
-    do
-    {
-        assumed = old;
-        __half_raw hsum;
-        hsum.x = (size_t)address & 2 ? (old >> 16) : (old & 0xffff);
-        half tmpres = __hadd(hsum, val);
-        hsum = __half_raw(tmpres);
-        old = (size_t)address & 2 ? (old & 0xffff) | (hsum.x << 16) : (old & 0xffff0000) | hsum.x;
-        old = atomicCAS(address_as_ui, assumed, old);
-    }
-    while (assumed != old);
-}
-
-// atomicAdd for half2 types
-
-__device__ __forceinline__ void atomicAdd_half2(half2* address, half2 val)
-{
-    unsigned int* address_as_ui = (unsigned int*)address;
-    unsigned int old = *address_as_ui;
-    unsigned int assumed;
-    do
-    {
-        assumed = old;
-        half2 old_val = *((half2*)&old);
-        half2 new_val = __hadd2(old_val, val);
-        old = atomicCAS(address_as_ui, assumed, *((unsigned int*)&new_val));
-    }
-    while (assumed != old);
-}
-
-//
-
-#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
-#if __CUDA_ARCH__ < 700 || defined(USE_ROCM)
-
-__device__ __forceinline__ void atomicAdd(half* address, half val) { atomicAdd_half(address, val); }
-
-#if __CUDA_ARCH__ < 600 || defined(USE_ROCM)
-__device__ __forceinline__ void atomicAdd(half2* address, half2 val) { atomicAdd_half2(address, val); }
-#endif
-
-#endif
-#endif
-
-#endif
diff --git a/gptqmodel_ext/exllama/cuda_buffers.cu b/gptqmodel_ext/exllama/cuda_buffers.cu
deleted file mode 100644
index 4416027c8..000000000
--- a/gptqmodel_ext/exllama/cuda_buffers.cu
+++ /dev/null
@@ -1,75 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#define _cuda_buffers_cu
-#include "cuda_buffers.cuh"
-
-CudaBuffers* g_buffers[CUDA_MAX_DEVICES] = {NULL};
-// __constant__ half2 q4_table[16][256];
-// half2 q4_table_host[16][256];
-// bool q4_table_init = false;
-
-CudaBuffers::CudaBuffers
-(
-    int _device,
-    int _temp_state_size,
-    half* _temp_state,
-    half* _temp_dq
-) :
-    device(_device),
-    temp_state_size(_temp_state_size),
-    temp_state(_temp_state),
-    temp_dq(_temp_dq)
-{
-    cudaSetDevice(_device);
-
-    cudaStreamCreate(&alt_stream_1);
-    cudaStreamCreate(&alt_stream_2);
-    cudaStreamCreate(&alt_stream_3);
-    cudaEventCreate(&alt_stream_1_done);
-    cudaEventCreate(&alt_stream_2_done);
-    cudaEventCreate(&alt_stream_3_done);
-}
-
-CudaBuffers::~CudaBuffers()
-{
-    cudaStreamDestroy(alt_stream_1);
-    cudaStreamDestroy(alt_stream_2);
-    cudaStreamDestroy(alt_stream_3);
-    cudaEventDestroy(alt_stream_1_done);
-    cudaEventDestroy(alt_stream_2_done);
-    cudaEventDestroy(alt_stream_3_done);
-}
-
-CudaBuffers* get_buffers(const int device_index)
-{
-    return g_buffers[device_index];
-}
-
-void prepare_buffers_cuda
-(
-    int _device,
-    int _temp_state_size,
-    half* _temp_state,
-    half* _temp_dq
-)
-{
-    CudaBuffers* buffers = new CudaBuffers
-    (
-        _device,
-        _temp_state_size,
-        _temp_state,
-        _temp_dq
-    );
-
-    g_buffers[_device] = buffers;
-}
-
-void cleanup_buffers_cuda()
-{
-    for (int i = 0; i < CUDA_MAX_DEVICES; i++)
-    {
-        if (!g_buffers[i]) continue;
-        delete g_buffers[i];
-        g_buffers[i] = NULL;
-    }
-}
diff --git a/gptqmodel_ext/exllama/cuda_buffers.cuh b/gptqmodel_ext/exllama/cuda_buffers.cuh
deleted file mode 100644
index 0bf2057c6..000000000
--- a/gptqmodel_ext/exllama/cuda_buffers.cuh
+++ /dev/null
@@ -1,55 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _cuda_buffers_cuh
-#define _cuda_buffers_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-
-const int CUDA_MAX_DEVICES = 16;
-
-// #ifndef _cuda_buffers_cu
-// extern __constant__ half2 q4_table[16][256];
-// #endif
-
-class CudaBuffers
-{
-public:
-    int device;
-
-    half* temp_state;           // [max_hidden_rows * intermediate_size]
-    int temp_state_size;
-    half* temp_dq;              // size of largest quant tensor * 8
-
-    cudaStream_t alt_stream_1;
-    cudaStream_t alt_stream_2;
-    cudaStream_t alt_stream_3;
-    cudaEvent_t alt_stream_1_done;
-    cudaEvent_t alt_stream_2_done;
-    cudaEvent_t alt_stream_3_done;
-
-    CudaBuffers
-    (
-        int _device,
-        int _temp_state_size,
-        half* _temp_state,
-        half* _temp_dq
-    );
-    ~CudaBuffers();
-};
-
-CudaBuffers* get_buffers(const int device_index);
-
-void prepare_buffers_cuda
-(
-    int _device,
-    int _temp_state_size,
-    half* _temp_state,
-    half* _temp_dq
-);
-
-void cleanup_buffers_cuda();
-
-#endif
diff --git a/gptqmodel_ext/exllama/cuda_func/column_remap.cu b/gptqmodel_ext/exllama/cuda_func/column_remap.cu
deleted file mode 100644
index 30e4039dd..000000000
--- a/gptqmodel_ext/exllama/cuda_func/column_remap.cu
+++ /dev/null
@@ -1,63 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include "column_remap.cuh"
-#include "../util.cuh"
-
-const int SHUF_BLOCKSIZE_X = 256;
-const int SHUF_BLOCKSIZE_Y = 16;
-
-__global__ void column_remap_kernel
-(
-    const half* __restrict__ x,
-    half* __restrict__ x_new,
-    const int x_width,
-    const int x_height,
-    const uint32_t* x_map
-)
-{
-    int x_column = SHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
-    int x_row = SHUF_BLOCKSIZE_Y * blockIdx.y;
-    if (x_column >= x_width) return;
-    //if (x_row >= x_height) return;
-
-    int x_stride = x_width;
-    int x_idx = x_row * x_stride + x_column;
-
-    int x_row_end = min(x_row + SHUF_BLOCKSIZE_Y, x_height);
-    int x_idx_end = x_row_end * x_stride + x_column;
-
-    int s_column = x_map[x_column];
-    int s_idx = x_row * x_stride + s_column;
-
-    while (x_idx < x_idx_end)
-    {
-        x_new[x_idx] = x[s_idx];
-        x_idx += x_stride;
-        s_idx += x_stride;
-    }
-}
-
-// Remap columns in x to correspond to sequential group index before matmul
-//
-// perform x -> seq_x such that seq_x @ seq_w == x @ w
-
-void column_remap_cuda
-(
-    const half* x,
-    half* x_new,
-    const int x_height,
-    const int x_width,
-    const uint32_t* x_map
-)
-{
-    dim3 threads(SHUF_BLOCKSIZE_X, 1, 1);
-
-    dim3 blocks
-    (
-        (x_width + SHUF_BLOCKSIZE_X - 1) / SHUF_BLOCKSIZE_X,
-        (x_height + SHUF_BLOCKSIZE_Y - 1) / SHUF_BLOCKSIZE_Y,
-        1
-    );
-
-    column_remap_kernel<<<blocks, threads>>>(x, x_new, x_width, x_height, x_map);
-}
diff --git a/gptqmodel_ext/exllama/cuda_func/column_remap.cuh b/gptqmodel_ext/exllama/cuda_func/column_remap.cuh
deleted file mode 100644
index 6571c17d6..000000000
--- a/gptqmodel_ext/exllama/cuda_func/column_remap.cuh
+++ /dev/null
@@ -1,19 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _column_remap_cuh
-#define _column_remap_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-
-void column_remap_cuda
-(
-    const half* x,
-    half* x_new,
-    const int x_height,
-    const int x_width,
-    const uint32_t* x_map
-);
-
-#endif
\ No newline at end of file
diff --git a/gptqmodel_ext/exllama/cuda_func/q4_matmul.cu b/gptqmodel_ext/exllama/cuda_func/q4_matmul.cu
deleted file mode 100644
index 7136a1a85..000000000
--- a/gptqmodel_ext/exllama/cuda_func/q4_matmul.cu
+++ /dev/null
@@ -1,260 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include "q4_matmul.cuh"
-#include "column_remap.cuh"
-#include "../util.cuh"
-#include "../matrix.cuh"
-#include "../cu_compat.cuh"
-#include "../cuda_buffers.cuh"
-#if defined(USE_ROCM)
-#include "../hip_compat.cuh"
-#endif
-
-const int THREADS_X = 32;       // Block size and thread count along columns in w and out
-const int THREADS_Y = 1;        // Block size and thread count along rows in x and out
-
-typedef void (*fp_q4_matmul_kernel)
-(
-    const half*,
-    const uint32_t*,
-    half*,
-    const half*,
-    const uint32_t*,
-    const int,
-    const int,
-    const int,
-    const int,
-    const int,
-    const uint32_t*,
-    bool
-);
-
-template<bool use_half2, bool use_groupsize, bool use_x_map>
-__global__ void q4_matmul_kernel
-(
-    const half* __restrict__ x,
-    const uint32_t* __restrict__ w,
-    half* __restrict__ out,
-    const half* __restrict__ w_scales,
-    const uint32_t* __restrict__ w_zeros,
-    const int height,
-    const int dim,
-    const int width,
-    const int groupsize,
-    const int block_size_z,
-    const uint32_t* __restrict__ x_map,
-    bool no_zero
-)
-{
-    // Start of block
-
-    int x_column = block_size_z * blockIdx.z;
-    int x_column_end = min(dim, block_size_z * (blockIdx.z + 1));
-
-    int w_column = THREADS_X * blockIdx.x + threadIdx.x;
-    int x_row = THREADS_Y * blockIdx.y + threadIdx.y;
-
-    int iterations = (x_column_end - x_column) / 8;
-
-    // Views
-
-    MatrixView_half x_(x, height, dim);
-    MatrixView_half w_scales_(w_scales, dim / groupsize, width);
-    MatrixView_q4_row w_zeros_(w_zeros, dim / groupsize, width);
-    MatrixView_q4_column w_(w, dim, width);
-    MatrixView_half_rw out_(out, height, width);
-
-    // Zero output
-
-    if (!no_zero && blockIdx.z == 0 && (threadIdx.x & 1) == 0)
-    {
-        *((uint32_t*) out_.item_ptr(x_row, w_column)) = 0;
-        __syncthreads();
-    }
-
-    // Loop over part of x row (and w column)
-
-    half2 acc = {};
-    half acc_h = {};
-
-    if constexpr (use_groupsize)
-    {
-        // For quant matrices where groupsize divides BLOCK_SIZE_Z we always start on a group boundary, so this
-        // could be slightly faster
-
-        for (int k = x_column, group = x_column / groupsize; k < x_column + iterations * 8; group++, k += groupsize)
-        {
-            if constexpr (use_half2)
-            {
-                half2 w_scale = w_scales_.item_half2half2(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) & 0x0f;  // Avoid overflows.
-
-                if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
-                else                     acc = dot_product_8      (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
-            }
-            else
-            {
-                half w_scale = w_scales_.item(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) & 0x0f;  // Avoid overflows.
-
-                if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8, x_map);
-                else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, groupsize / 8);
-            }
-        }
-    }
-    else
-    {
-        // Otherwise assume groupsize is a multiple of 8, do 8 columns per iteration and trust the cache
-
-        for (int k = x_column; k < x_column + iterations * 8; k += 8)
-        {
-            if constexpr (use_half2)
-            {
-                int group = k / groupsize;
-                half2 w_scale = w_scales_.item_half2half2(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) & 0x0f;  // Avoid overflows.
-
-                if constexpr (use_x_map) acc = dot_product_8_x_map(acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
-                else                     acc = dot_product_8      (acc, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
-            }
-            else
-            {
-                int group = k / groupsize;
-                half w_scale = w_scales_.item(group, w_column);
-                uint32_t w_zero = w_zeros_.item(group, w_column) & 0x0f;  // Avoid overflows.
-
-                if constexpr (use_x_map) acc_h = dot_product_8_x_map_h(acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1, x_map);
-                else                     acc_h = dot_product_8_h      (acc_h, x_, x_row, k, w_, k, w_column, w_scale, w_zero, 1);
-            }
-        }
-    }
-
-    // Add to block result
-
-    if constexpr (use_half2)
-    {
-        half result = __hadd(__low2half(acc), __high2half(acc));
-        atomicAdd(out_.item_ptr(x_row, w_column), result);
-    }
-    else
-    {
-        atomicAdd(out_.item_ptr(x_row, w_column), acc_h);
-    }
-}
-
-fp_q4_matmul_kernel q4_matmul_kernel_pick(ExLlamaTuning* tuningParams, int block_size_z, int groupsize, uint32_t* x_map)
-{
-    // <bool use_half2, bool use_groupsize, bool use_x_map>
-    if (tuningParams->matmul_no_half2) {
-        if (block_size_z % groupsize == 0) {
-            if (x_map) return q4_matmul_kernel<false, true,  true >;
-            else       return q4_matmul_kernel<false, true,  false>;
-        } else {
-            if (x_map) return q4_matmul_kernel<false, false, true >;
-            else       return q4_matmul_kernel<false, false, false>;
-        }
-    } else {
-        if (block_size_z % groupsize == 0)
-        {
-            if (x_map) return q4_matmul_kernel<true,  true,  true >;
-            else       return q4_matmul_kernel<true,  true,  false>;
-        } else {
-            if (x_map) return q4_matmul_kernel<true,  false, true >;
-            else       return q4_matmul_kernel<true,  false, false>;
-        }
-    }
-};
-
-// Compute y = x @ w
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero,
-    cudaStream_t alt_stream
-)
-{
-    int height = x_height;
-    int dim = w->height;
-    int width = w->width;
-
-    cudaSetDevice(w->device);
-
-    uint32_t* x_map = w->cuda_x_map;
-    const half* x_mapped = x;
-    if (x_map && !tuningParams->matmul_fused_remap && !alt_stream)
-    {
-        CudaBuffers* buffers = get_buffers(w->device);
-        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
-        x_mapped = buffers->temp_state;
-        x_map = NULL;
-    }
-
-    int block_size_z;
-    if (w->width == 4096) block_size_z = 384;           // 7B
-    else if (w->width == 11008) block_size_z = 256;
-    else if (w->width == 5120) block_size_z = 384;      // 13B
-    else if (w->width == 13824) block_size_z = 256;
-    else if (w->width == 6656) block_size_z = 256;      // 33B
-    else if (w->width == 17920) block_size_z = 128;
-    else block_size_z = 256;
-
-    //if (!no_zero) cudaMemsetAsync(out, 0, x_height * w->width * sizeof(half));
-
-    dim3 threads(THREADS_X, THREADS_Y, 1);
-
-    dim3 blocks
-    (
-        (width + threads.x - 1) / threads.x,
-        (height + threads.y - 1) / threads.y,
-        (dim + block_size_z - 1) / block_size_z
-    );
-
-    fp_q4_matmul_kernel kernel = q4_matmul_kernel_pick(tuningParams, block_size_z, w->groupsize, x_map);
-
-    kernel<<<blocks, threads, 0, alt_stream>>> (x_mapped, w->cuda_qweight, out, w->cuda_scales, w->cuda_qzeros, height, dim, width, w->groupsize, block_size_z, x_map, no_zero);
-}
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const cublasHandle_t handle,
-    bool no_zero
-)
-{
-    int height = x_height;
-    int dim = w->height;
-    int width = w->width;
-
-    cudaSetDevice(w->device);
-    CudaBuffers* buffers = get_buffers(w->device);
-
-    const half* x_mapped = x;
-    if (w->cuda_x_map)
-    {
-        TORCH_CHECK(buffers->temp_state_size >= x_height * dim, "The temp_state buffer is too small in the exllama backend for GPTQ with act-order. Please call the exllama_set_max_input_length function to increase the buffer size for a sequence length >=", x_height, ":\nfrom gptqmodel import exllama_set_max_input_length\nmodel = exllama_set_max_input_length(model, max_input_length=", x_height, ")");
-        column_remap_cuda(x, buffers->temp_state, x_height, dim, w->cuda_x_map);
-        x_mapped = buffers->temp_state;
-    }
-
-    w->reconstruct(buffers->temp_dq);
-
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 700
-    const float alpha = 1.0f;
-    const float beta = no_zero ? 1.0f : 0.0f;
-    cublasSgemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, CUDA_R_16F, width,
-                  x_mapped, CUDA_R_16F, dim, &beta, out, CUDA_R_16F, width);
-#else
-    const half alpha = __float2half(1.0f);
-    const half beta = no_zero ? __float2half(1.0f) : __float2half(0.0f);
-    cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, width, height, dim, &alpha, buffers->temp_dq, width, x_mapped, dim, &beta, out, width);
-#endif
-}
diff --git a/gptqmodel_ext/exllama/cuda_func/q4_matmul.cuh b/gptqmodel_ext/exllama/cuda_func/q4_matmul.cuh
deleted file mode 100644
index 49967648f..000000000
--- a/gptqmodel_ext/exllama/cuda_func/q4_matmul.cuh
+++ /dev/null
@@ -1,43 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matmul_cuh
-#define _q4_matmul_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include <ATen/cuda/CUDAContext.h>
-
-#include "q4_matrix.cuh"
-#include "../tuning.h"
-
-// Workaround for hipify_python using rocblas instead of hipblas.
-#if defined(USE_ROCM)
-#include <hipblas/hipblas.h>
-#define rocblas_handle hipblasHandle_t
-#endif
-
-void q4_matmul_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    const Q4Matrix* w,
-    half* out,
-    bool no_zero = false,
-    cudaStream_t alt_stream = NULL
-);
-
-void q4_matmul_recons_cuda
-(
-    ExLlamaTuning* tuningParams,
-    const half* x,
-    const int x_height,
-    Q4Matrix* w,
-    half* out,
-    const cublasHandle_t handle,
-    bool no_zero = false
-);
-
-#endif
diff --git a/gptqmodel_ext/exllama/cuda_func/q4_matrix.cu b/gptqmodel_ext/exllama/cuda_func/q4_matrix.cu
deleted file mode 100644
index d6e441ae6..000000000
--- a/gptqmodel_ext/exllama/cuda_func/q4_matrix.cu
+++ /dev/null
@@ -1,225 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include "q4_matrix.cuh"
-#include <vector>
-#include "../util.cuh"
-#include "../matrix.cuh"
-
-using namespace std;
-
-const int UNSHUF_BLOCKSIZE_X = 64;
-
-const int RECONS_THREADS_X = 64;      // Block size and thread count along columns in out, each thread converts 1 column
-const int RECONS_THREADS_Y = 1;       // Block size and thread count along rows in x and out, each thread converts 8 rows
-
-vector<Q4Matrix*> g_q4_matrices;
-
-void g_q4_keep_matrix(Q4Matrix* m)
-{
-    g_q4_matrices.push_back(m);
-}
-
-void g_q4_free_matrices()
-{
-    for (const auto& m : g_q4_matrices) delete m;
-    g_q4_matrices.clear();
-}
-
-Q4Matrix::Q4Matrix
-(
-    const int _height,
-    const int _width,
-    const int _groups,
-
-    uint32_t* _qweight,
-    uint32_t* _qzeros,
-    half* _scales,
-    uint32_t* _g_idx,
-
-    const int _device
-) :
-    height(_height),
-    width(_width),
-    groups(_groups),
-    device(_device)
-{
-    cudaSetDevice(device);
-
-    cuda_qweight = _qweight;
-    cuda_qzeros = _qzeros;
-    cuda_scales = _scales;
-
-    groupsize = height / groups;
-
-    if (_g_idx) make_sequential(_g_idx);
-}
-
-Q4Matrix::~Q4Matrix()
-{
-}
-
-// Make sequential
-
-__global__ void make_sequential_kernel
-(
-    const uint32_t* __restrict__ w,
-    uint32_t* __restrict__ w_new,
-    const uint32_t* __restrict__ x_map,
-    const int w_height,
-    const int w_width
-)
-{
-    const uint64_t* w2 = (uint64_t*) w;
-    uint64_t* w_new2 = (uint64_t*) w_new;
-    int w2_stride = w_width >> 1;
-
-    int w2_column = UNSHUF_BLOCKSIZE_X * blockIdx.x + threadIdx.x;
-    if (w2_column >= w2_stride) return;
-
-    int w_new2_row = blockIdx.y;
-
-    int x_map_idx = w_new2_row << 3;
-
-    uint64_t dst = 0;
-
-    #pragma unroll
-    for (int i = 0; i < 8; i++)
-    {
-        int source_row = x_map[x_map_idx++];
-
-        int w2_row = source_row >> 3;
-        int w2_subrow = source_row & 0x07;
-        int w2_row_shift = w2_subrow << 2;
-        int wnew2_row_shift = i << 2;
-
-        uint64_t src = w2[w2_row * w2_stride + w2_column];
-        src >>= w2_row_shift;
-        src &= 0x0000000f0000000f;
-        src <<= wnew2_row_shift;
-        dst |= src;
-    }
-
-    w_new2[w_new2_row * w2_stride + w2_column] = dst;
-}
-
-void Q4Matrix::make_sequential(const uint32_t* cpu_g_idx)
-{
-    uint32_t* cuda_new_qweight = NULL;
-    cudaMalloc(&cuda_new_qweight, height / 8 * width * sizeof(uint32_t));
-    cudaMalloc(&cuda_x_map, height * sizeof(uint32_t));  // TODO: Should probably be allocated in PyTorch
-
-    uint32_t* cpu_g_idx_map = (uint32_t*) calloc(groups, sizeof(uint32_t));
-    uint32_t* cpu_x_map = (uint32_t*) malloc(height * sizeof(uint32_t));
-    uint32_t* cpu_x_map_inv = (uint32_t*) malloc(height * sizeof(uint32_t));
-
-    // Group histogram
-
-    for (int i = 0; i < height; i++) cpu_g_idx_map[cpu_g_idx[i]]++;
-
-    // Group map
-
-    for (int i = 0, acc = 0; i < groups; i++)
-    {
-        short tmp = cpu_g_idx_map[i];
-        cpu_g_idx_map[i] = acc;
-        acc += tmp;
-    }
-
-    // X map (inverse)
-
-    for (int row = 0; row < height; row++)
-    {
-        uint32_t target_group = cpu_g_idx[row];
-        uint32_t target_row = cpu_g_idx_map[target_group];
-        cpu_g_idx_map[target_group]++;
-        cpu_x_map_inv[row] = target_row;
-    }
-
-    // X map
-
-    for (int row = 0; row < height; row++) cpu_x_map[cpu_x_map_inv[row]] = row;
-
-    // Move to CUDA
-
-    cudaMemcpyAsync(cuda_x_map, cpu_x_map, height * sizeof(uint32_t), cudaMemcpyHostToDevice);
-
-    // Rearrange rows in w
-
-    dim3 threads(UNSHUF_BLOCKSIZE_X, 1, 1);
-    dim3 blocks
-    (
-        (width + UNSHUF_BLOCKSIZE_X * 2 - 1) / (UNSHUF_BLOCKSIZE_X * 2),
-        height / 8,
-        1
-    );
-
-    make_sequential_kernel<<<blocks, threads>>>(cuda_qweight, cuda_new_qweight, cuda_x_map, height / 8, width);
-
-    // Replace qweights
-
-    cudaMemcpyAsync(cuda_qweight, cuda_new_qweight, height / 8 * width * sizeof(uint32_t), cudaMemcpyDeviceToDevice);
-
-    // Cleanup
-
-    cudaDeviceSynchronize();
-    cudaFree(cuda_new_qweight);
-    free(cpu_g_idx_map);
-    free(cpu_x_map);
-    free(cpu_x_map_inv);
-}
-
-__global__ void reconstruct_kernel
-(
-    const uint32_t* __restrict__ w,
-    half* __restrict__ out,  // (y)
-    const half* __restrict__ w_scales,
-    const uint32_t* __restrict__ w_zeros,
-    const int height,
-    const int width,
-    const int groupsize
-)
-{
-    // Start of block
-
-    int column = RECONS_THREADS_X * blockIdx.x + threadIdx.x;
-    int row = (RECONS_THREADS_Y * blockIdx.y + threadIdx.y) * 8;
-    if (column >= width) return;
-    
-    // Views
-
-    MatrixView_q4_column w_(w, height, width);
-    MatrixView_half_rw out_(out, height, width);
-    MatrixView_half w_scales_(w_scales, height / groupsize, width);
-    MatrixView_q4_row w_zeros_(w_zeros, height / groupsize, width);
-
-    // Groupsize version
-
-    int group = row / groupsize;
-
-    half w_scale = w_scales_.item(group, column);
-    uint32_t w_zero = w_zeros_.item(group, column) & 0x0f;
-
-    uint32_t w_read = w_.item_uint32_t(row, column);
-    half* out_ptr = out_.item_ptr(row, column);
-
-    #pragma unroll
-    for (int s = 0; s < 32; s += 4)
-    {
-        half w_item = __hmul(__int2half_rn((int)((w_read >> s) & 0x0f) - w_zero), w_scale);
-        *out_ptr = w_item; out_ptr += out_.width;
-    }
-}
-
-void Q4Matrix::reconstruct(half* out)
-{
-    dim3 threads(RECONS_THREADS_X, RECONS_THREADS_Y, 1);
-
-    dim3 blocks
-    (
-        (width + threads.x - 1) / threads.x,
-        (height / 8 + threads.y - 1) / threads.y,
-        1
-    );
-
-    reconstruct_kernel<<<blocks, threads>>>(cuda_qweight, out, cuda_scales, cuda_qzeros, height / 8, width, groupsize);
-}
diff --git a/gptqmodel_ext/exllama/cuda_func/q4_matrix.cuh b/gptqmodel_ext/exllama/cuda_func/q4_matrix.cuh
deleted file mode 100644
index 50cb72a41..000000000
--- a/gptqmodel_ext/exllama/cuda_func/q4_matrix.cuh
+++ /dev/null
@@ -1,53 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _q4_matrix_cuh
-#define _q4_matrix_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-
-class Q4Matrix
-{
-public:
-
-    int device;
-
-    int height;
-    int width;
-    int groups;
-    int groupsize;
-
-    uint32_t* cuda_qweight = NULL;
-    uint32_t* cuda_qzeros = NULL;
-    half* cuda_scales = NULL;
-    uint32_t* cuda_x_map = NULL;
-
-    Q4Matrix
-    (
-        const int _height,
-        const int _width,
-        const int _groups,
-
-        uint32_t* _qweight,
-        uint32_t* _qzeros,
-        half* _scales,
-        uint32_t* _g_idx,
-
-        const int _device
-    );
-
-    ~Q4Matrix();
-
-    void reconstruct(half* out);
-
-private:
-
-    void make_sequential(const uint32_t* cpu_g_idx);
-
-};
-
-void g_q4_keep_matrix(Q4Matrix* m);
-void g_q4_free_matrices();
-
-#endif
\ No newline at end of file
diff --git a/gptqmodel_ext/exllama/exllama_ext.cpp b/gptqmodel_ext/exllama/exllama_ext.cpp
deleted file mode 100644
index 020fee4e1..000000000
--- a/gptqmodel_ext/exllama/exllama_ext.cpp
+++ /dev/null
@@ -1,260 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#include <torch/extension.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <ATen/cuda/CUDAContext.h>
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-#include "util.cuh"
-#include "tuning.h"
-#include "cuda_buffers.cuh"
-#include "cuda_func/q4_matrix.cuh"
-#include "cuda_func/q4_matmul.cuh"
-#include "cuda_func/column_remap.cuh"
-
-#include <typeinfo>
-#include <limits>
-#include <algorithm>
-
-// Check CUDA return code. We don't want to include Torch headers in the .cu files because parsing them adds almost a
-// minute to the compile time on a 12900K. Also passing exceptions back to Python is super tricky, so in place of
-// exceptions, CUDA functions return with a cudaError_t which we can parse and dump to the console.
-
-void check_cuda(cudaError_t ret)
-{
-    switch (ret)
-    {
-        case cudaSuccess:
-            break;
-
-        case cudaUnspecified:
-            printf(" **** Unspecified error\n");
-            TORCH_CHECK(false, "CUDA error");
-            break;
-
-        default:
-            printf(" **** CUDA error\n"); \
-            printf(" **** %s\n", cudaGetErrorString(ret)); \
-            TORCH_CHECK(false, "CUDA error"); \
-            break;
-    }
-}
-
-// Some decluttering macros
-
-#define STRINGIFY_(__x) #__x
-#define STRINGIFY(__x) STRINGIFY_(__x)
-#define TORCH_CHECK_DTYPE(__x, __dtype) TORCH_CHECK((__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_DTYPE_OPT(__x, __dtype) TORCH_CHECK((__x).device().is_meta() || (__x).dtype() == torch::__dtype, #__x " is incorrect datatype, must be " #__dtype)
-#define TORCH_CHECK_SHAPES(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPES_OPT(__x, __dim_x, __y, __dim_y, __scale_y) TORCH_CHECK((__x).device().is_meta() || (__x).size(__dim_x) == (__y).size(__dim_y) * __scale_y, #__x " and " #__y " have incompatible shapes")
-#define TORCH_CHECK_SHAPE_MOD(__x, __dim_x, __mod) TORCH_CHECK((__x).size(__dim_x) % __mod == 0, #__x ".shape[" STRINGIFY(__dim_x) "] must be a multiple of " STRINGIFY(__mod))
-#define TORCH_CHECK_BUFFER_SIZE(__buffer, __minimum_size) TORCH_CHECK((__buffer).numel() >= __minimum_size, #__buffer " is too small")
-
-#define TORCH_CHECK_DEVICE_INDEX(__index) \
-do { \
-    TORCH_CHECK(__index >= 0, "no device index"); \
-    TORCH_CHECK(__index < CUDA_MAX_DEVICES, "invalid device index"); \
-} while(0)
-
-#define TORCH_CHECK_QUANT(__w, __w_scales, __w_zeros, __seq_g_idx, __x_map) \
-do { \
-    TORCH_CHECK_DTYPE(__w, kInt); \
-    TORCH_CHECK_DTYPE(__w_scales, kHalf); \
-    TORCH_CHECK_DTYPE(__w_zeros, kInt); \
-    TORCH_CHECK_DTYPE_OPT(__seq_g_idx, kShort); \
-    TORCH_CHECK_DTYPE_OPT(__x_map, kInt); \
-    TORCH_CHECK_SHAPES_OPT(__seq_g_idx, 0, __w, 0, 2 * 8); \
-    TORCH_CHECK_SHAPES_OPT(__x_map, 0, __w, 0, 8); \
-} while(0)
-
-int get_groupsize(torch::Tensor w, torch::Tensor w_zeros)
-{
-    int groupsize = w.size(0) * 8 / w_zeros.size(0);
-    TORCH_CHECK(groupsize * w_zeros.size(0) == w.size(0) * 8, "w.shape[-2] must be a multiple of zeros.shape[-2]")
-    return groupsize;
-}
-
-
-// Tuning parameters
-
-ExLlamaTuning tuningParams;
-
-void set_tuning_params
-(
-    int matmul_recons_thd,
-    bool matmul_fused_remap,
-    bool matmul_no_half2
-)
-{
-    tuningParams.matmul_recons_thd = matmul_recons_thd;
-    tuningParams.matmul_fused_remap = matmul_fused_remap;
-    tuningParams.matmul_no_half2 = matmul_no_half2;
-}
-
-
-// Release all unmanaged objects allocated by the extension
-
-void cleanup()
-{
-    cleanup_buffers_cuda();
-    g_q4_free_matrices();
-}
-
-
-// Prepare buffers for forward pass
-
-void prepare_buffers
-(
-    torch::Device device,
-    torch::Tensor temp_state,
-    torch::Tensor temp_dq
-)
-{
-    int device_index = device.index();
-    TORCH_CHECK_DEVICE_INDEX(device_index);
-    const at::cuda::OptionalCUDAGuard device_guard(device);
-    const long max_int = std::numeric_limits<int>::max();
-
-    prepare_buffers_cuda
-    (
-        device_index,
-        // buffer size used for sanity checks
-        std::clamp((long)temp_state.numel(), (long)0, max_int),
-        (half*) temp_state.data_ptr(),
-        (half*) temp_dq.data_ptr()
-    );
-}
-
-
-// Create Q4Matrix, return handle
-
-uintptr_t make_q4
-(
-    torch::Tensor qweight,
-    torch::Tensor qzeros,
-    torch::Tensor scales,
-    torch::Tensor g_idx,
-    int device
-)
-{
-    TORCH_CHECK_DTYPE(qweight, kInt);
-    TORCH_CHECK_DTYPE(qzeros, kInt);
-    TORCH_CHECK_DTYPE(scales, kHalf);
-    TORCH_CHECK_DTYPE_OPT(g_idx, kInt);
-    TORCH_CHECK_SHAPES(qweight, 1, qzeros, 1, 8);
-    TORCH_CHECK_SHAPES(scales, 1, qweight, 1, 1);
-    TORCH_CHECK_SHAPES(qzeros, 0, scales, 0, 1);
-
-    int width = qweight.size(1);
-    int height = qweight.size(0) * 8;
-    int groups = qzeros.size(0);
-
-    Q4Matrix* m = new Q4Matrix
-    (
-        height,
-        width,
-        groups,
-
-        (uint32_t*) qweight.data_ptr(),
-        (uint32_t*) qzeros.data_ptr(),
-        (half*) scales.data_ptr(),
-        g_idx.device().is_meta() ? NULL : (uint32_t*) g_idx.data_ptr(),
-
-        device
-    );
-
-    g_q4_keep_matrix(m);
-    return reinterpret_cast<uintptr_t> (m);
-}
-
-
-// Matmul half @ quant -> half
-
-void q4_matmul
-(
-    torch::Tensor x,
-    uintptr_t w,
-    torch::Tensor out
-)
-{
-    Q4Matrix* wm = reinterpret_cast<Q4Matrix*> (w);
-
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(out, kHalf);
-    TORCH_CHECK_SHAPES(x, 0, out, 0, 1);
-    TORCH_CHECK(wm->height == x.size(-1), "x and w have incompatible shapes")
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    int x_height = x.size(0);
-
-    if (tuningParams.matmul_recons_thd == 0 || x_height < tuningParams.matmul_recons_thd)
-    {
-        q4_matmul_cuda
-        (
-            &tuningParams,
-            (half*) x.data_ptr(),
-            x_height,
-            wm,
-            (half*) out.data_ptr()
-        );
-    }
-    else
-    {
-        q4_matmul_recons_cuda
-        (
-            &tuningParams,
-            (half*) x.data_ptr(),
-            x_height,
-            wm,
-            (half*) out.data_ptr(),
-            at::cuda::getCurrentCUDABlasHandle()
-        );
-    }
-}
-
-
-// Remap columns in half tensor
-
-void column_remap
-(
-    torch::Tensor x,
-    torch::Tensor x_new,
-    torch::Tensor x_map
-)
-{
-    TORCH_CHECK_DTYPE(x, kHalf);
-    TORCH_CHECK_DTYPE(x_new, kHalf);
-    TORCH_CHECK_DTYPE(x_map, kInt);
-    TORCH_CHECK_SHAPES(x_map, 0, x, 1, 1);
-
-    int height = x.size(0);
-    int width = x.size(1);
-
-    TORCH_CHECK_BUFFER_SIZE(x_new, height * width);
-
-    const at::cuda::OptionalCUDAGuard device_guard(device_of(x));
-
-    column_remap_cuda
-    (
-        (half*) x.data_ptr(),
-        (half*) x_new.data_ptr(),
-        height,
-        width,
-        (uint32_t*) x_map.data_ptr()
-    );
-}
-
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
-{
-    m.def("set_tuning_params", &set_tuning_params, "set_tuning_params");
-    m.def("prepare_buffers", &prepare_buffers, "prepare_buffers");
-    m.def("cleanup", &cleanup, "cleanup");
-    m.def("make_q4", &make_q4, "make_q4");
-    m.def("q4_matmul", &q4_matmul, "q4_matmul");
-    m.def("cleanup_buffers_cuda", &cleanup_buffers_cuda, "cleanup_buffers_cuda");
-}
diff --git a/gptqmodel_ext/exllama/hip_compat.cuh b/gptqmodel_ext/exllama/hip_compat.cuh
deleted file mode 100644
index 680274787..000000000
--- a/gptqmodel_ext/exllama/hip_compat.cuh
+++ /dev/null
@@ -1,53 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _hip_compat_cuh
-#define _hip_compat_cuh
-
-// Workaround for a bug in hipamd, backported from upstream, this is fixed in ROCm 5.6.
-__device__ __forceinline__ __half __compat_hrcp(__half x) {
-    return __half_raw{
-        static_cast<_Float16>(__builtin_amdgcn_rcph(static_cast<__half_raw>(x).data))};
-}
-
-// ROCm 6.0 compatible from: /opt/rocm-6.0.0/include/hip/amd_detail/amd_hip_fp16.h:1708
-__device__ __forceinline__ __half2 __compat_h2rcp(__half2 x) {
-    return _Float16_2{
-        _Float16_2{static_cast<_Float16>(1.0f), 
-            static_cast<_Float16>(1.0f)} / x.data};
-}
-
-#define hrcp __compat_hrcp
-#define h2rcp __compat_h2rcp
-
-// Automatic conversion of hipblasHgemm doesn't convert half to hipblasHalf.
-__host__ __forceinline__ hipblasStatus_t __compat_hipblasHgemm(hipblasHandle_t    handle,
-                                                               hipblasOperation_t transA,
-                                                               hipblasOperation_t transB,
-                                                               int                m,
-                                                               int                n,
-                                                               int                k,
-                                                               const half*        alpha,
-                                                               const half*        AP,
-                                                               int                lda,
-                                                               const half*        BP,
-                                                               int                ldb,
-                                                               const half*        beta,
-                                                               half*              CP,
-                                                               int                ldc) {
-    return hipblasHgemm(handle, transA, transB, m, n, k,
-                        reinterpret_cast<const hipblasHalf *>(alpha),
-                        reinterpret_cast<const hipblasHalf *>(AP), lda,
-                        reinterpret_cast<const hipblasHalf *>(BP), ldb,
-                        reinterpret_cast<const hipblasHalf *>(beta),
-                        reinterpret_cast<hipblasHalf *>(CP), ldc);
-}
-#define hipblasHgemm __compat_hipblasHgemm
-
-// Previous version of PyTorch were converting to rocBLAS instead of hipBLAS.
-#define rocblas_handle hipblasHandle_t
-#define rocblas_operation_none HIPBLAS_OP_N
-#define rocblas_get_stream hipblasGetStream
-#define rocblas_set_stream hipblasSetStream
-#define rocblas_hgemm __compat_hipblasHgemm
-
-#endif
diff --git a/gptqmodel_ext/exllama/matrix.cuh b/gptqmodel_ext/exllama/matrix.cuh
deleted file mode 100644
index 2fd5ab0b3..000000000
--- a/gptqmodel_ext/exllama/matrix.cuh
+++ /dev/null
@@ -1,294 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _matrix_cuh
-#define _matrix_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-
-class MatrixView_half
-{
-public:
-    const half* data;
-    const int height;
-    const int width;
-
-    __device__ __forceinline__ MatrixView_half(const half* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-
-    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
-    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ __forceinline__ half2 item_half2half2(int row, int column) const { return __half2half2(data[row * width + column]); }
-    __device__ __forceinline__ const half* item_ptr(int row, int column) const { return &data[row * width + column]; }
-};
-
-class MatrixView_half_rw
-{
-public:
-    half* data;
-    const int height;
-    const int width;
-
-    __device__ __forceinline__ MatrixView_half_rw(half* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-
-    __device__ __forceinline__ half item(int row, int column) const { return data[row * width + column]; }
-    __device__ __forceinline__ half2 item_half2(int row, int column) const { return ((half2*)data)[(row * width + column) / 2]; }
-    __device__ __forceinline__ half* item_ptr(int row, int column) { return &data[row * width + column]; }
-    __device__ __forceinline__ void set(int row, int column, half value) { data[row * width + column] = value; }
-    __device__ __forceinline__ void set_half2(int row, int column, half2 value) { ((half2*)data)[(row * width + column) / 2] = value; }
-};
-
-class MatrixView_q4_row
-{
-public:
-    const uint32_t* data;
-    const int height;
-    const int width;
-
-    __device__ __forceinline__ MatrixView_q4_row(const uint32_t* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-
-    __device__ __forceinline__ int item(int row, int column) const
-    {
-        int shift = (column & 0x07) * 4;
-        return (data[row * width / 8 + column / 8] >> shift) & 0x0f;
-    }
-};
-
-class MatrixView_q4_column
-{
-public:
-    const uint32_t* data;
-    const int height;
-    const int width;
-
-    __device__ __forceinline__ MatrixView_q4_column(const uint32_t* data, const int height, const int width)
-        : data(data), height(height), width(width)
-    { }
-
-    __device__ __forceinline__ int item(int row, int column) const
-    {
-        int shift = (row & 0x07) * 4;
-        return (data[row / 8 * width + column] >> shift) & 0x0f;
-    }
-
-    __device__ __forceinline__ uint32_t item_uint32_t(int row, int column) { return data[row / 8 * width + column]; }
-    __device__ __forceinline__ const uint32_t* item_uint32_ptr(int row, int column) { return &data[row / 8 * width + column]; }
-};
-
-// TODO: Rewrite all these dot product functions using functors or something, move to q4_matmul.cu
-
-// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale
-
-__device__ __forceinline__ half2 dot_product_8
-(
-    const half2 acc,
-    MatrixView_half& h_,
-    const int h_row,
-    const int h_column,                 // divisible by 8
-    MatrixView_q4_column& v_,
-    const int v_row,                    // divisible by 8
-    const int v_column,
-    const half2 v_scale_2,
-    const uint32_t v_zero,              // + 1 (!!)
-    const int count
-)
-{
-    const half2* h_ptr = (const half2*) h_.item_ptr(h_row, h_column);
-    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
-    half2 result = acc;
-
-    for (int i = 0; i < count; i++)
-    {
-        uint32_t v_read = *v_ptr; v_ptr += v_.width;
-
-        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
-        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
-        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
-        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
-        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
-        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
-        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
-        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
-
-        half2 v_01 = __halves2half2(v_0, v_1);
-        half2 v_23 = __halves2half2(v_2, v_3);
-        half2 v_45 = __halves2half2(v_4, v_5);
-        half2 v_67 = __halves2half2(v_6, v_7);
-
-//         half2 v_01 = q4_table[v_zero - 1][(v_read      ) & 0xff]; // (constant memory is too slow apparently)
-//         half2 v_23 = q4_table[v_zero - 1][(v_read >>  8) & 0xff];
-//         half2 v_45 = q4_table[v_zero - 1][(v_read >> 16) & 0xff];
-//         half2 v_67 = q4_table[v_zero - 1][(v_read >> 24)       ];
-
-        half2 tmp = __hmul2(*h_ptr++, v_01);
-        tmp = __hfma2(*h_ptr++, v_23, tmp);
-        tmp = __hfma2(*h_ptr++, v_45, tmp);
-        tmp = __hfma2(*h_ptr++, v_67, tmp);
-        result = __hfma2(v_scale_2, tmp, result);
-    }
-
-    return result;
-}
-
-__device__ __forceinline__ half dot_product_8_h
-(
-    const half acc,
-    MatrixView_half& h_,
-    const int h_row,
-    const int h_column,                 // divisible by 8
-    MatrixView_q4_column& v_,
-    const int v_row,                    // divisible by 8
-    const int v_column,
-    const half v_scale,
-    const uint32_t v_zero,              // + 1 (!!)
-    const int count
-)
-{
-    const half* h_ptr = h_.item_ptr(h_row, h_column);
-    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
-    half result = acc;
-
-    for (int i = 0; i < count; i++)
-    {
-        uint32_t v_read = *v_ptr; v_ptr += v_.width;
-
-        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
-        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
-        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
-        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
-        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
-        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
-        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
-        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
-
-        half tmp = __hmul(*h_ptr++, v_0);
-        tmp = __hfma(*h_ptr++, v_1, tmp);
-        tmp = __hfma(*h_ptr++, v_2, tmp);
-        tmp = __hfma(*h_ptr++, v_3, tmp);
-        tmp = __hfma(*h_ptr++, v_4, tmp);
-        tmp = __hfma(*h_ptr++, v_5, tmp);
-        tmp = __hfma(*h_ptr++, v_6, tmp);
-        tmp = __hfma(*h_ptr++, v_7, tmp);
-        result = __hfma(v_scale, tmp, result);
-    }
-
-    return result;
-}
-
-// Accumulated dot product of 8-element row vectors in h and quantized column vectors in v, constant zero/scale, with x_map
-
-__device__ __forceinline__ half2 dot_product_8_x_map
-(
-    const half2 acc,
-    MatrixView_half& h_,
-    const int h_row,
-    const int h_column,                 // divisible by 8
-    MatrixView_q4_column& v_,
-    const int v_row,                    // divisible by 8
-    const int v_column,
-    const half2 v_scale_2,
-    const uint32_t v_zero,              // + 1 (!!)
-    const int count,
-    const uint32_t* x_map
-)
-{
-    const half* h_ptr = h_.item_ptr(h_row, 0);
-    const uint32_t* x_map_ptr = x_map + h_column;
-    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
-    half2 result = acc;
-
-    for (int i = 0; i < count; i++)
-    {
-        uint32_t v_read = *v_ptr; v_ptr += v_.width;
-
-        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
-        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
-        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
-        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
-        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
-        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
-        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
-        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
-
-        half2 v_01 = __halves2half2(v_0, v_1);
-        half2 v_23 = __halves2half2(v_2, v_3);
-        half2 v_45 = __halves2half2(v_4, v_5);
-        half2 v_67 = __halves2half2(v_6, v_7);
-
-        half h_0 = h_ptr[*x_map_ptr++];
-        half h_1 = h_ptr[*x_map_ptr++];
-        half h_2 = h_ptr[*x_map_ptr++];
-        half h_3 = h_ptr[*x_map_ptr++];
-        half h_4 = h_ptr[*x_map_ptr++];
-        half h_5 = h_ptr[*x_map_ptr++];
-        half h_6 = h_ptr[*x_map_ptr++];
-        half h_7 = h_ptr[*x_map_ptr++];
-
-        half2 h_01 = __halves2half2(h_0, h_1);
-        half2 h_23 = __halves2half2(h_2, h_3);
-        half2 h_45 = __halves2half2(h_4, h_5);
-        half2 h_67 = __halves2half2(h_6, h_7);
-
-        half2 tmp = __hmul2(h_01, v_01);
-        tmp = __hfma2(h_23, v_23, tmp);
-        tmp = __hfma2(h_45, v_45, tmp);
-        tmp = __hfma2(h_67, v_67, tmp);
-        result = __hfma2(v_scale_2, tmp, result);
-    }
-
-    return result;
-}
-
-__device__ __forceinline__ half dot_product_8_x_map_h
-(
-    const half acc,
-    MatrixView_half& h_,
-    const int h_row,
-    const int h_column,                 // divisible by 8
-    MatrixView_q4_column& v_,
-    const int v_row,                    // divisible by 8
-    const int v_column,
-    const half v_scale,
-    const uint32_t v_zero,              // + 1 (!!)
-    const int count,
-    const uint32_t* x_map
-)
-{
-    const half* h_ptr = h_.item_ptr(h_row, 0);
-    const uint32_t* x_map_ptr = x_map + h_column;
-    const uint32_t* v_ptr = (const uint32_t*) v_.item_uint32_ptr(v_row, v_column);
-    half result = acc;
-
-    for (int i = 0; i < count; i++)
-    {
-        uint32_t v_read = *v_ptr; v_ptr += v_.width;
-
-        half v_0 = __int2half_rn((int)((v_read      ) & 0x0f) - v_zero);
-        half v_1 = __int2half_rn((int)((v_read >>  4) & 0x0f) - v_zero);
-        half v_2 = __int2half_rn((int)((v_read >>  8) & 0x0f) - v_zero);
-        half v_3 = __int2half_rn((int)((v_read >> 12) & 0x0f) - v_zero);
-        half v_4 = __int2half_rn((int)((v_read >> 16) & 0x0f) - v_zero);
-        half v_5 = __int2half_rn((int)((v_read >> 20) & 0x0f) - v_zero);
-        half v_6 = __int2half_rn((int)((v_read >> 24) & 0x0f) - v_zero);
-        half v_7 = __int2half_rn((int)((v_read >> 28)       ) - v_zero);
-
-        half tmp = __hmul(h_ptr[*x_map_ptr++], v_0);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_1, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_2, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_3, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_4, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_5, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_6, tmp);
-        tmp = __hfma(h_ptr[*x_map_ptr++], v_7, tmp);
-        result = __hfma(v_scale, tmp, result);
-    }
-
-    return result;
-}
-
-#endif
diff --git a/gptqmodel_ext/exllama/tuning.h b/gptqmodel_ext/exllama/tuning.h
deleted file mode 100644
index 770ca46aa..000000000
--- a/gptqmodel_ext/exllama/tuning.h
+++ /dev/null
@@ -1,13 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _tuning_h
-#define _tuning_h
-
-struct ExLlamaTuning
-{
-    int matmul_recons_thd;
-    bool matmul_fused_remap;
-    bool matmul_no_half2;
-};
-
-#endif
diff --git a/gptqmodel_ext/exllama/util.cuh b/gptqmodel_ext/exllama/util.cuh
deleted file mode 100644
index 7b3975732..000000000
--- a/gptqmodel_ext/exllama/util.cuh
+++ /dev/null
@@ -1,33 +0,0 @@
-// Adapted from turboderp exllama: https://github.com/turboderp/exllama
-
-#ifndef _util_cuh
-#define _util_cuh
-
-#include <cuda_runtime.h>
-#include <cuda_fp16.h>
-#include <cstdint>
-#include <cstdio>
-
-#if defined(USE_ROCM)
-#define cudaUnspecified hipErrorUnknown
-#else
-#define cudaUnspecified cudaErrorApiFailureBase
-#endif
-
-// React to failure on return code != cudaSuccess
-
-#define _cuda_check(fn) \
-do { \
-    {_cuda_err = fn;} \
-    if (_cuda_err != cudaSuccess) goto _cuda_fail; \
-} while(false)
-
-// React to failure on return code == 0
-
-#define _alloc_check(fn) \
-do { \
-    if (!(fn)) { _cuda_err = cudaUnspecified; goto _cuda_fail; } \
-    else _cuda_err = cudaSuccess; \
-} while(false)
-
-#endif
diff --git a/gptqmodel_ext/exllama_eora/test_actual_value.py b/gptqmodel_ext/exllama_eora/test_actual_value.py
deleted file mode 100644
index 8d2291df9..000000000
--- a/gptqmodel_ext/exllama_eora/test_actual_value.py
+++ /dev/null
@@ -1,276 +0,0 @@
-# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
-# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
-# SPDX-License-Identifier: Apache-2.0
-# Contact: qubitium@modelcloud.ai, x.com/qubitium
-
-import math
-
-import gptqmodel_exllama_eora
-import torch
-# from eora_test import fused_concurrent, fused_sequential, cublas_reference, gptq_gemm_eora, gptq_gemm
-from gptqmodel_exllama_eora import gptq_gemm, gptq_gemm_eora
-from gptqmodel_exllama_kernels import make_q4, q4_matmul
-from safetensors import safe_open
-
-# model_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/"
-# lora_path = "/monster/data/model/sliuau-llama3.2-1b-4bit-group128/llama3.2-1b-4bit-group128-eora-rank128-arc/adapter_model.safetensors"
-
-target =  'model.layers.6.self_attn.q_proj'
-eora_tensors = {}
-with safe_open("/mnt/home/shihyangl/llama3.2-1b-4bit-group128-eora-rank128-arc/adapter_model.safetensors", framework="pt", device=0) as f:
-    for k in f.keys():
-        if target in k:
-            eora_tensors[k] = f.get_tensor(k)
-# print(eora_tensors)
-
-qw_tensors = {}
-with safe_open("/mnt/home/shihyangl/llama3.2-1b-4bit-group128/model.safetensors", framework="pt", device=0) as f:
-    for k in f.keys():
-        if target in k:
-            qw_tensors[k] = f.get_tensor(k)
-
-
-
-
-m = 1
-k = eora_tensors[f'{target}.lora_A.weight'].shape[1]
-n = eora_tensors[f'{target}.lora_B.weight'].shape[0]
-r = 128
-
-
-
-bit = 4
-use_exllama = True
-
-x = torch.rand((m, k), device='cuda', dtype=torch.float16)
-eora_a = eora_tensors[f'{target}.lora_A.weight'].to('cuda:0').T
-eora_b = torch.clone(eora_tensors[f'{target}.lora_B.weight'].T, memory_format=torch.contiguous_format)
-# torch.zeros((r, n),device='cuda'
-# eora_b.data = torch.transpose(eora_tensors[f'{target}.lora_B.weight'], 0, 1)
-# eora_b = eora_tensors[f'{target}.lora_B.weight'].to('cuda:0').T
-
-
-
-# print(eora_b)
-# print(eora_b)
-
-# eora_b = torch.rand((r, n), device='cuda', dtype=torch.float16) / (100 * 4)
-# eora_b = torch.normal(-2.7120113372802734e-05, 0.0248565673828125, size=(r, n), device='cuda', dtype=torch.float16)
-
-# eora_b = torch.normal(-2.7120113372802734e-05, 0.0248565673828125, size=(n, r), device='cuda', dtype=torch.float16).T
-
-# eora_b.data = torch.normal(-2.7120113372802734e-05, 0.0248565673828125, size=(n, r), device='cuda', dtype=torch.float16).T
-
-
-# eora_b[[0,2,4,6,8],:] = 0
-# eora_b_mean = eora_b.mean()
-# eora_b_std = eora_b.std()
-# sample_range = 2040
-# sample_idx = torch.randint(0,2048,(1,sample_range)).flatten()
-# eora_b[:,sample_idx] = 0
-
-# list_range = list(range(1,128))
-# eora_b[list_range,:] = 0
-# print(eora_b)
-
-
-# print(eora_b.shape)
-# print(eora_b)
-# print(eora_b)
-# print(f"eora_b max {eora_b.max()}")
-# print(f"eora_b max {eora_b.min()}")
-# print(f"eora_b mean {eora_b.mean()}")
-# print(f"eora_b std {eora_b.std()}")
-
-# real_eora_a = eora_tensors[f'{target}.lora_A.weight'].to('cuda:0').T
-# real_eora_b = eora_tensors[f'{target}.lora_B.weight'].to('cuda:0').T
-# print(f"real eora_a max {real_eora_a.max()}")
-# print(f"real eora_a min {real_eora_a.min()}")
-# print(f"real eora_a mean {real_eora_a.mean()}")
-# print(f"real eora_a std {real_eora_a.std()}")
-# print(f"real eora_b max {real_eora_b.max()}")
-# print(f"real eora_b min {real_eora_b.min()}")
-# print(f"real eora_b mean {real_eora_b.mean()}")
-# print(f"real eora_b std {real_eora_b.std()}")
-
-# eora_a = torch.randn((k, r), device='cuda', dtype=torch.float16) / (100 * 4)
-# eora_b = torch.randn((r, n), device='cuda', dtype=torch.float16) / (5)
-
-# eora_a = torch.rand((k, r), device='cuda', dtype=torch.float16) / (100 * 4)
-# eora_b = torch.rand((r, n), device='cuda', dtype=torch.float16) / (100 * 4)
-# print(f"dummy eora_a max {eora_a.max()}")
-# print(f"dummy eora_a max {eora_a.min()}")
-# print(f"dummy eora_a mean {eora_a.mean()}")
-# print(f"dummy eora_a std {eora_a.std()}")
-
-# print(f"dummy eora_b max {eora_b.max()}")
-# print(f"dummy eora_b max {eora_b.min()}")
-# print(f"dummy eora_b mean {eora_b.mean()}")
-# print(f"dummy eora_b std {eora_b.std()}")
-
-gptq_groups = 128
-# weight = qw_tensors[f'{target}.qweight'].to('cuda:0')
-# zeros = qw_tensors[f'{target}.qzeros'].to('cuda:0')
-# scales = qw_tensors[f'{target}.scales'].to('cuda:0')
-# idx = qw_tensors[f'{target}.g_idx'].to('cuda:0')
-
-weight = torch.zeros_like(qw_tensors[f'{target}.qweight'], device='cuda', dtype=torch.int32)
-zeros =  torch.zeros_like(qw_tensors[f'{target}.qzeros'], device='cuda', dtype=torch.int32)
-scales =  torch.zeros_like(qw_tensors[f'{target}.scales'], device='cuda', dtype=torch.float16)
-idx = qw_tensors[f'{target}.g_idx'].to('cuda:0')
-
-
-pack_dtype_bits = 32
-bits = 4
-
-pack_factor = pack_dtype_bits // bits
-wf = torch.tensor(list(range(0, pack_dtype_bits, bits)), dtype=torch.int32).unsqueeze(0).to("cuda:0")
-maxq = 2 ** bits - 1
-num_itr = idx.shape[0] // 2048
-def dequantize_weight(bits, wf, qweight, qzeros, maxq, scales, g_idx, dequant_dtype = torch.int8, num_itr: int=1):
-    if bits in [2, 4, 8]:
-        zeros = torch.bitwise_right_shift(
-            torch.unsqueeze(qzeros, 2).expand(-1, -1, pack_factor),
-            wf.unsqueeze(0),
-        ).to(dequant_dtype)
-        zeros = torch.bitwise_and(zeros, maxq).reshape(scales.shape)
-
-        weight = torch.bitwise_and(
-            torch.bitwise_right_shift(
-                torch.unsqueeze(qweight, 1).expand(-1, pack_factor, -1),
-                wf.unsqueeze(-1),
-            ).to(dequant_dtype),
-            maxq
-        )
-    elif bits == 3:
-        zeros = qzeros.reshape(qzeros.shape[0], qzeros.shape[1] // 3, 3, 1).expand(
-            -1, -1, -1, 12
-        )
-        zeros = zeros >> wf.unsqueeze(0)
-        zeros[:, :, 0, 10] = (zeros[:, :, 0, 10] & 0x3) | ((zeros[:, :, 1, 0] << 2) & 0x4)
-        zeros[:, :, 1, 11] = (zeros[:, :, 1, 11] & 0x1) | ((zeros[:, :, 2, 0] << 1) & 0x6)
-        zeros = zeros & 0x7
-        zeros = torch.cat(
-            [zeros[:, :, 0, :11], zeros[:, :, 1, 1:12], zeros[:, :, 2, 1:11]],
-            dim=2,
-        ).reshape(scales.shape)
-
-        weight = qweight.reshape(qweight.shape[0] // 3, 3, 1, qweight.shape[1]).expand(
-            -1, -1, 12, -1
-        )
-        weight = (weight >> wf.unsqueeze(-1)) & 0x7
-        weight[:, 0, 10] = (weight[:, 0, 10] & 0x3) | ((weight[:, 1, 0] << 2) & 0x4)
-        weight[:, 1, 11] = (weight[:, 1, 11] & 0x1) | ((weight[:, 2, 0] << 1) & 0x6)
-        weight = weight & 0x7
-        weight = torch.cat([weight[:, 0, :11], weight[:, 1, 1:12], weight[:, 2, 1:11]], dim=1)
-    weight = weight.reshape(weight.shape[0] * weight.shape[1], weight.shape[2])
-
-    if num_itr == 1:
-        weights = scales[g_idx.long()] * (weight - zeros[g_idx.long()])
-    else:
-        num_dim = g_idx.shape[0] // num_itr
-        weights = []
-        for i in range(num_itr):
-            scale_i = scales[:, i * num_dim: (i + 1) * num_dim]
-            weight_i = weight[:, i * num_dim: (i + 1) * num_dim]
-            zeros_i = zeros[:, i * num_dim: (i + 1) * num_dim]
-            g_idx_i = g_idx[i * num_dim: (i + 1) * num_dim].long()
-            weights.append(scale_i[g_idx_i] * (weight_i - zeros_i[g_idx_i]))
-        weights = torch.cat(weights, dim=1)
-
-    return weights
-
-def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
-                 bit: int) -> None:
-    gptqmodel_exllama_eora.gptq_shuffle(q_weight, q_perm, bit)
-
-## exllama GPTQModel
-def exllama_output( x , pack_dtype_bits, bits, group_size, qweight, qzeros, scales,g_idx, out_features, in_features ):
-    NON_TENSOR = torch.empty((1, 1), device="meta")
-    def ext_make_q4(qweight, qzeros, scales, g_idx, device):
-        """Construct Q4Matrix, return handle"""
-        return make_q4(qweight, qzeros, scales, g_idx if g_idx is not None else NON_TENSOR, device)
-
-
-    def ext_q4_matmul(x, q4, q4_width):
-        """Matrix multiplication, returns x @ q4"""
-        outshape = x.shape[:-1] + (q4_width,)
-        x = x.view(-1, x.shape[-1])
-        output = torch.empty((x.shape[0], q4_width), dtype=torch.float16, device=x.device)
-
-        q4_matmul(x, q4, output)
-
-        return output.view(outshape)
-
-    original_out_features = out_features
-    original_in_features = in_features
-
-    # auto pad
-    group_size = group_size if group_size != -1 else in_features
-    out_features = out_features + (-out_features % 32)
-    in_features = in_features + (-in_features % group_size)
-    in_features_padding_size = in_features - original_in_features
-    in_features_padding_shape = (0, in_features_padding_size)
-
-    if out_features != original_out_features or in_features != original_in_features:
-        qweight.resize_(in_features // pack_dtype_bits * bits, out_features)
-        qzeros.resize_(
-            math.ceil(in_features / group_size),
-            out_features // pack_dtype_bits * bits
-        )
-        scales.resize_((math.ceil(in_features / group_size), out_features), )
-        g_idx = torch.tensor([i // group_size for i in range(in_features)], dtype=torch.int32, device=g_idx.device)
-
-    width = qweight.shape[1]
-
-    # make_q4 segfaults if g_idx is not on cpu in the act-order case. In the non act-order case, None needs to be passed for g_idx.
-    q4 = ext_make_q4(
-        qweight,
-        qzeros,
-        scales,
-        None,
-        qweight.device.index,
-    )
-
-    out = ext_q4_matmul(x, q4, width)
-
-    return out
-
-
-
-ax = x @ eora_a
-def test_eora_kernel():
-
-    # zeros_copy = zeros.clone() + 0b00010001000100010001000100010001
-    # exllama_out = exllama_output( x , pack_dtype_bits, bits, group_size=gptq_groups, qweight = weight, qzeros = zeros_copy, scales = scales,g_idx =idx , out_features = n, in_features = k)
-    # exllama_out = exllama_out  + (ax @ eora_b)
-    
-    # deq_weight = dequantize_weight(bits=4, wf = wf, qweight=weight, qzeros=zeros_copy, maxq=maxq, scales=scales, g_idx=idx, dequant_dtype=torch.int8, num_itr=num_itr)
-    # torch_kernel_out = torch.matmul(x, deq_weight).reshape(m,512) + (ax @ eora_b)
-
-    idx.data = torch.argsort(idx).to(torch.int32)
-    gptq_shuffle(weight, idx, bits)
-    
-    ## I confirmed this part to be identical to that of test_gptq.py
-
-
-    out_shape = x.shape[:-1] + (weight.shape[-1],)
-    reshaped_x = x.reshape(-1, x.shape[-1])
-
-    gptq_pytorch_out = gptq_gemm(reshaped_x, weight, zeros, scales, idx, use_exllama, bit) + (ax @ eora_b)
-
-    gptq_eora_fused_out = gptq_gemm_eora(reshaped_x, weight, zeros, scales, idx, use_exllama, bit, ax, eora_b)
-    torch.set_printoptions(precision=6)
-    # print("gptq exllama kernel out: ")
-    # print(exllama_out[0][:10])
-    # print("gptq torch kernel out: ")
-    # print(torch_kernel_out[0][:10])
-    # I want this to match the above two output
-    print("vllm exllama_pytorch_out: ")
-    print(gptq_pytorch_out[0][:10])
-
-    print("vllm exllama_eora_fused_out: ")
-    print(gptq_eora_fused_out[0][:10])
-    torch.testing.assert_close(gptq_pytorch_out, gptq_eora_fused_out, rtol=0.05, atol=0.5)  # 5 % relative tolerance, 0.5 absolute tolerance
-
-test_eora_kernel()
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 0ecdb7e88..92acb90ca 100644
--- a/setup.py
+++ b/setup.py
@@ -568,7 +568,6 @@ def _env_enabled_any(names, default="1") -> bool:
 
 # Optional kernels and not build by default. Enable compile with env flags
 BUILD_EORA = _env_enabled(os.environ.get("GPTQMODEL_BUILD_EORA", "0"))
-BUILD_EXLLAMA_V1 = _env_enabled(os.environ.get("GPTQMODEL_BUILD_EXLLAMA_V1", "0"))
 
 if BUILD_CUDA_EXT == "1":
     # Import torch's cpp_extension only if we're truly building GPU extensions
@@ -816,23 +815,6 @@ def _hipify_compile_flags(flags):
                         )
                     ]
 
-            # both CUDA and ROCm compatible
-            if BUILD_EXLLAMA_V1:
-                extensions += [
-                    cpp_ext.CUDAExtension(
-                        "gptqmodel_exllama_kernels",
-                        [
-                            "gptqmodel_ext/exllama/exllama_ext.cpp",
-                            "gptqmodel_ext/exllama/cuda_buffers.cu",
-                            "gptqmodel_ext/exllama/cuda_func/column_remap.cu",
-                            "gptqmodel_ext/exllama/cuda_func/q4_matmul.cu",
-                            "gptqmodel_ext/exllama/cuda_func/q4_matrix.cu",
-                        ],
-                        extra_link_args=extra_link_args,
-                        extra_compile_args=extra_compile_args,
-                    )
-                ]
-
             if BUILD_AWQ:
                 if ROCM_VERSION:
                     print("Skipping AWQ kernels on ROCm: inline PTX is CUDA-only.")
diff --git a/tests/kernels/test_awq.py b/tests/kernels/test_awq.py
index 8e671f427..212e0e20a 100644
--- a/tests/kernels/test_awq.py
+++ b/tests/kernels/test_awq.py
@@ -34,7 +34,6 @@
     AwqGEMMTritonQuantLinear = None  # type: ignore[assignment]
     awq_triton_import_exception = exc
 
-from gptqmodel.nn_modules.qlinear.exllama_awq import AwqExllamaQuantLinear
 from gptqmodel.nn_modules.qlinear.exllamav2_awq import AwqExllamaV2QuantLinear
 from gptqmodel.utils.exllamav2 import ScratchSpace
 
@@ -72,7 +71,6 @@ class TestAwqKernelOutput(unittest.TestCase):
         (BACKEND.MARLIN, torch.float16, 0.006),
         (BACKEND.TORCH_FUSED_AWQ, torch.float16, 0.004),
         # (BACKEND.MARLIN, torch.bfloat16, 0.05),
-        (BACKEND.EXLLAMA_V1, torch.float16, 0.006),
         (BACKEND.EXLLAMA_V2, torch.float16, 0.0068),
     ]
 
@@ -87,7 +85,6 @@ def setUpClass(cls) -> None:
             cls.backend_skip_reason[BACKEND.GEMM] = "CUDA is required for GEMM backend."
             cls.backend_skip_reason[BACKEND.TRITON] = "CUDA is required for AWQ Triton backend."
             cls.backend_skip_reason[BACKEND.MARLIN] = "CUDA is required for AWQ Marlin kernel."
-            cls.backend_skip_reason[BACKEND.EXLLAMA_V1] = "CUDA is required for ExLlama v1 AWQ kernel."
             cls.backend_skip_reason[BACKEND.EXLLAMA_V2] = "CUDA is required for ExLlama v2 AWQ kernel."
         if awq_triton_import_exception is not None:
             cls.backend_skip_reason[BACKEND.TRITON] = (
@@ -141,12 +138,6 @@ def setUpClass(cls) -> None:
             else None
         )
 
-        cls.modules[BACKEND.EXLLAMA_V1] = (
-            cls._build_exllama_v1_module(qweight_cpu, qzeros_cpu, scales_cpu, bias_cpu)
-            if cls.cuda_available
-            else None
-        )
-
         cls.modules[BACKEND.EXLLAMA_V2] = (
             cls._build_exllama_v2_module(qweight_cpu, qzeros_cpu, scales_cpu, bias_cpu)
             if cls.cuda_available
@@ -350,42 +341,6 @@ def _build_torch_awq_module(
         module.post_init()
         return module
 
-    @classmethod
-    def _build_exllama_v1_module(
-        cls,
-        qweight_cpu: torch.Tensor,
-        qzeros_cpu: torch.Tensor,
-        scales_cpu: torch.Tensor,
-        bias_cpu: torch.Tensor,
-    ) -> Optional[AwqExllamaQuantLinear]:
-        try:
-            module = AwqExllamaQuantLinear(
-                bits=cls.BITS,
-                group_size=cls.GROUP_SIZE,
-                sym=True,
-                desc_act=False,
-                in_features=cls.in_features,
-                out_features=cls.out_features,
-                bias=True,
-                adapter=None,
-                register_buffers=True,
-            ).to(cls.device)
-
-            module.qweight.copy_(qweight_cpu.to(cls.device))
-            module.qzeros.copy_(qzeros_cpu.to(cls.device))
-            module.scales.copy_(scales_cpu.to(torch.float16).to(cls.device))
-            module.bias.copy_(bias_cpu.to(torch.float16).to(cls.device))
-
-            module.eval()
-            module.post_init()
-            return module
-        except Exception as exc:
-            cls.backend_skip_reason[BACKEND.EXLLAMA_V1] = (
-                f"ExLlama v1 AWQ kernel unavailable: {exc}"
-            )
-            return None
-
-    @classmethod
     def _build_exllama_v2_module(
         cls,
         qweight_cpu: torch.Tensor,
diff --git a/tests/test_bits.py b/tests/test_bits.py
index 8f1ae4d5f..20ababc53 100644
--- a/tests/test_bits.py
+++ b/tests/test_bits.py
@@ -20,7 +20,6 @@
 
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig  # noqa: E402
 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear  # noqa: E402
-from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear  # noqa: E402
@@ -35,7 +34,6 @@
 
 class TestBits(unittest.TestCase):
     QLINEAR_DICT = {
-        BACKEND.EXLLAMA_V1: ExllamaQuantLinear,
         BACKEND.EXLLAMA_V2: ExllamaV2QuantLinear,
         BACKEND.TRITON: TritonV2QuantLinear,
         BACKEND.TORCH: TorchQuantLinear,
@@ -69,11 +67,6 @@ def check_results(self, bits: int, task_results):
 
     @classmethod
     def setUpClass(cls):
-        # cls.pack_backends = [BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA, BACKEND.TORCH, BACKEND.BITBLAS,
-        #                      BACKEND.IPEX]
-        # cls.backends = list(cls.pack_backends)
-        # cls.backends.extend([BACKEND.EXLLAMA_V2, BACKEND.MARLIN, ])
-
         # TODO Only CUDA Quant Linear is tested for now
         cls.pack_backends = [BACKEND.TRITON]
         cls.backends = [BACKEND.MARLIN]
diff --git a/tests/test_bits_new.py b/tests/test_bits_new.py
index b9951f383..fe4a50520 100644
--- a/tests/test_bits_new.py
+++ b/tests/test_bits_new.py
@@ -167,7 +167,6 @@ def test_quant_and_eora(self):
                 del model
                 torch_empty_cache()
 
-            # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA,
             for backend in [ BACKEND.TORCH ]: # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN
                 base_bench = bench(path=save_path, backend=backend, adapter=None) # inference using qweights only
                 # eora_bench = bench(path=tmpdir, backend=backend, adapter=eora) # inference using eora (lora)
diff --git a/tests/test_group_size.py b/tests/test_group_size.py
index 32380ece0..4f96bb97c 100644
--- a/tests/test_group_size.py
+++ b/tests/test_group_size.py
@@ -19,7 +19,6 @@
 
 from gptqmodel import BACKEND, GPTQModel, QuantizeConfig  # noqa: E402
 from gptqmodel.nn_modules.qlinear.bitblas import BitBLASQuantLinear  # noqa: E402
-from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.torch import TorchQuantLinear  # noqa: E402
@@ -34,7 +33,6 @@
 
 class TestGroupSize(unittest.TestCase):
     QLINEAR_DICT = {
-        BACKEND.EXLLAMA_V1: ExllamaQuantLinear,
         BACKEND.EXLLAMA_V2: ExllamaV2QuantLinear,
         BACKEND.TRITON: TritonV2QuantLinear,
         BACKEND.TORCH: TorchQuantLinear,
@@ -45,7 +43,7 @@ class TestGroupSize(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
-        cls.pack_backends = [BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.TORCH, BACKEND.BITBLAS]
+        cls.pack_backends = [BACKEND.TRITON, BACKEND.TORCH, BACKEND.BITBLAS]
         cls.backends = list(cls.pack_backends)
         cls.backends.extend([BACKEND.EXLLAMA_V2, BACKEND.MARLIN, ])
 
diff --git a/tests/test_inference_speed.py b/tests/test_inference_speed.py
index f0128098f..8731381ae 100644
--- a/tests/test_inference_speed.py
+++ b/tests/test_inference_speed.py
@@ -23,7 +23,6 @@
 
 (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.MARLIN, 748),
 (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.CUDA, 493),
-(InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V1, 717),
 (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V2, 775),
 (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TRITON, 296),
 (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 295),
@@ -41,7 +40,6 @@ class TestInferenceSpeed(InferenceSpeed):
             (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.MARLIN, 286.74, False, False),
             (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 176.00, False, False),
             # (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TORCH, 53, False, False),
-            (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V1, 282.64, False, False),
             (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.EXLLAMA_V2, 290.60, False, False),
             (InferenceSpeed.NATIVE_MODEL_ID, BACKEND.TRITON, 239.58, False, False),
             (InferenceSpeed.BITBLAS_NATIVE_MODEL_ID, BACKEND.BITBLAS, 2167.38, False, False), # Second time running bitblas, there is cache
diff --git a/tests/test_lora.py b/tests/test_lora.py
index 270403508..dd4b546a1 100644
--- a/tests/test_lora.py
+++ b/tests/test_lora.py
@@ -48,7 +48,6 @@ def setUpClass(cls):
         #BACKEND.TORCH,
         # BACKEND.CUDA,
         # BACKEND.TRITON,
-        # BACKEND.EXLLAMA_V1,
         # BACKEND.EXLLAMA_V2,
         BACKEND.MARLIN,
         # # (BACKEND.IPEX), <-- not tested yet
diff --git a/tests/test_packable.py b/tests/test_packable.py
index 6061e357e..c84f4a2c8 100644
--- a/tests/test_packable.py
+++ b/tests/test_packable.py
@@ -13,7 +13,6 @@
 from safetensors.torch import load_file
 
 from gptqmodel import BACKEND, GPTQModel
-from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.exllama_eora import ExllamaEoraQuantLinear
 from gptqmodel.nn_modules.qlinear.exllamav2 import ExllamaV2QuantLinear  # noqa: E402
 from gptqmodel.nn_modules.qlinear.marlin import MarlinQuantLinear  # noqa: E402
@@ -28,7 +27,6 @@
 class TestPackable(unittest.TestCase):
     QLINEAR_DICT = {
         BACKEND.EXLLAMA_EORA: ExllamaEoraQuantLinear,
-        BACKEND.EXLLAMA_V1: ExllamaQuantLinear,
         BACKEND.EXLLAMA_V2: ExllamaV2QuantLinear,
         BACKEND.TRITON: TritonV2QuantLinear,
         BACKEND.TORCH: TorchQuantLinear,
@@ -54,7 +52,6 @@ def setUpClass(cls):
     @parameterized.expand(
         [
             (BACKEND.EXLLAMA_EORA, {"qweight": False, "qzeros": True, "scales": True, "g_idx": False}),
-            (BACKEND.EXLLAMA_V1, {"qweight": True, "qzeros": True, "scales": True, "g_idx": True}),
             (BACKEND.EXLLAMA_V2, {"qweight": False, "qzeros": True, "scales": True, "g_idx": True}),
             (BACKEND.TRITON, {"qweight": True, "qzeros": True, "scales": True, "g_idx": True}),
             (BACKEND.TORCH, {"qweight": True, "qzeros": True, "scales": True, "g_idx": True}),
diff --git a/tests/test_packing_speed.py b/tests/test_packing_speed.py
index 57715e597..7d1d3318e 100644
--- a/tests/test_packing_speed.py
+++ b/tests/test_packing_speed.py
@@ -159,7 +159,6 @@ def _time_pack_impl(self, qlinearCls, backend, impl: str, repeats: int, threads:
 
     @parameterized.expand(
         [
-            # [ExllamaQuantLinear, BACKEND.EXLLAMA, 9.63],
             # [TritonV2QuantLinear, BACKEND.TRITON, 9.67],
             [TorchQuantLinear, BACKEND.TORCH, 21.05],  # A100 Z3 33.56 # 4090? 27.0297
         ]
@@ -178,7 +177,6 @@ def test_pack_speed_single_thread(self, qlinearCls, backend, expect_time):
 
     @parameterized.expand(
         [
-            # [ExllamaQuantLinear, BACKEND.EXLLAMA, 9.63],
             # [TritonV2QuantLinear, BACKEND.TRITON, 9.67],
             [TorchQuantLinear, BACKEND.TORCH, 14.71],  # A100 Z3 33.56 # 4090? 27.0297
         ]
diff --git a/tests/test_post_quant_eora.py b/tests/test_post_quant_eora.py
index aa6d8c73e..7a1d039c4 100644
--- a/tests/test_post_quant_eora.py
+++ b/tests/test_post_quant_eora.py
@@ -107,7 +107,6 @@ def test_post_quant_eora(self):
                 calibration_dataset_concat_size=calibration_dataset_concat_size,
             )
 
-            # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA,
             # for backend in [BACKEND.MARLIN]:  # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN
             #     base_bench = bench(path=self.QUANTIZED_MODEL_PATH, backend=backend, adapter=None)  # inference using qweights only
             #     eora_bench = bench(path=self.QUANTIZED_MODEL_PATH, backend=backend, adapter=eora)  # inference using eora (lora)
diff --git a/tests/test_q4_exllama_v1.py b/tests/test_q4_exllama_v1.py
deleted file mode 100644
index d1570c76e..000000000
--- a/tests/test_q4_exllama_v1.py
+++ /dev/null
@@ -1,1208 +0,0 @@
-# SPDX-FileCopyrightText: 2024-2025 ModelCloud.ai
-# SPDX-FileCopyrightText: 2024-2025 qubitium@modelcloud.ai
-# SPDX-License-Identifier: Apache-2.0
-# Contact: qubitium@modelcloud.ai, x.com/qubitium
-
-# -- do not touch
-import os
-
-
-os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
-# -- end do not touch
-
-
-import torch  # noqa: E402
-from gptqmodel_exllama_kernels import prepare_buffers, set_tuning_params  # noqa: E402
-from models.model_test import ModelTest  # noqa: E402
-from transformers import AutoTokenizer  # noqa: E402
-
-from gptqmodel import BACKEND, GPTQModel, exllama_set_max_input_length  # noqa: E402
-from gptqmodel.models._const import DEVICE, EXLLAMA_DEFAULT_MAX_INPUT_LENGTH  # noqa: E402
-from gptqmodel.nn_modules.qlinear.exllama import ExllamaQuantLinear  # noqa: E402
-from gptqmodel.quantization import FORMAT, METHOD  # noqa: E402
-from gptqmodel.utils.importer import select_quant_linear  # noqa: E402
-from gptqmodel.utils.model import gptqmodel_post_init  # noqa: E402
-
-
-REFERENCE = torch.Tensor(
-    [
-        5.8398,
-        6.8555,
-        7.2734,
-        6.4219,
-        6.2070,
-        5.8203,
-        6.5664,
-        6.4219,
-        6.2148,
-        5.3281,
-        5.7578,
-        7.5312,
-        8.1016,
-        6.1133,
-        7.2031,
-        6.6484,
-        6.5156,
-        6.0117,
-        6.0312,
-        6.1914,
-        6.2109,
-        6.8125,
-        5.8125,
-        7.1172,
-        7.3125,
-        6.7305,
-        5.9961,
-        6.5117,
-        6.1914,
-        5.9648,
-        7.1680,
-        6.4766,
-        7.2070,
-        6.5469,
-        6.7734,
-        6.4219,
-        6.8086,
-        7.0469,
-        5.9297,
-        6.4727,
-        6.2539,
-        5.9570,
-        7.2383,
-        5.8945,
-        6.0820,
-        5.7969,
-        7.1094,
-        6.2188,
-        6.7500,
-        7.3555,
-        6.2930,
-        6.7734,
-        5.9219,
-        7.4805,
-        6.8750,
-        6.4102,
-        6.5898,
-        6.5469,
-        7.6016,
-        6.7461,
-        5.9492,
-        7.2227,
-        5.8164,
-        5.4570,
-        6.2930,
-        7.3984,
-        6.0938,
-        7.3984,
-        5.9609,
-        6.3516,
-        6.5664,
-        5.7969,
-        7.1250,
-        6.0781,
-        6.7930,
-        5.9492,
-        6.1641,
-        6.5898,
-        6.0586,
-        6.3359,
-        6.7930,
-        7.0469,
-        6.0664,
-        6.3320,
-        5.4414,
-        6.7617,
-        5.1641,
-        7.2891,
-        6.8516,
-        6.5312,
-        5.6914,
-        7.3711,
-        6.8203,
-        5.9492,
-        7.0781,
-        6.3164,
-        7.1992,
-        7.1133,
-        7.4219,
-        7.5586,
-        7.1836,
-        6.9102,
-        6.4844,
-        6.9805,
-        6.1953,
-        6.5156,
-        5.4844,
-        6.6602,
-        6.6719,
-        7.9844,
-        6.4727,
-        6.6367,
-        6.2227,
-        6.4531,
-        5.0625,
-        6.4609,
-        6.7031,
-        6.6445,
-        6.5234,
-        6.8633,
-        6.6055,
-        5.6055,
-        6.4453,
-        7.2617,
-        6.3945,
-        6.6367,
-        6.1055,
-        7.0664,
-        6.0820,
-        6.6875,
-        6.1445,
-        6.8672,
-        6.2070,
-        6.8828,
-        6.1484,
-        6.7070,
-        6.8516,
-        6.2734,
-        7.1055,
-        7.0586,
-        6.9648,
-        5.9727,
-        6.1016,
-        6.8750,
-        7.0078,
-        7.1523,
-        5.7383,
-        5.9531,
-        6.5508,
-        7.5352,
-        6.1602,
-        6.2578,
-        6.3906,
-        5.7383,
-        6.7031,
-        5.7344,
-        6.3516,
-        5.2852,
-        7.5312,
-        6.4531,
-        6.6406,
-        6.2266,
-        6.1094,
-        5.9102,
-        5.7617,
-        6.3789,
-        7.0508,
-        6.3750,
-        6.3320,
-        6.8555,
-        6.7266,
-        7.0352,
-        7.7695,
-        6.3984,
-        6.5039,
-        6.8320,
-        6.1602,
-        6.0312,
-        6.3828,
-        6.9023,
-        7.4336,
-        7.3711,
-        6.1016,
-        7.0703,
-        6.3281,
-        6.8281,
-        6.4922,
-        5.9453,
-        5.1016,
-        6.7188,
-        6.1406,
-        6.6289,
-        7.2695,
-        6.2070,
-        6.7070,
-        7.2930,
-        7.1836,
-        6.3828,
-        6.1992,
-        6.7070,
-        7.8008,
-        7.7773,
-        5.6602,
-        7.0273,
-        6.6172,
-        6.0898,
-        5.3516,
-        7.3359,
-        5.9727,
-        6.0078,
-        7.0586,
-        6.3086,
-        6.8555,
-        7.2617,
-        7.3477,
-        6.3828,
-        7.1133,
-        6.6328,
-        7.3516,
-        6.9141,
-        7.2031,
-        6.9805,
-        6.1719,
-        6.7812,
-        8.3047,
-        6.5898,
-        6.3633,
-        6.2539,
-        7.2773,
-        6.5938,
-        6.4141,
-        6.8203,
-        6.8906,
-        7.8828,
-        5.9609,
-        6.4180,
-        7.3984,
-        5.7539,
-        7.1758,
-        6.6641,
-        6.9062,
-        6.2578,
-        7.5508,
-        6.1719,
-        6.5742,
-        5.9375,
-        6.7891,
-        6.2109,
-        6.5039,
-        6.8750,
-        6.2031,
-        6.8828,
-        7.1094,
-        5.9570,
-        7.2969,
-        6.6797,
-        6.8828,
-        5.5430,
-        6.9648,
-        5.8398,
-        6.5430,
-        6.3945,
-        6.5664,
-        5.8086,
-        6.6172,
-        7.0586,
-        6.8867,
-        6.0820,
-        5.8125,
-        6.7070,
-        7.5742,
-        6.2578,
-        6.1328,
-        6.5391,
-        5.4531,
-        6.8242,
-        6.6953,
-        6.8008,
-        6.3398,
-        6.4805,
-        7.2266,
-        6.3281,
-        6.6875,
-        6.4688,
-        5.9414,
-        7.4297,
-        5.8711,
-        6.0625,
-        5.8750,
-        6.5664,
-        5.8867,
-        6.3477,
-        6.1133,
-        6.9453,
-        5.0547,
-        6.7812,
-        6.4922,
-        7.2422,
-        5.4688,
-        6.2109,
-        7.2148,
-        6.1758,
-        5.9297,
-        7.1953,
-        5.5195,
-        6.3203,
-        5.9961,
-        7.9297,
-        6.2695,
-        6.4414,
-        6.7266,
-        7.1875,
-        7.3203,
-        5.4062,
-        6.0625,
-        7.0898,
-        5.3828,
-        5.6133,
-        6.0742,
-        6.6836,
-        5.7109,
-        7.2852,
-        7.7539,
-        7.5820,
-        6.4258,
-        5.9336,
-        6.3750,
-        6.3555,
-        7.5469,
-        6.2539,
-        6.5898,
-        6.4102,
-        7.0469,
-        5.7344,
-        7.2031,
-        6.7969,
-        5.6836,
-        7.6523,
-        6.9297,
-        7.8672,
-        6.4766,
-        6.3008,
-        7.0977,
-        6.5430,
-        7.0938,
-        5.8398,
-        6.9883,
-        6.5312,
-        6.3203,
-        6.3594,
-        5.4062,
-        6.9688,
-        5.7930,
-        6.3164,
-        6.5547,
-        7.1992,
-        5.8750,
-        6.3008,
-        6.7930,
-        6.0391,
-        7.4766,
-        6.6094,
-        6.5625,
-        5.9805,
-        6.2422,
-        7.2109,
-        6.6875,
-        5.3047,
-        7.6211,
-        5.9453,
-        6.5625,
-        6.1641,
-        6.1250,
-        6.5977,
-        7.7422,
-        7.0742,
-        5.6875,
-        6.2656,
-        6.6250,
-        6.8945,
-        5.7070,
-        6.3203,
-        5.7500,
-        6.2695,
-        6.2773,
-        6.8516,
-        6.4883,
-        7.0000,
-        6.7578,
-        6.1875,
-        5.9844,
-        5.5703,
-        6.7188,
-        5.5273,
-        5.3438,
-        7.2500,
-        6.7852,
-        6.5195,
-        6.8125,
-        6.0664,
-        6.7852,
-        7.0000,
-        7.0781,
-        6.8477,
-        7.2930,
-        6.3438,
-        7.1523,
-        6.3281,
-        6.8047,
-        7.3203,
-        5.3359,
-        6.1484,
-        6.5586,
-        7.3828,
-        6.2344,
-        7.1523,
-        6.4102,
-        5.5898,
-        7.0195,
-        7.1172,
-        5.8008,
-        6.5742,
-        6.2891,
-        8.0312,
-        6.9023,
-        6.5898,
-        7.1953,
-        6.7266,
-        6.0078,
-        5.5430,
-        6.4766,
-        6.4258,
-        5.9648,
-        8.0859,
-        5.0547,
-        7.2188,
-        7.4375,
-        6.5156,
-        5.9922,
-        6.3281,
-        6.2852,
-        6.7734,
-        6.2461,
-        6.9805,
-        5.4648,
-        5.8867,
-        6.8242,
-        6.3008,
-        6.3281,
-        7.3047,
-        7.1836,
-        6.5195,
-        6.6328,
-        6.7188,
-        5.4336,
-        6.5078,
-        5.3477,
-        5.5508,
-        7.3125,
-        5.8750,
-        6.5195,
-        6.2383,
-        6.3594,
-        6.0898,
-        6.4141,
-        5.9844,
-        6.6250,
-        7.7109,
-        6.0391,
-        7.2344,
-        5.9453,
-        5.9453,
-        7.0586,
-        5.6641,
-        7.2773,
-        6.5195,
-        7.2227,
-        6.3359,
-        5.3203,
-        6.4375,
-        7.2383,
-        6.4023,
-        6.2148,
-        7.3750,
-        5.8164,
-        6.2109,
-        6.5430,
-        5.8164,
-        6.1680,
-        6.7656,
-        6.0820,
-        6.1094,
-        6.5312,
-        6.8906,
-        6.8320,
-        6.1289,
-        6.3125,
-        7.6797,
-        6.3008,
-        6.0000,
-        7.3320,
-        6.7852,
-        6.9297,
-        6.6328,
-        6.2266,
-        5.1602,
-        6.2031,
-        7.0547,
-        5.9492,
-        6.0703,
-        6.0977,
-        6.8086,
-        6.0742,
-        6.0195,
-        7.0625,
-        6.5781,
-        5.7461,
-        6.1562,
-        7.0430,
-        6.7148,
-        6.5312,
-        6.5820,
-        6.4570,
-        7.5508,
-        5.6289,
-        6.0547,
-        6.5000,
-        7.3125,
-        5.8477,
-        5.9297,
-        6.2578,
-        6.0078,
-        5.9922,
-        7.3398,
-        7.4922,
-        7.8906,
-        7.5547,
-        5.4648,
-        6.5156,
-        6.3242,
-        6.1094,
-        6.9219,
-        6.7227,
-        6.6836,
-        7.4023,
-        5.9648,
-        7.2383,
-        6.7695,
-        6.6797,
-        7.0547,
-        6.3047,
-        6.4688,
-        6.9961,
-        6.0391,
-        5.9727,
-        6.8398,
-        6.7422,
-        5.7656,
-        5.4766,
-        6.7852,
-        7.0820,
-        5.3516,
-        7.6523,
-        5.1562,
-        6.6445,
-        6.1211,
-        6.2695,
-        6.0703,
-        6.3594,
-        6.4062,
-        6.3398,
-        5.7578,
-        6.5391,
-        6.2500,
-        6.5742,
-        6.5000,
-        7.5625,
-        7.0117,
-        6.5547,
-        7.1250,
-        6.4453,
-        6.6094,
-        6.1875,
-        6.4219,
-        6.6172,
-        6.4336,
-        6.5703,
-        6.1758,
-        6.4219,
-        6.6016,
-        6.7383,
-        6.7070,
-        6.1328,
-        5.5586,
-        6.6367,
-        6.3789,
-        6.2578,
-        5.5039,
-        6.6172,
-        6.4648,
-        5.8086,
-        7.2031,
-        5.8125,
-        6.3711,
-        7.6758,
-        7.1289,
-        5.8086,
-        6.3008,
-        6.2109,
-        6.1602,
-        6.1797,
-        7.2305,
-        6.7266,
-        6.2422,
-        5.6719,
-        6.7070,
-        6.9414,
-        6.8594,
-        7.4023,
-        7.2109,
-        6.0156,
-        6.6680,
-        6.6172,
-        7.1250,
-        6.6523,
-        6.9531,
-        6.7617,
-        6.4961,
-        6.9414,
-        5.7188,
-        7.6367,
-        6.5469,
-        6.2305,
-        6.4414,
-        7.4648,
-        5.9102,
-        6.2461,
-        6.1367,
-        6.8203,
-        6.5703,
-        6.8867,
-        7.0000,
-        6.7539,
-        6.1719,
-        6.5469,
-        6.2422,
-        5.4297,
-        5.7305,
-        5.1641,
-        6.1875,
-        7.0312,
-        6.6484,
-        6.0234,
-        7.4102,
-        6.8711,
-        6.3086,
-        6.3711,
-        6.7344,
-        6.6992,
-        5.9766,
-        7.3906,
-        7.1875,
-        6.4883,
-        6.3984,
-        7.3438,
-        6.9688,
-        6.9062,
-        6.4375,
-        6.7891,
-        7.0117,
-        6.4883,
-        5.7500,
-        7.0898,
-        7.0742,
-        6.7070,
-        5.8750,
-        6.0469,
-        6.6445,
-        5.2773,
-        6.8984,
-        6.1641,
-        7.0508,
-        7.4609,
-        5.0273,
-        6.7734,
-        6.4531,
-        5.7656,
-        6.5312,
-        7.4648,
-        6.1250,
-        6.5625,
-        7.1367,
-        6.0625,
-        6.1211,
-        6.9766,
-        6.6758,
-        6.3164,
-        6.8828,
-        6.8203,
-        6.7500,
-        6.5352,
-        7.3008,
-        6.7852,
-        6.1914,
-        5.0508,
-        6.7188,
-        7.1172,
-        6.8008,
-        6.8086,
-        5.4883,
-        6.9180,
-        6.5742,
-        6.1719,
-        7.0469,
-        7.1523,
-        5.9492,
-        5.8594,
-        6.8320,
-        6.1719,
-        6.2031,
-        6.8398,
-        7.3008,
-        6.6289,
-        6.4922,
-        6.0000,
-        5.4766,
-        6.3320,
-        6.5117,
-        6.2812,
-        7.5742,
-        6.3516,
-        7.0039,
-        6.4570,
-        7.1523,
-        7.6289,
-        6.2578,
-        7.1875,
-        6.4844,
-        5.7930,
-        6.7070,
-        7.5508,
-        7.1797,
-        6.0430,
-        6.8711,
-        6.5742,
-        7.5781,
-        6.4766,
-        6.5391,
-        6.9453,
-        6.1992,
-        6.6367,
-        6.2812,
-        6.0234,
-        6.6953,
-        7.0312,
-        6.2031,
-        6.5625,
-        6.6719,
-        6.1719,
-        6.5586,
-        5.7031,
-        7.4609,
-        6.6211,
-        7.7227,
-        6.9141,
-        6.0469,
-        6.2500,
-        5.3828,
-        6.0078,
-        5.8164,
-        5.8867,
-        6.1523,
-        6.6523,
-        6.6953,
-        7.3125,
-        6.4844,
-        5.9570,
-        5.9531,
-        6.2109,
-        5.5039,
-        6.5117,
-        6.8203,
-        6.6133,
-        6.4766,
-        5.9297,
-        7.1445,
-        7.1914,
-        6.0117,
-        6.8281,
-        6.7422,
-        6.1328,
-        6.9805,
-        6.5625,
-        6.9180,
-        7.1133,
-        7.3359,
-        5.7617,
-        5.8711,
-        6.4961,
-        6.5859,
-        6.2422,
-        6.5273,
-        6.7461,
-        6.6992,
-        6.7695,
-        6.6289,
-        5.9453,
-        5.9805,
-        7.1172,
-        6.6719,
-        6.0039,
-        7.6875,
-        6.7812,
-        7.8359,
-        6.9531,
-        7.4336,
-        7.6602,
-        6.8164,
-        7.3945,
-        7.1602,
-        6.8789,
-        5.0078,
-        6.0547,
-        6.8086,
-        6.7070,
-        6.4688,
-        6.4492,
-        6.6172,
-        5.5625,
-        6.6914,
-        6.4297,
-        5.7461,
-        5.3359,
-        6.8750,
-        6.4609,
-        7.4062,
-        5.2070,
-        6.0820,
-        6.7383,
-        6.5703,
-        6.1797,
-        6.7070,
-        6.5977,
-        5.9961,
-        6.6328,
-        6.9375,
-        6.3906,
-        6.6484,
-        4.9609,
-        6.6445,
-        6.5898,
-        7.1875,
-        7.5195,
-        6.7969,
-        6.1367,
-        6.8906,
-        7.4297,
-        6.3633,
-        6.0508,
-        6.5000,
-        6.4648,
-        6.7539,
-        6.7109,
-        5.8086,
-        6.6016,
-        7.1133,
-        4.8672,
-        6.6367,
-        6.1641,
-        5.1758,
-        6.9453,
-        6.3242,
-        7.0664,
-        6.4805,
-        6.3516,
-        6.7383,
-        8.4688,
-        6.7305,
-        5.9844,
-        6.5938,
-        7.2969,
-        6.5977,
-        7.5898,
-        6.2969,
-        6.8672,
-        6.6680,
-        7.1289,
-        6.6875,
-        5.4258,
-        8.1875,
-        8.0391,
-        7.7969,
-        6.6445,
-        7.0703,
-        7.3359,
-        6.9805,
-        6.6328,
-        6.5352,
-        6.2422,
-        5.5820,
-        6.8633,
-        6.8047,
-        6.5703,
-        6.0117,
-        6.7539,
-        7.1719,
-        6.8438,
-        7.3633,
-        6.6016,
-        7.2070,
-        6.4727,
-        5.8008,
-        7.4062,
-        7.4805,
-        6.6445,
-        5.9023,
-        6.3984,
-        6.9961,
-        6.6680,
-        6.8242,
-        6.7148,
-        6.6172,
-        6.9727,
-        6.8320,
-        5.9766,
-        6.6133,
-        5.5977,
-        6.7773,
-        7.3906,
-        6.9219,
-        7.0781,
-        6.6914,
-        5.7539,
-        6.7969,
-        6.8008,
-        5.8047,
-        7.1055,
-        6.4961,
-        6.0352,
-        5.6211,
-        7.4414,
-        7.0703,
-        6.1172,
-        6.7461,
-        6.4492,
-        7.7148,
-        6.4258,
-        6.0039,
-        6.5156,
-        7.2188,
-        7.4531,
-        7.4844,
-        7.5938,
-        7.4023,
-        6.7617,
-        6.0078,
-        6.3320,
-        5.8906,
-        7.5977,
-        5.6523,
-        6.7734,
-        6.3008,
-        5.2227,
-        7.1719,
-        7.1289,
-        6.6602,
-        5.4609,
-        7.0312,
-        6.0820,
-        6.1719,
-        6.0000,
-        6.5547,
-        6.6328,
-        7.0547,
-        7.0859,
-        6.2656,
-        5.5234,
-        6.0273,
-        6.7891,
-        7.1875,
-        6.9531,
-        6.8203,
-        6.3516,
-        6.1172,
-        6.4648,
-        6.9180,
-        7.3906,
-        6.2812,
-        5.7109,
-        6.1484,
-        6.9102,
-        6.8711,
-        7.0156,
-        6.1445,
-        5.8867,
-        6.3828,
-        5.9961,
-        6.6914,
-        6.7891,
-        7.0820,
-        6.6719,
-        6.9297,
-        6.3750,
-        6.7578,
-        6.4883,
-        6.2227,
-        6.2305,
-        6.0508,
-        6.6484,
-        5.7578,
-        7.2070,
-        7.2383,
-        6.9375,
-        7.2578,
-        6.5312,
-        6.0312,
-        6.7930,
-        6.2578,
-        7.0625,
-        7.2148,
-        6.4961,
-        7.0703,
-        6.4727,
-        7.3906,
-    ]
-).to(torch.float16)
-
-MODEL_ID = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit"
-
-def get_diff(a, ref):
-    eps = 1e-6
-    return f"Maxdiff: {(a - ref).abs().max()}, Mean relative diff: {((a - ref).abs() / (ref.abs() + eps)).mean()}"
-
-class TestsQ4ExllamaV1(ModelTest):
-    def test_exllama(self):
-        group_size = 128
-
-        m = 1
-        k = 1024
-        n = 1024
-        device = torch.device("cuda:0")
-
-        pack_dtype = torch.int32
-
-        linear_class = select_quant_linear(
-            bits=4,
-            group_size=group_size,
-            desc_act=False,
-            sym=True,
-            backend=BACKEND.EXLLAMA_V1,
-            format=FORMAT.GPTQ,
-            quant_method=METHOD.GPTQ,
-            pack_dtype=pack_dtype,
-            device=DEVICE.CUDA,
-        )
-
-        linear = linear_class(
-            bits=4,
-            group_size=group_size,
-            desc_act=False,
-            sym=True,
-            in_features=k,
-            out_features=n,
-            bias=False,
-            pack_dtype=pack_dtype,
-            backend=BACKEND.EXLLAMA_V1,
-        )
-        self.assertTrue(isinstance(linear, ExllamaQuantLinear))
-
-        torch.manual_seed(42)
-
-        linear.qweight = torch.randint(-100, 100, size=linear.qweight.shape, dtype=torch.int32)
-        linear.scales = linear.scales + 0.002
-        linear.qzeros += 0b00010001000100010001000100010001  # for new weight format
-
-        linear = linear.eval()
-        linear = linear.to(device)
-
-        linear = gptqmodel_post_init(linear, use_act_order=False)
-
-        max_inner_outer_dim = max(k, n)
-        max_dq_buffer_size = linear.in_features * linear.out_features
-        max_input_len = 2048
-        buffers = {
-            "temp_state": torch.zeros((max_input_len, max_inner_outer_dim), dtype=torch.float16, device=device),
-            "temp_dq": torch.zeros((1, max_dq_buffer_size), dtype=torch.float16, device=device),
-        }
-
-        prepare_buffers(device, buffers["temp_state"], buffers["temp_dq"])
-
-        # Using the default from exllama repo here.
-        matmul_recons_thd = 8
-        matmul_fused_remap = False
-        matmul_no_half2 = False
-        set_tuning_params(matmul_recons_thd, matmul_fused_remap, matmul_no_half2)
-
-        inp = torch.rand(1, m, k, dtype=torch.float16).to(device)
-
-        with torch.inference_mode():
-            res = linear(inp)[0][0]
-
-        reference = REFERENCE.to(device)
-
-        self.assertTrue(
-            torch.allclose(res, reference, rtol=3e-5, atol=2e-2),
-            get_diff(res, reference),
-        )
-
-    def test_exllama_buffer_size(self):
-        model_id = "/monster/data/model/TinyLlama-1.1B-Chat-v1.0-GPTQ-4bit-sharded/"
-        prompt = "I am in Paris and" * 450
-        device = torch.device("cuda:0")
-
-        revision = "desc_act_true"
-
-        model_q = GPTQModel.from_quantized(
-            model_id,
-            revision=revision,
-            device="cuda:0",
-            backend=BACKEND.EXLLAMA_V1,
-        )
-        tokenizer = AutoTokenizer.from_pretrained(model_id)
-
-        inp = tokenizer(prompt, return_tensors="pt").to(device)
-
-        self.assertTrue(
-            inp["input_ids"].shape[1] > EXLLAMA_DEFAULT_MAX_INPUT_LENGTH
-        )  # 2048 is the default max_input_length
-
-        model_q = exllama_set_max_input_length(model_q, 4096)
-
-        _ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
-
-        model_q = exllama_set_max_input_length(model_q, 1034)
-
-        with self.assertRaises(RuntimeError) as cm:
-            _ = model_q.generate(**inp, num_beams=1, min_new_tokens=3, max_new_tokens=3)
-        self.assertIn("temp_state buffer is too small", str(cm.exception))
-
-    def test_generation_desc_act_false(self):
-        prompt = "The capital of France is"
-        device = torch.device("cuda:0")
-
-        model_q = GPTQModel.from_quantized(
-            MODEL_ID,
-            device="cuda:0",
-            backend=BACKEND.EXLLAMA_V1,
-        )
-        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-
-        predicted_text = ModelTest.generate_stable_with_limit(
-            model_q,
-            tokenizer,
-            prompt,
-            min_new_tokens=60,
-            max_new_tokens=60,
-            skip_special_tokens=False,
-        )
-
-        print("predicted_text", predicted_text)
-        assert "paris" in predicted_text.lower() or "city" in predicted_text.lower() or "country" in predicted_text.lower()
-
-    def test_generation_desc_act_true(self):
-        revision = "desc_act_true"
-
-        model_q = GPTQModel.from_quantized(
-            MODEL_ID,
-            revision=revision,
-            device="cuda:0",
-            backend=BACKEND.EXLLAMA_V1,
-        )
-
-        self.NATIVE_ARC_CHALLENGE_ACC = 0.285
-        self.NATIVE_ARC_CHALLENGE_ACC_NORM = 0.314
-        task_results = self.lm_eval(model=model_q, delete_quantized_model=False)
-        self.check_results(task_results)
-
-    def test_multigpu(self):
-        # TODO
-        pass
diff --git a/tests/test_q4_exllama_v2.py b/tests/test_q4_exllama_v2.py
index 4f973c39c..c02d6d34c 100644
--- a/tests/test_q4_exllama_v2.py
+++ b/tests/test_q4_exllama_v2.py
@@ -16,7 +16,7 @@
 
 import torch  # noqa: E402
 from models.model_test import ModelTest  # noqa: E402
-from test_q4_exllama_v1 import REFERENCE, get_diff  # noqa: E402
+from test_q4_reference import REFERENCE, get_diff  # noqa: E402
 from transformers import AutoTokenizer  # noqa: E402
 
 from gptqmodel import BACKEND, GPTQModel  # noqa: E402
diff --git a/tests/test_quant_and_eora.py b/tests/test_quant_and_eora.py
index 812929eea..d2f1d2372 100644
--- a/tests/test_quant_and_eora.py
+++ b/tests/test_quant_and_eora.py
@@ -123,7 +123,6 @@ def test_quant_and_eora(self, quant_method: METHOD, format: FORMAT):
             del model
             torch_empty_cache()
 
-            # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA,
             for backend in [BACKEND.MARLIN]:  # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN
                 base_bench = self.bench(path=tmpdir, backend=backend, adapter=None)  # inference using qweights only
                 eora_bench = self.bench(path=tmpdir, backend=backend, adapter=eora)  # inference using eora (lora)
@@ -266,7 +265,6 @@ def test_quant_and_eora(self):
             del model
             torch_empty_cache()
 
-            # BACKEND.EXLLAMA_V2, BACKEND.EXLLAMA_V1, BACKEND.TRITON, BACKEND.CUDA,
             for backend in [BACKEND.MARLIN]:  # BACKEND.IPEX, BACKEND.BITBLAS, BACKEND.EXLLAMA_V2V BACKEND.MARLIN
                 eora_bench = self.bench(path=tmpdir, backend=backend, adapter=eora)  # inference using eora (lora)
                 base_bench = self.bench(path=tmpdir, backend=backend, adapter=None)  # inference using qweights only
diff --git a/tests/test_save_loaded_quantized_model.py b/tests/test_save_loaded_quantized_model.py
index 88c1c4bdd..1b828d652 100644
--- a/tests/test_save_loaded_quantized_model.py
+++ b/tests/test_save_loaded_quantized_model.py
@@ -48,7 +48,6 @@ def _generate_or_skip(self, model, backend: BACKEND, tokenizer, prompt, **kwargs
         [
             (BACKEND.AUTO),
             (BACKEND.EXLLAMA_V2),
-            # (BACKEND.EXLLAMA_V1),
             (BACKEND.TRITON),
             (BACKEND.BITBLAS),
             (BACKEND.MARLIN),