ModelCloud · Qubitium · Mar 13, 2026 · Mar 13, 2026
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,5 +1,4 @@
 recursive-include gptqmodel_ext/awq *.h *.cuh *.cu *.cpp
-recursive-include gptqmodel_ext/exllama *.h *.cuh *.cu *.cpp
 recursive-include gptqmodel_ext/exllamav2 *.h *.cuh *.cu *.cpp
 recursive-include gptqmodel_ext/exllama_eora/eora *.h *.cuh *.cu *.cpp *.py
 recursive-include gptqmodel_ext/marlin *.h *.cuh *.cu *.cpp *.hpp

diff --git a/examples/README.md b/examples/README.md
@@ -27,7 +27,7 @@ python basic_usage_bitblas.py
 
 To Execute `basic_usage_exllama.py`, using command like this:
 ```shell
-python basic_usage_exllama.py --backend EXLLAMA/EXLLAMA_V2
+python basic_usage_exllama.py --backend EXLLAMA_V2
 ```
 
 To Execute `basic_usage_marlin.py`, using command like this:
@@ -104,4 +104,3 @@ CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_id_or_path PATH/TO/MOD
 ```
 
 Use `--help` flag to see detailed descriptions for more command arguments.
-
diff --git a/examples/benchmark/perplexity.py b/examples/benchmark/perplexity.py
@@ -42,7 +42,7 @@
     parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?")
     parser.add_argument("--use_fast_tokenizer", action="store_true", help="Whether to use fast tokenizer")
     parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code")
-    parser.add_argument("--backend", choices=['auto', 'marlin', 'exllama_v1', 'exllama_v2', 'triton', 'cuda', 'torch', 'ipex', 'bitblas'], default='auto', help="Whether to use BACKEND format")
+    parser.add_argument("--backend", choices=['auto', 'marlin', 'exllama_v2', 'triton', 'cuda', 'torch', 'ipex', 'bitblas'], default='auto', help="Whether to use BACKEND format")
     args = parser.parse_args()
 
     tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=args.use_fast_tokenizer)

diff --git a/gptqmodel/__init__.py b/gptqmodel/__init__.py
@@ -52,7 +52,6 @@
 from .models.auto import ASCII_LOGO
 from .quantization import BaseQuantizeConfig, GPTAQConfig, QuantizeConfig
 from .utils import BACKEND
-from .utils.exllama import exllama_set_max_input_length
 from .version import __version__
 
 

diff --git a/gptqmodel/models/_const.py b/gptqmodel/models/_const.py
@@ -128,6 +128,4 @@ def get_best_device(backend: BACKEND = BACKEND.AUTO) -> torch.device:
     else:
         return CPU
 
-EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048
-
 EXPERT_INDEX_PLACEHOLDER = "{expert_index}"
diff --git a/gptqmodel/nn_modules/qlinear/exllama.py b/gptqmodel/nn_modules/qlinear/exllama.py
diff --git a/gptqmodel/nn_modules/qlinear/exllama_awq.py b/gptqmodel/nn_modules/qlinear/exllama_awq.py
diff --git a/gptqmodel/utils/backend.py b/gptqmodel/utils/backend.py
@@ -15,7 +15,6 @@ class BACKEND(str, Enum):
     TORCH_INT8 = "torch_int8" # optimized CPU int8 fused kernel
     TORCH = "torch" # GOOD: about 80% of triton
     TRITON = "triton" # VERY GOOD: all-around kernel
-    EXLLAMA_V1 = "exllama_v1" # FAST: optimized for batching == 1
     EXLLAMA_V2 = "exllama_v2" # FASTER: optimized for batching > 1
     EXLLAMA_EORA = "exllama_eora"
     MACHETE = "machete" # CUTLASS-based kernel optimized for Hopper (SM90+)