Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
recursive-include gptqmodel_ext/awq *.h *.cuh *.cu *.cpp
recursive-include gptqmodel_ext/exllama *.h *.cuh *.cu *.cpp
recursive-include gptqmodel_ext/exllamav2 *.h *.cuh *.cu *.cpp
recursive-include gptqmodel_ext/exllama_eora/eora *.h *.cuh *.cu *.cpp *.py
recursive-include gptqmodel_ext/marlin *.h *.cuh *.cu *.cpp *.hpp
Expand Down
3 changes: 1 addition & 2 deletions examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ python basic_usage_bitblas.py

To execute `basic_usage_exllama.py`, use a command like this:
```shell
python basic_usage_exllama.py --backend EXLLAMA/EXLLAMA_V2
python basic_usage_exllama.py --backend EXLLAMA_V2
```

To execute `basic_usage_marlin.py`, use a command like this:
Expand Down Expand Up @@ -104,4 +104,3 @@ CUDA_VISIBLE_DEVICES=0 python generation_speed.py --model_id_or_path PATH/TO/MOD
```

Use `--help` flag to see detailed descriptions for more command arguments.

2 changes: 1 addition & 1 deletion examples/benchmark/perplexity.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
parser.add_argument("--is_quantized", action="store_true", help="Is the model GPTQ quantized?")
parser.add_argument("--use_fast_tokenizer", action="store_true", help="Whether to use fast tokenizer")
parser.add_argument("--trust_remote_code", action="store_true", help="Whether to use remote code")
parser.add_argument("--backend", choices=['auto', 'marlin', 'exllama_v1', 'exllama_v2', 'triton', 'cuda', 'torch', 'ipex', 'bitblas'], default='auto', help="Whether to use BACKEND format")
parser.add_argument("--backend", choices=['auto', 'marlin', 'exllama_v2', 'triton', 'cuda', 'torch', 'ipex', 'bitblas'], default='auto', help="Whether to use BACKEND format")
args = parser.parse_args()

tokenizer = AutoTokenizer.from_pretrained(args.model, use_fast=args.use_fast_tokenizer)
Expand Down
1 change: 0 additions & 1 deletion gptqmodel/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,6 @@
from .models.auto import ASCII_LOGO
from .quantization import BaseQuantizeConfig, GPTAQConfig, QuantizeConfig
from .utils import BACKEND
from .utils.exllama import exllama_set_max_input_length
from .version import __version__


Expand Down
2 changes: 0 additions & 2 deletions gptqmodel/models/_const.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,4 @@ def get_best_device(backend: BACKEND = BACKEND.AUTO) -> torch.device:
else:
return CPU

EXLLAMA_DEFAULT_MAX_INPUT_LENGTH = 2048

EXPERT_INDEX_PLACEHOLDER = "{expert_index}"
170 changes: 0 additions & 170 deletions gptqmodel/nn_modules/qlinear/exllama.py

This file was deleted.

146 changes: 0 additions & 146 deletions gptqmodel/nn_modules/qlinear/exllama_awq.py

This file was deleted.

1 change: 0 additions & 1 deletion gptqmodel/utils/backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ class BACKEND(str, Enum):
TORCH_INT8 = "torch_int8" # optimized CPU int8 fused kernel
TORCH = "torch" # GOOD: about 80% of triton
TRITON = "triton" # VERY GOOD: all-around kernel
EXLLAMA_V1 = "exllama_v1" # FAST: optimized for batching == 1
EXLLAMA_V2 = "exllama_v2" # FASTER: optimized for batching > 1
EXLLAMA_EORA = "exllama_eora"
MACHETE = "machete" # CUTLASS-based kernel optimized for Hopper (SM90+)
Expand Down
Loading
Loading