72 changes: 64 additions & 8 deletions tests/models/model_test.py
@@ -37,7 +37,7 @@
import tempfile # noqa: E402
import textwrap # noqa: E402
import unittest # noqa: E402
from collections.abc import Iterable # noqa: E402
from collections.abc import Iterable, Mapping # noqa: E402

import torch.cuda # noqa: E402
from datasets import load_dataset # noqa: E402
@@ -362,18 +362,74 @@ def generateChat(self, model, tokenizer, prompt=None):
print(f"Result is: \n{output}")
return output

def generate_with_limit(self, model, tokenizer, prompt, max_new_tokens=512):
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
# Use this helper for CI output assertions instead of raw model.generate(),
# including in standalone unittest cases, so expected-text checks stay deterministic.
@staticmethod
def generate_stable_with_limit(
model,
tokenizer,
prompt=None,
max_new_tokens=512,
min_new_tokens=None,
skip_special_tokens=True,
inputs=None,
decode_start_idx=None,
batch_decode=False,
clean_up_tokenization_spaces=None,
return_generate_output=False,
**generate_kwargs,
):
if inputs is None:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
elif hasattr(inputs, "to"):
inputs = inputs.to(model.device)

generation_inputs = dict(inputs) if isinstance(inputs, Mapping) else {"input_ids": inputs}

decoder = getattr(tokenizer, "tokenizer", tokenizer)
pad_token_id = decoder.pad_token_id if decoder.pad_token_id is not None else decoder.eos_token_id
generated = model.generate(
**inputs,
**generation_inputs,
max_new_tokens=max_new_tokens,
min_new_tokens=min_new_tokens,
do_sample=False,
num_beams=1,
pad_token_id=pad_token_id,
eos_token_id=tokenizer.eos_token_id,
eos_token_id=decoder.eos_token_id,
**generate_kwargs,
)
if return_generate_output:
return generated

generated_ids = generated[0] if isinstance(generated, tuple) else generated

if batch_decode:
if decode_start_idx is None:
if hasattr(inputs, "input_ids"):
decode_start_idx = [len(input_ids) for input_ids in inputs.input_ids]
else:
raise ValueError("decode_start_idx is required for batch_decode when inputs lack input_ids")

if isinstance(decode_start_idx, int):
generated_ids = [output_ids[decode_start_idx:] for output_ids in generated_ids]
else:
generated_ids = [
output_ids[start_idx:]
for start_idx, output_ids in zip(decode_start_idx, generated_ids)
]

decode_kwargs = {"skip_special_tokens": skip_special_tokens}
if clean_up_tokenization_spaces is not None:
decode_kwargs["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
return tokenizer.batch_decode(generated_ids, **decode_kwargs)[0]

if decode_start_idx is None:
decode_start_idx = 0

return tokenizer.decode(
generated_ids[0][decode_start_idx:],
skip_special_tokens=skip_special_tokens,
)
return tokenizer.decode(generated[0], skip_special_tokens=True)

def run_generic_inference_checks(self, model, tokenizer, backend):
model.eval()
@@ -383,7 +439,7 @@ def run_generic_inference_checks(self, model, tokenizer, backend):
prompt = item["prompt"]
keywords = item["keywords"]
try:
response = self.generate_with_limit(model, tokenizer, prompt)
response = self.generate_stable_with_limit(model, tokenizer, prompt)
normalized = response.lower()
matched = any(keyword.lower() in normalized for keyword in keywords)
results.append(
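For reference, a minimal usage sketch (not part of this diff) of how a ModelTest-derived test case might call the new helper; the prompt and token budget below are illustrative, mirroring the calls updated elsewhere in this PR. Because the helper forces do_sample=False and num_beams=1, the expected-text assertion stays deterministic across CI runs.

    # Illustrative only: assumes self is a ModelTest subclass (unittest.TestCase)
    # and that model/tokenizer were already loaded by the test.
    response = self.generate_stable_with_limit(
        model,
        tokenizer,
        "The capital city of France is named",  # hypothetical prompt
        max_new_tokens=64,
    )
    self.assertIn("paris", response.lower())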
2 changes: 1 addition & 1 deletion tests/models/test_bloom_bias_torch_fused.py
@@ -35,7 +35,7 @@ def test_with_torch_fused_cpu(self, backend):
backend=BACKEND.TORCH_FUSED,
device=DEVICE.CPU,
)
generate_str = self.generate_with_limit(
generate_str = self.generate_stable_with_limit(
model,
tokenizer,
"The capital city of France is named",
2 changes: 1 addition & 1 deletion tests/models/test_llama3_2_torch_fused.py
@@ -24,7 +24,7 @@ def test_with_torch_fused_cpu(self, backend):
device=DEVICE.CPU,
)
tokenizer = model.tokenizer
generate_str = self.generate_with_limit(
generate_str = self.generate_stable_with_limit(
model,
tokenizer,
"The capital of France is is",
10 changes: 7 additions & 3 deletions tests/models/test_ovis2.py
@@ -42,9 +42,13 @@ def test_ovis(self):
inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)

with torch.inference_mode():
output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
output = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
batch_decode=True,
)
print(f'Output:\n{output}')

self.assertIn("snow", output.lower())
27 changes: 13 additions & 14 deletions tests/models/test_ovis_1_6_llama.py
@@ -38,23 +38,22 @@ def test_ovis_1_6(self):
input_ids = input_ids.unsqueeze(0).to(device=model.device)
attention_mask = attention_mask.unsqueeze(0).to(device=model.device)
pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)]
inputs = {
"input_ids": input_ids,
"pixel_values": pixel_values,
"attention_mask": attention_mask,
}

# generate output
with torch.inference_mode():
gen_kwargs = {
"max_new_tokens": 1024,
"do_sample": False,
"top_p": None,
"top_k": None,
"temperature": None,
"repetition_penalty": None,
"eos_token_id": model.generation_config.eos_token_id,
"pad_token_id": text_tokenizer.pad_token_id,
"use_cache": True
}
output_ids = \
model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0]
output = text_tokenizer.decode(output_ids, skip_special_tokens=True)
output = self.generate_stable_with_limit(
model,
text_tokenizer,
inputs=inputs,
max_new_tokens=1024,
skip_special_tokens=True,
use_cache=True,
)

print(f'Output:\n{output}')

9 changes: 8 additions & 1 deletion tests/models/test_qwen2_5_omni.py
@@ -93,7 +93,14 @@ def test_qwen2_5_omni(self):

# Inference: Generation of the output (text and audio)
audio_file_name = 'output_gptq.wav'
generated_ids, audio = model.generate(**inputs, max_new_tokens=128, return_audio = True)
generated_ids, audio = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
return_generate_output=True,
return_audio=True,
)
sf.write(
audio_file_name,
audio.reshape(-1).detach().cpu().numpy(),
15 changes: 8 additions & 7 deletions tests/models/test_qwen2_5_vl.py
@@ -54,13 +54,14 @@ def test_qwen2_vl(self):
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
output_text = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
batch_decode=True,
clean_up_tokenization_spaces=False,
)
print("output_text:", output_text)

self.assertIn("dog", output_text)
15 changes: 8 additions & 7 deletions tests/models/test_qwen2_vl.py
@@ -54,13 +54,14 @@ def test_qwen2_vl(self):
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
output_text = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
batch_decode=True,
clean_up_tokenization_spaces=False,
)
print("output_text:", output_text)

self.assertIn("dog", output_text)
15 changes: 8 additions & 7 deletions tests/models/test_qwen3_vl.py
@@ -54,13 +54,14 @@ def test_qwen3_vl(self):
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
output_text = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
batch_decode=True,
clean_up_tokenization_spaces=False,
)
print("output_text:", output_text)

self.assertIn("dog", output_text)
25 changes: 18 additions & 7 deletions tests/test_awq.py
@@ -152,8 +152,12 @@ def test_quant_and_inference(self, checkpoint_format, backend, group_size: int):

self.assert_awq_linear(model, backend)

tokens = model.generate("Capital of France is", max_new_tokens=100)[0]
result = model.tokenizer.decode(tokens)
result = ModelTest.generate_stable_with_limit(
model,
self.tokenizer,
"The capital city of France is named",
max_new_tokens=100,
)
print(f"BACKEND: {backend}, Result: {result}")
if "paris" not in result.lower() and "city" not in result.lower():
raise AssertionError(" `paris` not found in `result`")
@@ -191,8 +195,12 @@ def test_inference_mistral_awq(self):
device="cuda"
)

tokens = model.generate("Capital of France is", max_new_tokens=64)[0]
result = model.tokenizer.decode(tokens)
result = ModelTest.generate_stable_with_limit(
model,
model.tokenizer,
"The capital city of France is named",
max_new_tokens=64,
)
if "paris" not in result.lower() and "city" not in result.lower():
raise AssertionError(" `paris` not found in `result`")

@@ -205,9 +213,12 @@ def test_inference_quantized_by_llm_awq(self):
device="cuda"
)

tokens = model.generate("The capital city of France is named",
max_new_tokens=512)[0]
result = model.tokenizer.decode(tokens)
result = ModelTest.generate_stable_with_limit(
model,
model.tokenizer,
"The capital city of France is named",
max_new_tokens=512,
)
print("result", result)
if "paris" not in result.lower() and "city" not in result.lower() and "food" not in result.lower() and "market" not in result.lower() and "country" not in result.lower():
raise AssertionError(" `paris` not found in `result`")
9 changes: 7 additions & 2 deletions tests/test_awq_moe.py
@@ -13,6 +13,7 @@
from datasets import load_dataset
from parameterized import parameterized
from transformers import AutoTokenizer
from models.model_test import ModelTest

from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear
from gptqmodel.quantization import FORMAT, METHOD, QUANT_CONFIG_FILENAME
@@ -83,8 +84,12 @@ def test_quant_and_inference(self, group_size: int):

# self.assert_awq_linear(model)

tokens = model.generate("Capital of France is", max_new_tokens=100)[0]
result = model.tokenizer.decode(tokens)
result = ModelTest.generate_stable_with_limit(
model,
model.tokenizer,
"The capital city of France is named",
max_new_tokens=100,
)
print(f"BACKEND: {BACKEND.GEMM}, Result: {result}")
if "paris" not in result.lower() and "city" not in result.lower():
raise AssertionError(" `paris` not found in `result`")
34 changes: 20 additions & 14 deletions tests/test_integration.py
@@ -10,6 +10,7 @@
import unittest # noqa: E402

import torch
from models.model_test import ModelTest # noqa: E402
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig # noqa: E402

from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
@@ -60,8 +61,13 @@ def _test_quantize(self, device_map):

model = AutoModelForCausalLM.from_pretrained(tmp_dir, device_map=device_map)

generate_str = tokenizer.decode(
model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0])
generate_str = ModelTest.generate_stable_with_limit(
model,
tokenizer,
"gptqmodel is",
max_new_tokens=30,
skip_special_tokens=False,
)

self.assertIn("is a good", generate_str.lower())

@@ -103,9 +109,14 @@ def assertInference(self, model, tokenizer=None, keywords=None, prompt=INFERENCE
def generate(self, model, tokenizer, prompt=None):
if prompt is None:
prompt = self.INFERENCE_PROMPT
inp = tokenizer(prompt, return_tensors="pt").to(model.device)
res = model.generate(**inp, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=30)
output = tokenizer.decode(res[0])
output = ModelTest.generate_stable_with_limit(
model,
tokenizer,
prompt,
min_new_tokens=10,
max_new_tokens=30,
skip_special_tokens=False,
)
print(f"Result is: >>\n{output}\n<<")
return output

@@ -117,18 +128,13 @@ def test_llm_awq(self):
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

inputs = tokenizer("Capital of France is", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model.generate(
**inputs,
result = ModelTest.generate_stable_with_limit(
model,
tokenizer,
"The capital city of France is named",
max_new_tokens=128,
temperature=0.7,
top_p=0.9,
do_sample=True
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("result:", result)

if "paris" not in result.lower() and "city" not in result.lower() and "food" not in result.lower() and "market" not in result.lower():