From 5078978a5b86871b3e78d8b1b0f6f0a67451062a Mon Sep 17 00:00:00 2001 From: Qubitium Date: Fri, 13 Mar 2026 08:58:33 +0000 Subject: [PATCH 1/2] rename generate_with_limit to generate_stable_with_limit and use for ci tests --- tests/models/model_test.py | 6 ++++-- tests/models/test_bloom_bias_torch_fused.py | 2 +- tests/models/test_llama3_2_torch_fused.py | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/models/model_test.py b/tests/models/model_test.py index 6db4a9c64..cec5561cb 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -362,7 +362,9 @@ def generateChat(self, model, tokenizer, prompt=None): print(f"Result is: \n{output}") return output - def generate_with_limit(self, model, tokenizer, prompt, max_new_tokens=512): + # Use this helper for CI output assertions instead of raw model.generate(). + # It forces deterministic decoding so expected-text checks are less flaky across runs. + def generate_stable_with_limit(self, model, tokenizer, prompt, max_new_tokens=512): inputs = tokenizer(prompt, return_tensors="pt").to(model.device) pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id generated = model.generate( @@ -383,7 +385,7 @@ def run_generic_inference_checks(self, model, tokenizer, backend): prompt = item["prompt"] keywords = item["keywords"] try: - response = self.generate_with_limit(model, tokenizer, prompt) + response = self.generate_stable_with_limit(model, tokenizer, prompt) normalized = response.lower() matched = any(keyword.lower() in normalized for keyword in keywords) results.append( diff --git a/tests/models/test_bloom_bias_torch_fused.py b/tests/models/test_bloom_bias_torch_fused.py index ebae699db..19084eef5 100644 --- a/tests/models/test_bloom_bias_torch_fused.py +++ b/tests/models/test_bloom_bias_torch_fused.py @@ -35,7 +35,7 @@ def test_with_torch_fused_cpu(self, backend): backend=BACKEND.TORCH_FUSED, device=DEVICE.CPU, ) - generate_str = self.generate_with_limit( + generate_str = self.generate_stable_with_limit( model, tokenizer, "The capital city of France is named", diff --git a/tests/models/test_llama3_2_torch_fused.py b/tests/models/test_llama3_2_torch_fused.py index 51a6ac005..a516692be 100644 --- a/tests/models/test_llama3_2_torch_fused.py +++ b/tests/models/test_llama3_2_torch_fused.py @@ -24,7 +24,7 @@ def test_with_torch_fused_cpu(self, backend): device=DEVICE.CPU, ) tokenizer = model.tokenizer - generate_str = self.generate_with_limit( + generate_str = self.generate_stable_with_limit( model, tokenizer, "The capital of France is is", From 1d43ec9dc562e9a3bbe0135d76f87b7af21c1e02 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Fri, 13 Mar 2026 09:08:08 +0000 Subject: [PATCH 2/2] use generate_stable_with_limit --- tests/models/model_test.py | 72 ++++++++++++++++--- tests/models/test_ovis2.py | 10 ++- tests/models/test_ovis_1_6_llama.py | 27 ++++--- tests/models/test_qwen2_5_omni.py | 9 ++- tests/models/test_qwen2_5_vl.py | 15 ++-- tests/models/test_qwen2_vl.py | 15 ++-- tests/models/test_qwen3_vl.py | 15 ++-- tests/test_awq.py | 25 +++++-- tests/test_awq_moe.py | 9 ++- tests/test_integration.py | 34 +++++---- tests/test_lora.py | 16 +++-- tests/test_modelscope.py | 7 +- tests/test_multi_gpu_inference.py | 16 ++--- tests/test_post_quant_eora.py | 8 ++- tests/test_q4_bitblas.py | 27 ++++--- tests/test_q4_exllama_v1.py | 13 ++-- tests/test_q4_exllama_v2.py | 27 ++++--- tests/test_qqq.py | 10 ++- tests/test_qqq_inference.py | 11 +--
tests/test_save_loaded_quantized_model.py | 22 +++--- .../test_save_loaded_quantized_model_ipex.py | 22 ++++-- tests/test_sharded.py | 25 ++++--- tests/test_torch_xpu.py | 7 +- 23 files changed, 298 insertions(+), 144 deletions(-) diff --git a/tests/models/model_test.py b/tests/models/model_test.py index cec5561cb..2c4aae263 100644 --- a/tests/models/model_test.py +++ b/tests/models/model_test.py @@ -37,7 +37,7 @@ import tempfile # noqa: E402 import textwrap # noqa: E402 import unittest # noqa: E402 -from collections.abc import Iterable # noqa: E402 +from collections.abc import Iterable, Mapping # noqa: E402 import torch.cuda # noqa: E402 from datasets import load_dataset # noqa: E402 @@ -362,20 +362,74 @@ def generateChat(self, model, tokenizer, prompt=None): print(f"Result is: \n{output}") return output - # Use this helper for CI output assertions instead of raw model.generate(). - # It forces deterministic decoding so expected-text checks are less flaky across runs. - def generate_stable_with_limit(self, model, tokenizer, prompt, max_new_tokens=512): - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id + # Use this helper for CI output assertions instead of raw model.generate(), + # including in standalone unittest cases, so expected-text checks stay deterministic. + @staticmethod + def generate_stable_with_limit( + model, + tokenizer, + prompt=None, + max_new_tokens=512, + min_new_tokens=None, + skip_special_tokens=True, + inputs=None, + decode_start_idx=None, + batch_decode=False, + clean_up_tokenization_spaces=None, + return_generate_output=False, + **generate_kwargs, + ): + if inputs is None: + inputs = tokenizer(prompt, return_tensors="pt").to(model.device) + elif hasattr(inputs, "to"): + inputs = inputs.to(model.device) + + generation_inputs = dict(inputs) if isinstance(inputs, Mapping) else {"input_ids": inputs} + + decoder = getattr(tokenizer, "tokenizer", tokenizer) + pad_token_id = decoder.pad_token_id if decoder.pad_token_id is not None else decoder.eos_token_id generated = model.generate( - **inputs, + **generation_inputs, max_new_tokens=max_new_tokens, + min_new_tokens=min_new_tokens, do_sample=False, num_beams=1, pad_token_id=pad_token_id, - eos_token_id=tokenizer.eos_token_id, + eos_token_id=decoder.eos_token_id, + **generate_kwargs, + ) + if return_generate_output: + return generated + + generated_ids = generated[0] if isinstance(generated, tuple) else generated + + if batch_decode: + if decode_start_idx is None: + if hasattr(inputs, "input_ids"): + decode_start_idx = [len(input_ids) for input_ids in inputs.input_ids] + else: + raise ValueError("decode_start_idx is required for batch_decode when inputs lack input_ids") + + if isinstance(decode_start_idx, int): + generated_ids = [output_ids[decode_start_idx:] for output_ids in generated_ids] + else: + generated_ids = [ + output_ids[start_idx:] + for start_idx, output_ids in zip(decode_start_idx, generated_ids) + ] + + decode_kwargs = {"skip_special_tokens": skip_special_tokens} + if clean_up_tokenization_spaces is not None: + decode_kwargs["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces + return tokenizer.batch_decode(generated_ids, **decode_kwargs)[0] + + if decode_start_idx is None: + decode_start_idx = 0 + + return tokenizer.decode( + generated_ids[0][decode_start_idx:], + skip_special_tokens=skip_special_tokens, ) - return tokenizer.decode(generated[0], skip_special_tokens=True) def 
run_generic_inference_checks(self, model, tokenizer, backend): model.eval() diff --git a/tests/models/test_ovis2.py b/tests/models/test_ovis2.py index babda3b9b..6fb808fa0 100644 --- a/tests/models/test_ovis2.py +++ b/tests/models/test_ovis2.py @@ -42,9 +42,13 @@ def test_ovis(self): inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16) with torch.inference_mode(): - output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False) - generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)] - output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + output = self.generate_stable_with_limit( + model, + processor, + inputs=inputs, + max_new_tokens=128, + batch_decode=True, + ) print(f'Output:\n{output}') self.assertIn("snow", output.lower()) diff --git a/tests/models/test_ovis_1_6_llama.py b/tests/models/test_ovis_1_6_llama.py index a21493f46..828623f2c 100644 --- a/tests/models/test_ovis_1_6_llama.py +++ b/tests/models/test_ovis_1_6_llama.py @@ -38,23 +38,22 @@ def test_ovis_1_6(self): input_ids = input_ids.unsqueeze(0).to(device=model.device) attention_mask = attention_mask.unsqueeze(0).to(device=model.device) pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)] + inputs = { + "input_ids": input_ids, + "pixel_values": pixel_values, + "attention_mask": attention_mask, + } # generate output with torch.inference_mode(): - gen_kwargs = { - "max_new_tokens": 1024, - "do_sample": False, - "top_p": None, - "top_k": None, - "temperature": None, - "repetition_penalty": None, - "eos_token_id": model.generation_config.eos_token_id, - "pad_token_id": text_tokenizer.pad_token_id, - "use_cache": True - } - output_ids = \ - model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0] - output = text_tokenizer.decode(output_ids, skip_special_tokens=True) + output = self.generate_stable_with_limit( + model, + text_tokenizer, + inputs=inputs, + max_new_tokens=1024, + skip_special_tokens=True, + use_cache=True, + ) print(f'Output:\n{output}') diff --git a/tests/models/test_qwen2_5_omni.py b/tests/models/test_qwen2_5_omni.py index 311b1d317..24ba8a209 100644 --- a/tests/models/test_qwen2_5_omni.py +++ b/tests/models/test_qwen2_5_omni.py @@ -93,7 +93,14 @@ def test_qwen2_5_omni(self): # Inference: Generation of the output (text and audio) audio_file_name = 'output_gptq.wav' - generated_ids, audio = model.generate(**inputs, max_new_tokens=128, return_audio = True) + generated_ids, audio = self.generate_stable_with_limit( + model, + processor, + inputs=inputs, + max_new_tokens=128, + return_generate_output=True, + return_audio=True, + ) sf.write( audio_file_name, audio.reshape(-1).detach().cpu().numpy(), diff --git a/tests/models/test_qwen2_5_vl.py b/tests/models/test_qwen2_5_vl.py index 09609c796..cbcdc1e46 100644 --- a/tests/models/test_qwen2_5_vl.py +++ b/tests/models/test_qwen2_5_vl.py @@ -54,13 +54,14 @@ def test_qwen2_vl(self): inputs = inputs.to("cuda") # Inference: Generation of the output - generated_ids = model.generate(**inputs, max_new_tokens=128) - generated_ids_trimmed = [ - out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - )[0] + output_text = self.generate_stable_with_limit( + model, + processor, + inputs=inputs, + max_new_tokens=128, + batch_decode=True, + 
clean_up_tokenization_spaces=False, + ) print("output_text:", output_text) self.assertIn("dog", output_text) diff --git a/tests/models/test_qwen2_vl.py b/tests/models/test_qwen2_vl.py index e9d234321..f9147abb6 100644 --- a/tests/models/test_qwen2_vl.py +++ b/tests/models/test_qwen2_vl.py @@ -54,13 +54,14 @@ def test_qwen2_vl(self): inputs = inputs.to("cuda") # Inference: Generation of the output - generated_ids = model.generate(**inputs, max_new_tokens=128) - generated_ids_trimmed = [ - out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - )[0] + output_text = self.generate_stable_with_limit( + model, + processor, + inputs=inputs, + max_new_tokens=128, + batch_decode=True, + clean_up_tokenization_spaces=False, + ) print("output_text:", output_text) self.assertIn("dog", output_text) diff --git a/tests/models/test_qwen3_vl.py b/tests/models/test_qwen3_vl.py index 411801a01..6b40db8b0 100644 --- a/tests/models/test_qwen3_vl.py +++ b/tests/models/test_qwen3_vl.py @@ -54,13 +54,14 @@ def test_qwen3_vl(self): inputs = inputs.to("cuda") # Inference: Generation of the output - generated_ids = model.generate(**inputs, max_new_tokens=128) - generated_ids_trimmed = [ - out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) - ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - )[0] + output_text = self.generate_stable_with_limit( + model, + processor, + inputs=inputs, + max_new_tokens=128, + batch_decode=True, + clean_up_tokenization_spaces=False, + ) print("output_text:", output_text) self.assertIn("dog", output_text) diff --git a/tests/test_awq.py b/tests/test_awq.py index 390047657..b9a86685c 100644 --- a/tests/test_awq.py +++ b/tests/test_awq.py @@ -152,8 +152,12 @@ def test_quant_and_inference(self, checkpoint_format, backend, group_size: int): self.assert_awq_linear(model, backend) - tokens = model.generate("Capital of France is", max_new_tokens=100)[0] - result = model.tokenizer.decode(tokens) + result = ModelTest.generate_stable_with_limit( + model, + self.tokenizer, + "The capital city of France is named", + max_new_tokens=100, + ) print(f"BACKEND: {backend}, Result: {result}") if "paris" not in result.lower() and "city" not in result.lower(): raise AssertionError(" `paris` not found in `result`") @@ -191,8 +195,12 @@ def test_inference_mistral_awq(self): device="cuda" ) - tokens = model.generate("Capital of France is", max_new_tokens=64)[0] - result = model.tokenizer.decode(tokens) + result = ModelTest.generate_stable_with_limit( + model, + model.tokenizer, + "The capital city of France is named", + max_new_tokens=64, + ) if "paris" not in result.lower() and "city" not in result.lower(): raise AssertionError(" `paris` not found in `result`") @@ -205,9 +213,12 @@ def test_inference_quantized_by_llm_awq(self): device="cuda" ) - tokens = model.generate("The capital city of France is named", - max_new_tokens=512)[0] - result = model.tokenizer.decode(tokens) + result = ModelTest.generate_stable_with_limit( + model, + model.tokenizer, + "The capital city of France is named", + max_new_tokens=512, + ) print("result", result) if "paris" not in result.lower() and "city" not in result.lower() and "food" not in result.lower() and "market" not in result.lower() and "country" not in result.lower(): raise AssertionError(" `paris` not found in 
`result`") diff --git a/tests/test_awq_moe.py b/tests/test_awq_moe.py index b11998c5c..c2bdb4c9d 100644 --- a/tests/test_awq_moe.py +++ b/tests/test_awq_moe.py @@ -13,6 +13,7 @@ from datasets import load_dataset from parameterized import parameterized from transformers import AutoTokenizer +from models.model_test import ModelTest from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear from gptqmodel.quantization import FORMAT, METHOD, QUANT_CONFIG_FILENAME @@ -83,8 +84,12 @@ def test_quant_and_inference(self, group_size: int): # self.assert_awq_linear(model) - tokens = model.generate("Capital of France is", max_new_tokens=100)[0] - result = model.tokenizer.decode(tokens) + result = ModelTest.generate_stable_with_limit( + model, + model.tokenizer, + "The capital city of France is named", + max_new_tokens=100, + ) print(f"BACKEND: {BACKEND.GEMM}, Result: {result}") if "paris" not in result.lower() and "city" not in result.lower(): raise AssertionError(" `paris` not found in `result`") diff --git a/tests/test_integration.py b/tests/test_integration.py index 9d185bd3e..14a252f67 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -10,6 +10,7 @@ import unittest # noqa: E402 import torch +from models.model_test import ModelTest # noqa: E402 from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig # noqa: E402 from gptqmodel.utils.torch import torch_empty_cache # noqa: E402 @@ -60,8 +61,13 @@ def _test_quantize(self, device_map): model = AutoModelForCausalLM.from_pretrained(tmp_dir, device_map=device_map) - generate_str = tokenizer.decode( - model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]) + generate_str = ModelTest.generate_stable_with_limit( + model, + tokenizer, + "gptqmodel is", + max_new_tokens=30, + skip_special_tokens=False, + ) self.assertIn("is a good", generate_str.lower()) @@ -103,9 +109,14 @@ def assertInference(self, model, tokenizer=None, keywords=None, prompt=INFERENCE def generate(self, model, tokenizer, prompt=None): if prompt is None: prompt = self.INFERENCE_PROMPT - inp = tokenizer(prompt, return_tensors="pt").to(model.device) - res = model.generate(**inp, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=30) - output = tokenizer.decode(res[0]) + output = ModelTest.generate_stable_with_limit( + model, + tokenizer, + prompt, + min_new_tokens=10, + max_new_tokens=30, + skip_special_tokens=False, + ) print(f"Result is: >>\n{output}\n<<") return output @@ -117,18 +128,13 @@ def test_llm_awq(self): ) tokenizer = AutoTokenizer.from_pretrained(model_name) - inputs = tokenizer("Capital of France is", return_tensors="pt").to(model.device) - with torch.no_grad(): - outputs = model.generate( - **inputs, + result = ModelTest.generate_stable_with_limit( + model, + tokenizer, + "The capital city of France is named", max_new_tokens=128, - temperature=0.7, - top_p=0.9, - do_sample=True ) - - result = tokenizer.decode(outputs[0], skip_special_tokens=True) print("result:", result) if "paris" not in result.lower() and "city" not in result.lower() and "food" not in result.lower() and "market" not in result.lower(): diff --git a/tests/test_lora.py b/tests/test_lora.py index 8e06bd50e..270403508 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -63,8 +63,11 @@ def test_load(self, backend: BACKEND): ) # print(model) - tokens = model.generate("The capital city of France is named")[0] - result = model.tokenizer.decode(tokens) + result = self.generate_stable_with_limit( + model, + 
model.tokenizer, + "The capital city of France is named", + ) print(f"Result: {result}") self.assertIn("paris", result.lower()) @@ -81,8 +84,13 @@ def test_download(self, backend: BACKEND): device_map="auto", ) - tokens = model.generate("The capital city of France is named", min_new_tokens=128, max_new_tokens=128)[0] - result = model.tokenizer.decode(tokens) + result = self.generate_stable_with_limit( + model, + model.tokenizer, + "The capital city of France is named", + min_new_tokens=128, + max_new_tokens=128, + ) print(f"Result: {result}") self.assertIn("paris", result.lower()) if "paris" not in result.lower() and "built" not in result.lower(): diff --git a/tests/test_modelscope.py b/tests/test_modelscope.py index b62c28a7f..a0bdc0f09 100644 --- a/tests/test_modelscope.py +++ b/tests/test_modelscope.py @@ -25,8 +25,11 @@ def setUpClass(self): def test_load_modelscope(self): model = GPTQModel.load(self.MODEL_ID) - result = model.generate("The capital city of France is named")[0] - str_output = model.tokenizer.decode(result) + str_output = self.generate_stable_with_limit( + model, + model.tokenizer, + "The capital city of France is named", + ) assert "paris" in str_output.lower() or "city" in str_output.lower() del model diff --git a/tests/test_multi_gpu_inference.py b/tests/test_multi_gpu_inference.py index b2fb728ea..ee39c9ee5 100644 --- a/tests/test_multi_gpu_inference.py +++ b/tests/test_multi_gpu_inference.py @@ -16,6 +16,7 @@ import unittest # noqa: E402 +from models.model_test import ModelTest # noqa: E402 from gptqmodel import BACKEND, GPTQModel # noqa: E402 pytestmark = [pytest.mark.model, pytest.mark.slow] @@ -46,15 +47,14 @@ def test_multi_gpu_inference(self): return_tensors="pt" ) - outputs = model.generate( - **model_inputs.to(model.device), - max_length=512 - ) - input_ids = model_inputs["input_ids"] - result = self.tokenizer.decode( - outputs[0][input_ids.shape[1]:], - skip_special_tokens=False + result = ModelTest.generate_stable_with_limit( + model, + self.tokenizer, + inputs=model_inputs, + max_new_tokens=512, + decode_start_idx=input_ids.shape[1], + skip_special_tokens=False, ) self.assertIn("2<|im_end|>", result.lower(), "The generated result should contain '2<|im_end|>'") diff --git a/tests/test_post_quant_eora.py b/tests/test_post_quant_eora.py index ca872c2f3..aa6d8c73e 100644 --- a/tests/test_post_quant_eora.py +++ b/tests/test_post_quant_eora.py @@ -47,8 +47,12 @@ def bench(path: str, backend: BACKEND, adapter: Optional[Lora]): if backend == BACKEND.TORCH: model.optimize() - tokens = model.generate("Capital of France is")[0] - result = model.tokenizer.decode(tokens) + result = ModelTest.generate_stable_with_limit( + model, + model.tokenizer, + "The capital city of France is named", + max_new_tokens=128, + ) print(f"BACKEND: {backend}, Result: {result}") if "paris" not in result.lower(): raise AssertionError(" `paris` not found in `result`") diff --git a/tests/test_q4_bitblas.py b/tests/test_q4_bitblas.py index 7d9e0c497..787d1f828 100644 --- a/tests/test_q4_bitblas.py +++ b/tests/test_q4_bitblas.py @@ -14,6 +14,7 @@ import pytest # noqa: E402 import torch # noqa: E402 +from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 from gptqmodel import BACKEND, GPTQModel # noqa: E402 @@ -43,11 +44,14 @@ def test_generation(self): tokenizer = AutoTokenizer.from_pretrained(model_id) - inp = tokenizer(prompt, return_tensors="pt").to(device) - - res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60) - 
- predicted_text = tokenizer.decode(res[0]) + predicted_text = ModelTest.generate_stable_with_limit( + model_q, + tokenizer, + prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) self.assertIn("paris", predicted_text.lower()) @@ -70,9 +74,12 @@ def test_bias(self): tokenizer = AutoTokenizer.from_pretrained(model_id) prompt = "The capital city of France is named" - inp = tokenizer(prompt, return_tensors="pt").to("cuda:0") - - res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60) - - predicted_text = tokenizer.decode(res[0]) + predicted_text = ModelTest.generate_stable_with_limit( + model_q, + tokenizer, + prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) self.assertIn("paris", predicted_text.lower()) diff --git a/tests/test_q4_exllama_v1.py b/tests/test_q4_exllama_v1.py index d3384611c..d1570c76e 100644 --- a/tests/test_q4_exllama_v1.py +++ b/tests/test_q4_exllama_v1.py @@ -1176,11 +1176,14 @@ def test_generation_desc_act_false(self): ) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inp = tokenizer(prompt, return_tensors="pt").to(device) - - res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60) - - predicted_text = tokenizer.decode(res[0]) + predicted_text = ModelTest.generate_stable_with_limit( + model_q, + tokenizer, + prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) print("predicted_text", predicted_text) assert "paris" in predicted_text.lower() or "city" in predicted_text.lower() or "country" in predicted_text.lower() diff --git a/tests/test_q4_exllama_v2.py b/tests/test_q4_exllama_v2.py index 4e417a12e..4f973c39c 100644 --- a/tests/test_q4_exllama_v2.py +++ b/tests/test_q4_exllama_v2.py @@ -15,6 +15,7 @@ import unittest # noqa: E402 import torch # noqa: E402 +from models.model_test import ModelTest # noqa: E402 from test_q4_exllama_v1 import REFERENCE, get_diff # noqa: E402 from transformers import AutoTokenizer # noqa: E402 @@ -97,11 +98,14 @@ def test_generation_desc_act_false(self): model_q = GPTQModel.load(model_id, device="cuda:0") tokenizer = AutoTokenizer.from_pretrained(model_id) - inp = tokenizer(prompt, return_tensors="pt").to(device) - - res = model_q.generate(**inp, num_beams=1, do_sample=False, min_new_tokens=60, max_new_tokens=60) - - predicted_text = tokenizer.decode(res[0]) + predicted_text = ModelTest.generate_stable_with_limit( + model_q, + tokenizer, + prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) self.assertEqual(predicted_text[:GENERATE_EVAL_SIZE], reference_output[:GENERATE_EVAL_SIZE]) @@ -120,11 +124,14 @@ def test_generation_desc_act_true(self): ) tokenizer = AutoTokenizer.from_pretrained(model_id) - inp = tokenizer(prompt, return_tensors="pt").to(device) - - res = model_q.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60) - - predicted_text = tokenizer.decode(res[0]) + predicted_text = ModelTest.generate_stable_with_limit( + model_q, + tokenizer, + prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) print("predicted_text", predicted_text) assert "paris" in predicted_text.lower() or "city" in predicted_text.lower() diff --git a/tests/test_qqq.py b/tests/test_qqq.py index acb12c56f..75514d9f4 100644 --- a/tests/test_qqq.py +++ b/tests/test_qqq.py @@ -13,6 +13,7 @@ from datasets import load_dataset from parameterized import parameterized from transformers import AutoTokenizer +from models.model_test import ModelTest from 
gptqmodel.nn_modules.qlinear.qqq import QQQQuantLinear from gptqmodel.quantization import FORMAT, METHOD, QUANT_CONFIG_FILENAME @@ -87,8 +88,13 @@ def test_quant_and_inference(self, group_size: int): self.assert_qqq_linear(model) - tokens = model.generate("The capital city of France is named", min_new_tokens=128, max_new_tokens=128)[0] - result = model.tokenizer.decode(tokens) + result = ModelTest.generate_stable_with_limit( + model, + model.tokenizer, + "The capital city of France is named", + min_new_tokens=128, + max_new_tokens=128, + ) print(f"BACKEND: {BACKEND.QQQ}, Result: {result}") if "paris" not in result.lower() and "city" not in result.lower() and "country" not in result.lower(): raise AssertionError(" `paris` not found in `result`") diff --git a/tests/test_qqq_inference.py b/tests/test_qqq_inference.py index 1c448dab0..5d33322e5 100644 --- a/tests/test_qqq_inference.py +++ b/tests/test_qqq_inference.py @@ -5,16 +5,17 @@ import pytest +from models.model_test import ModelTest from gptqmodel import GPTQModel from gptqmodel.utils.eval import EVAL pytestmark = [pytest.mark.model, pytest.mark.slow] - -from gptqmodel import GPTQModel - def test_qqq_inference(): model = GPTQModel.load("HandH1998/QQQ-Llama-3-8b-g128") - result = model.generate("The capital city of France is named")[0] - str_output = model.tokenizer.decode(result) + str_output = ModelTest.generate_stable_with_limit( + model, + model.tokenizer, + "The capital city of France is named", + ) assert "paris" in str_output.lower() or "city" in str_output.lower() diff --git a/tests/test_save_loaded_quantized_model.py b/tests/test_save_loaded_quantized_model.py index c829476b8..88c1c4bdd 100644 --- a/tests/test_save_loaded_quantized_model.py +++ b/tests/test_save_loaded_quantized_model.py @@ -15,6 +15,7 @@ from parameterized import parameterized # noqa: E402 import pytest # noqa: E402 import torch # noqa: E402 +from models.model_test import ModelTest # noqa: E402 from transformers import AutoTokenizer # noqa: E402 from gptqmodel import BACKEND, GPTQModel, get_best_device # noqa: E402 @@ -33,9 +34,9 @@ def _require_backend(self, backend: BACKEND): if not ok: self.skipTest(f"{backend} unavailable: {err}") - def _generate_or_skip(self, model, backend: BACKEND, **kwargs): + def _generate_or_skip(self, model, backend: BACKEND, tokenizer, prompt, **kwargs): try: - return model.generate(**kwargs) + return ModelTest.generate_stable_with_limit(model, tokenizer, prompt, **kwargs) except Exception as exc: if backend == BACKEND.BITBLAS: message = str(exc).lower() @@ -60,19 +61,18 @@ def test_save(self, backend: BACKEND): prompt = "I am in Paris and" device = get_best_device(backend) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inp = tokenizer(prompt, return_tensors="pt").to(device) # origin model produce correct output origin_model = GPTQModel.load(MODEL_ID, backend=backend, device=device) - origin_model_res = self._generate_or_skip( + origin_model_predicted_text = self._generate_or_skip( origin_model, backend, - **inp, - num_beams=1, + tokenizer, + prompt, min_new_tokens=60, max_new_tokens=60, + skip_special_tokens=False, ) - origin_model_predicted_text = tokenizer.decode(origin_model_res[0]) with tempfile.TemporaryDirectory() as tmpdir: origin_model.save(tmpdir) @@ -80,15 +80,15 @@ def test_save(self, backend: BACKEND): # saved model produce wrong output new_model = GPTQModel.load(tmpdir, backend=backend, device=device) - new_model_res = self._generate_or_skip( + new_model_predicted_text = self._generate_or_skip( new_model, 
backend, - **inp, - num_beams=1, + tokenizer, + prompt, min_new_tokens=60, max_new_tokens=60, + skip_special_tokens=False, ) - new_model_predicted_text = tokenizer.decode(new_model_res[0]) print("origin_model_predicted_text",origin_model_predicted_text) print("new_model_predicted_text",new_model_predicted_text) diff --git a/tests/test_save_loaded_quantized_model_ipex.py b/tests/test_save_loaded_quantized_model_ipex.py index ea4871af2..75099cb9a 100644 --- a/tests/test_save_loaded_quantized_model_ipex.py +++ b/tests/test_save_loaded_quantized_model_ipex.py @@ -12,6 +12,7 @@ import tempfile # noqa: E402 import unittest # noqa: E402 +from models.model_test import ModelTest # noqa: E402 from parameterized import parameterized # noqa: E402 from transformers import AutoTokenizer # noqa: E402 @@ -30,12 +31,17 @@ def test_save(self, backend: BACKEND): prompt = "I am in Paris and" device = get_best_device(backend) tokenizer = AutoTokenizer.from_pretrained(MODEL_ID) - inp = tokenizer(prompt, return_tensors="pt").to(device) # origin model produce correct output origin_model = GPTQModel.load(MODEL_ID, backend=backend) - origin_model_res = origin_model.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60) - origin_model_predicted_text = tokenizer.decode(origin_model_res[0]) + origin_model_predicted_text = ModelTest.generate_stable_with_limit( + origin_model, + tokenizer, + prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) with tempfile.TemporaryDirectory() as tmpdir: origin_model.save(tmpdir) @@ -43,8 +49,14 @@ def test_save(self, backend: BACKEND): # saved model produce wrong output new_model = GPTQModel.load(tmpdir, backend=backend) - new_model_res = new_model.generate(**inp, num_beams=1, min_new_tokens=60, max_new_tokens=60) - new_model_predicted_text = tokenizer.decode(new_model_res[0]) + new_model_predicted_text = ModelTest.generate_stable_with_limit( + new_model, + tokenizer, + prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) print("origin_model_predicted_text",origin_model_predicted_text) print("new_model_predicted_text",new_model_predicted_text) diff --git a/tests/test_sharded.py b/tests/test_sharded.py index 9b2a00bd3..755b5d5e9 100644 --- a/tests/test_sharded.py +++ b/tests/test_sharded.py @@ -14,6 +14,7 @@ import tempfile # noqa: E402 import unittest # noqa: E402 +from models.model_test import ModelTest # noqa: E402 import torch # noqa: E402 from transformers import AutoTokenizer # noqa: E402 @@ -53,10 +54,14 @@ def test_save_and_load(self): device_map="auto", ) - inp = tokenizer(self.prompt, return_tensors="pt").to(self.device) - - tokens = model.generate(**inp, num_beams=1, do_sample=False, min_new_tokens=60, max_new_tokens=60) - result = tokenizer.decode(tokens[0]) + result = ModelTest.generate_stable_with_limit( + model, + tokenizer, + self.prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) self.assertEqual(result[:100], self.reference_output[:100]) @@ -84,9 +89,13 @@ def test_save_and_load_no_shard(self): device_map="auto", ) - inp = tokenizer(self.prompt, return_tensors="pt").to(self.device) - - tokens = model.generate(**inp, num_beams=1, do_sample=False, min_new_tokens=60, max_new_tokens=60) - result = tokenizer.decode(tokens[0]) + result = ModelTest.generate_stable_with_limit( + model, + tokenizer, + self.prompt, + min_new_tokens=60, + max_new_tokens=60, + skip_special_tokens=False, + ) self.assertEqual(result[:100], self.reference_output[:100]) diff --git 
a/tests/test_torch_xpu.py b/tests/test_torch_xpu.py index 0547b79af..66985990e 100644 --- a/tests/test_torch_xpu.py +++ b/tests/test_torch_xpu.py @@ -39,7 +39,12 @@ def test(self): backend=BACKEND.TORCH, device=DEVICE.XPU, ) - generate_str = tokenizer.decode(model.generate(**tokenizer("The capital of France is is", return_tensors="pt").to(model.device), max_new_tokens=2)[0]) + generate_str = self.generate_stable_with_limit( + model, + tokenizer, + "The capital of France is is", + max_new_tokens=2, + ) print(f"generate_str: {generate_str}")
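
For reference (not part of either patch above), a minimal sketch of how a standalone pytest-style test could call the new static helper after these patches land. The checkpoint path is a hypothetical placeholder; the prompt and keyword assertion mirror the ones used in the patched tests.

# Illustrative usage sketch only; MODEL_PATH is a placeholder, not a real checkpoint from this patch.
from gptqmodel import GPTQModel
from models.model_test import ModelTest

MODEL_PATH = "/path/to/gptq-quantized-model"  # hypothetical quantized checkpoint

def test_stable_generation_example():
    model = GPTQModel.load(MODEL_PATH)
    # Helper builds inputs from the prompt, forces greedy single-beam decoding,
    # and decodes the first sequence, keeping the expected-text check reproducible in CI.
    result = ModelTest.generate_stable_with_limit(
        model,
        model.tokenizer,
        "The capital city of France is named",
        max_new_tokens=64,
    )
    assert "paris" in result.lower() or "city" in result.lower()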