72 changes: 64 additions & 8 deletions tests/models/model_test.py
@@ -37,7 +37,7 @@
import tempfile # noqa: E402
import textwrap # noqa: E402
import unittest # noqa: E402
from collections.abc import Iterable # noqa: E402
from collections.abc import Iterable, Mapping # noqa: E402

import torch.cuda # noqa: E402
from datasets import load_dataset # noqa: E402
@@ -362,18 +362,74 @@ def generateChat(self, model, tokenizer, prompt=None):
print(f"Result is: \n{output}")
return output

def generate_with_limit(self, model, tokenizer, prompt, max_new_tokens=512):
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
# Use this helper for CI output assertions instead of raw model.generate(),
# including in standalone unittest cases, so expected-text checks stay deterministic.
@staticmethod
def generate_stable_with_limit(
model,
tokenizer,
prompt=None,
max_new_tokens=512,
min_new_tokens=None,
skip_special_tokens=True,
inputs=None,
decode_start_idx=None,
batch_decode=False,
clean_up_tokenization_spaces=None,
return_generate_output=False,
**generate_kwargs,
):
if inputs is None:
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
elif hasattr(inputs, "to"):
inputs = inputs.to(model.device)

generation_inputs = dict(inputs) if isinstance(inputs, Mapping) else {"input_ids": inputs}

decoder = getattr(tokenizer, "tokenizer", tokenizer)
pad_token_id = decoder.pad_token_id if decoder.pad_token_id is not None else decoder.eos_token_id
generated = model.generate(
**inputs,
**generation_inputs,
max_new_tokens=max_new_tokens,
min_new_tokens=min_new_tokens,
do_sample=False,
num_beams=1,
pad_token_id=pad_token_id,
eos_token_id=tokenizer.eos_token_id,
eos_token_id=decoder.eos_token_id,
**generate_kwargs,
)
if return_generate_output:
return generated

generated_ids = generated[0] if isinstance(generated, tuple) else generated

if batch_decode:
if decode_start_idx is None:
if hasattr(inputs, "input_ids"):
decode_start_idx = [len(input_ids) for input_ids in inputs.input_ids]
else:
raise ValueError("decode_start_idx is required for batch_decode when inputs lack input_ids")

if isinstance(decode_start_idx, int):
generated_ids = [output_ids[decode_start_idx:] for output_ids in generated_ids]
else:
generated_ids = [
output_ids[start_idx:]
for start_idx, output_ids in zip(decode_start_idx, generated_ids)
]

decode_kwargs = {"skip_special_tokens": skip_special_tokens}
if clean_up_tokenization_spaces is not None:
decode_kwargs["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
return tokenizer.batch_decode(generated_ids, **decode_kwargs)[0]

if decode_start_idx is None:
decode_start_idx = 0

return tokenizer.decode(
generated_ids[0][decode_start_idx:],
skip_special_tokens=skip_special_tokens,
)
return tokenizer.decode(generated[0], skip_special_tokens=True)

def run_generic_inference_checks(self, model, tokenizer, backend):
model.eval()
@@ -383,7 +439,7 @@ def run_generic_inference_checks(self, model, tokenizer, backend):
prompt = item["prompt"]
keywords = item["keywords"]
try:
response = self.generate_with_limit(model, tokenizer, prompt)
response = self.generate_stable_with_limit(model, tokenizer, prompt)
normalized = response.lower()
matched = any(keyword.lower() in normalized for keyword in keywords)
results.append(
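For reference, a minimal usage sketch (not part of this diff) of how a ModelTest-derived test case might call the new helper; the prompt and token budget below are illustrative, mirroring the calls updated elsewhere in this PR. Because the helper forces do_sample=False and num_beams=1, the expected-text assertion stays deterministic across CI runs.

    # Illustrative only: assumes self is a ModelTest subclass (unittest.TestCase)
    # and that model/tokenizer were already loaded by the test.
    response = self.generate_stable_with_limit(
        model,
        tokenizer,
        "The capital city of France is named",  # hypothetical prompt
        max_new_tokens=64,
    )
    self.assertIn("paris", response.lower())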
2 changes: 1 addition & 1 deletion tests/models/test_bloom_bias_torch_fused.py
@@ -35,7 +35,7 @@ def test_with_torch_fused_cpu(self, backend):
backend=BACKEND.TORCH_FUSED,
device=DEVICE.CPU,
)
generate_str = self.generate_with_limit(
generate_str = self.generate_stable_with_limit(
model,
tokenizer,
"The capital city of France is named",
2 changes: 1 addition & 1 deletion tests/models/test_llama3_2_torch_fused.py
@@ -24,7 +24,7 @@ def test_with_torch_fused_cpu(self, backend):
device=DEVICE.CPU,
)
tokenizer = model.tokenizer
generate_str = self.generate_with_limit(
generate_str = self.generate_stable_with_limit(
model,
tokenizer,
"The capital of France is is",
10 changes: 7 additions & 3 deletions tests/models/test_ovis2.py
@@ -42,9 +42,13 @@ def test_ovis(self):
inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)

with torch.inference_mode():
output_ids = model.generate(**inputs, max_new_tokens=128, do_sample=False)
generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(inputs.input_ids, output_ids)]
output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
output = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
batch_decode=True,
)
print(f'Output:\n{output}')

self.assertIn("snow", output.lower())
27 changes: 13 additions & 14 deletions tests/models/test_ovis_1_6_llama.py
@@ -38,23 +38,22 @@ def test_ovis_1_6(self):
input_ids = input_ids.unsqueeze(0).to(device=model.device)
attention_mask = attention_mask.unsqueeze(0).to(device=model.device)
pixel_values = [pixel_values.to(dtype=visual_tokenizer.dtype, device=visual_tokenizer.device)]
inputs = {
"input_ids": input_ids,
"pixel_values": pixel_values,
"attention_mask": attention_mask,
}

# generate output
with torch.inference_mode():
gen_kwargs = {
"max_new_tokens": 1024,
"do_sample": False,
"top_p": None,
"top_k": None,
"temperature": None,
"repetition_penalty": None,
"eos_token_id": model.generation_config.eos_token_id,
"pad_token_id": text_tokenizer.pad_token_id,
"use_cache": True
}
output_ids = \
model.generate(input_ids, pixel_values=pixel_values, attention_mask=attention_mask, **gen_kwargs)[0]
output = text_tokenizer.decode(output_ids, skip_special_tokens=True)
output = self.generate_stable_with_limit(
model,
text_tokenizer,
inputs=inputs,
max_new_tokens=1024,
skip_special_tokens=True,
use_cache=True,
)

print(f'Output:\n{output}')

9 changes: 8 additions & 1 deletion tests/models/test_qwen2_5_omni.py
@@ -93,7 +93,14 @@ def test_qwen2_5_omni(self):

# Inference: Generation of the output (text and audio)
audio_file_name = 'output_gptq.wav'
generated_ids, audio = model.generate(**inputs, max_new_tokens=128, return_audio = True)
generated_ids, audio = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
return_generate_output=True,
return_audio=True,
)
sf.write(
audio_file_name,
audio.reshape(-1).detach().cpu().numpy(),
15 changes: 8 additions & 7 deletions tests/models/test_qwen2_5_vl.py
@@ -54,13 +54,14 @@ def test_qwen2_vl(self):
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
output_text = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
batch_decode=True,
clean_up_tokenization_spaces=False,
)
print("output_text:", output_text)

self.assertIn("dog", output_text)
15 changes: 8 additions & 7 deletions tests/models/test_qwen2_vl.py
@@ -54,13 +54,14 @@ def test_qwen2_vl(self):
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
output_text = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
batch_decode=True,
clean_up_tokenization_spaces=False,
)
print("output_text:", output_text)

self.assertIn("dog", output_text)
15 changes: 8 additions & 7 deletions tests/models/test_qwen3_vl.py
@@ -54,13 +54,14 @@ def test_qwen3_vl(self):
inputs = inputs.to("cuda")

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
output_text = self.generate_stable_with_limit(
model,
processor,
inputs=inputs,
max_new_tokens=128,
batch_decode=True,
clean_up_tokenization_spaces=False,
)
print("output_text:", output_text)

self.assertIn("dog", output_text)
25 changes: 18 additions & 7 deletions tests/test_awq.py
@@ -152,8 +152,12 @@ def test_quant_and_inference(self, checkpoint_format, backend, group_size: int):

self.assert_awq_linear(model, backend)

tokens = model.generate("Capital of France is", max_new_tokens=100)[0]
result = model.tokenizer.decode(tokens)
result = ModelTest.generate_stable_with_limit(
model,
self.tokenizer,
"The capital city of France is named",
max_new_tokens=100,
)
print(f"BACKEND: {backend}, Result: {result}")
if "paris" not in result.lower() and "city" not in result.lower():
raise AssertionError(" `paris` not found in `result`")
@@ -191,8 +195,12 @@ def test_inference_mistral_awq(self):
device="cuda"
)

tokens = model.generate("Capital of France is", max_new_tokens=64)[0]
result = model.tokenizer.decode(tokens)
result = ModelTest.generate_stable_with_limit(
model,
model.tokenizer,
"The capital city of France is named",
max_new_tokens=64,
)
if "paris" not in result.lower() and "city" not in result.lower():
raise AssertionError(" `paris` not found in `result`")

@@ -205,9 +213,12 @@ def test_inference_quantized_by_llm_awq(self):
device="cuda"
)

tokens = model.generate("The capital city of France is named",
max_new_tokens=512)[0]
result = model.tokenizer.decode(tokens)
result = ModelTest.generate_stable_with_limit(
model,
model.tokenizer,
"The capital city of France is named",
max_new_tokens=512,
)
print("result", result)
if "paris" not in result.lower() and "city" not in result.lower() and "food" not in result.lower() and "market" not in result.lower() and "country" not in result.lower():
raise AssertionError(" `paris` not found in `result`")
9 changes: 7 additions & 2 deletions tests/test_awq_moe.py
@@ -13,6 +13,7 @@
from datasets import load_dataset
from parameterized import parameterized
from transformers import AutoTokenizer
from models.model_test import ModelTest

from gptqmodel.nn_modules.qlinear.gemm_awq import AwqGEMMQuantLinear
from gptqmodel.quantization import FORMAT, METHOD, QUANT_CONFIG_FILENAME
@@ -83,8 +84,12 @@ def test_quant_and_inference(self, group_size: int):

# self.assert_awq_linear(model)

tokens = model.generate("Capital of France is", max_new_tokens=100)[0]
result = model.tokenizer.decode(tokens)
result = ModelTest.generate_stable_with_limit(
model,
model.tokenizer,
"The capital city of France is named",
max_new_tokens=100,
)
print(f"BACKEND: {BACKEND.GEMM}, Result: {result}")
if "paris" not in result.lower() and "city" not in result.lower():
raise AssertionError(" `paris` not found in `result`")
34 changes: 20 additions & 14 deletions tests/test_integration.py
@@ -10,6 +10,7 @@
import unittest # noqa: E402

import torch
from models.model_test import ModelTest # noqa: E402
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig # noqa: E402

from gptqmodel.utils.torch import torch_empty_cache # noqa: E402
@@ -60,8 +61,13 @@ def _test_quantize(self, device_map):

model = AutoModelForCausalLM.from_pretrained(tmp_dir, device_map=device_map)

generate_str = tokenizer.decode(
model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0])
generate_str = ModelTest.generate_stable_with_limit(
model,
tokenizer,
"gptqmodel is",
max_new_tokens=30,
skip_special_tokens=False,
)

self.assertIn("is a good", generate_str.lower())

@@ -103,9 +109,14 @@ def assertInference(self, model, tokenizer=None, keywords=None, prompt=INFERENCE
def generate(self, model, tokenizer, prompt=None):
if prompt is None:
prompt = self.INFERENCE_PROMPT
inp = tokenizer(prompt, return_tensors="pt").to(model.device)
res = model.generate(**inp, num_beams=1, do_sample=False, min_new_tokens=10, max_new_tokens=30)
output = tokenizer.decode(res[0])
output = ModelTest.generate_stable_with_limit(
model,
tokenizer,
prompt,
min_new_tokens=10,
max_new_tokens=30,
skip_special_tokens=False,
)
print(f"Result is: >>\n{output}\n<<")
return output

@@ -117,18 +128,13 @@ def test_llm_awq(self):
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

inputs = tokenizer("Capital of France is", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model.generate(
**inputs,
result = ModelTest.generate_stable_with_limit(
model,
tokenizer,
"The capital city of France is named",
max_new_tokens=128,
temperature=0.7,
top_p=0.9,
do_sample=True
)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("result:", result)

if "paris" not in result.lower() and "city" not in result.lower() and "food" not in result.lower() and "market" not in result.lower():