From 8d1ca9ffc1a9af380eb5d14ed1c2ef96e23c33b1 Mon Sep 17 00:00:00 2001 From: Qubitium Date: Sat, 14 Mar 2026 00:25:17 +0000 Subject: [PATCH] fix qwen3-vl compat --- gptqmodel/models/definitions/base_qwen3_vl.py | 39 +++++++++++------ tests/test_qwen3_vl_dependency.py | 43 +++++++++++++++++++ 2 files changed, 69 insertions(+), 13 deletions(-) diff --git a/gptqmodel/models/definitions/base_qwen3_vl.py b/gptqmodel/models/definitions/base_qwen3_vl.py index d469cf495..2a66ed931 100644 --- a/gptqmodel/models/definitions/base_qwen3_vl.py +++ b/gptqmodel/models/definitions/base_qwen3_vl.py @@ -54,30 +54,35 @@ class BaseQwen3VLGPTQ(BaseQModel): require_load_processor = True + def _core_multimodal_model(self): + return getattr(self.model, "model", self.model) + def pre_quantize_generate_hook_start(self): - self.model.language_model.embed_tokens = move_to(self.model.language_model.embed_tokens, device=self.quantize_config.device) - self.model.language_model.rotary_emb = move_to(self.model.language_model.rotary_emb, device=self.quantize_config.device) - self.model.visual = move_to(self.model.visual, device=self.quantize_config.device) + core_model = self._core_multimodal_model() + core_model.language_model.embed_tokens = move_to(core_model.language_model.embed_tokens, device=self.quantize_config.device) + core_model.language_model.rotary_emb = move_to(core_model.language_model.rotary_emb, device=self.quantize_config.device) + core_model.visual = move_to(core_model.visual, device=self.quantize_config.device) def pre_quantize_generate_hook_end(self): + core_model = self._core_multimodal_model() if self.quantize_config.offload_to_disk: - offload_to_disk(model=self.model.language_model, - module=self.model.language_model.embed_tokens, + offload_to_disk(model=core_model.language_model, + module=core_model.language_model.embed_tokens, disk_path=self.quantize_config.offload_to_disk_path, ) - offload_to_disk(model=self.model.language_model, - module=self.model.language_model.rotary_emb, + offload_to_disk(model=core_model.language_model, + module=core_model.language_model.rotary_emb, disk_path=self.quantize_config.offload_to_disk_path, ) - offload_to_disk(model=self.model, - module=self.model.visual, + offload_to_disk(model=core_model, + module=core_model.visual, disk_path=self.quantize_config.offload_to_disk_path, ) return - self.model.language_model.embed_tokens = move_to(self.model.language_model.embed_tokens, device=CPU) - self.model.language_model.rotary_emb = move_to(self.model.language_model.rotary_emb, device=CPU) - self.model.visual = move_to(self.model.visual, device=CPU) + core_model.language_model.embed_tokens = move_to(core_model.language_model.embed_tokens, device=CPU) + core_model.language_model.rotary_emb = move_to(core_model.language_model.rotary_emb, device=CPU) + core_model.visual = move_to(core_model.visual, device=CPU) @staticmethod def process_vision_info( @@ -126,6 +131,10 @@ def process_vision_info( if return_video_kwargs: return image_inputs, video_inputs, video_kwargs + if video_inputs is None and not return_video_metadata: + # Keep the image-only call contract aligned with the earlier VL + # adapters so processor(images=...) can use the return value directly. + return image_inputs return image_inputs, video_inputs def preprocess_dataset(self, sample: Dict) -> Dict: @@ -141,7 +150,11 @@ def prepare_dataset(self, calibration_dataset, batch_size: int = 1, **kwargs): text = processor.apply_chat_template( batch, tokenize=False, add_generation_prompt=True ) - image_inputs, video_inputs = self.process_vision_info(batch) + vision_inputs = self.process_vision_info(batch) + if isinstance(vision_inputs, tuple): + image_inputs, video_inputs = vision_inputs + else: + image_inputs, video_inputs = vision_inputs, None inputs = processor( text=text, images=image_inputs, diff --git a/tests/test_qwen3_vl_dependency.py b/tests/test_qwen3_vl_dependency.py index ac3e86ded..e8e3184ed 100644 --- a/tests/test_qwen3_vl_dependency.py +++ b/tests/test_qwen3_vl_dependency.py @@ -4,8 +4,11 @@ import builtins import sys +import types import pytest +from PIL import Image +from torch import nn from gptqmodel.models.definitions import base_qwen3_vl @@ -35,3 +38,43 @@ def fail_qwen_vl_import(name, *args, **kwargs): with pytest.raises(ImportError, match="pip install qwen-vl-utils"): base_qwen3_vl.BaseQwen3VLGPTQ.process_vision_info(messages) + + +def test_qwen3_vl_image_only_process_vision_info_returns_image_list(): + image = Image.new("RGB", (2, 2), color="white") + messages = [ + { + "role": "user", + "content": [ + {"type": "image", "image": image}, + {"type": "text", "text": "Describe this image."}, + ], + } + ] + + image_inputs = base_qwen3_vl.BaseQwen3VLGPTQ.process_vision_info(messages) + + assert isinstance(image_inputs, list) + assert image_inputs == [image] + + +def test_qwen3_vl_pre_quantize_hooks_use_inner_model_layout(): + instance = object.__new__(base_qwen3_vl.BaseQwen3VLGPTQ) + inner_model = types.SimpleNamespace( + language_model=types.SimpleNamespace( + embed_tokens=nn.Embedding(4, 4), + rotary_emb=nn.Identity(), + ), + visual=nn.Identity(), + ) + instance.model = types.SimpleNamespace(model=inner_model) + instance.quantize_config = types.SimpleNamespace( + device="cpu", + offload_to_disk=False, + offload_to_disk_path="/tmp/unused", + ) + + instance.pre_quantize_generate_hook_start() + instance.pre_quantize_generate_hook_end() + + assert instance.model.model.language_model.embed_tokens.weight.device.type == "cpu"