Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 26 additions & 13 deletions gptqmodel/models/definitions/base_qwen3_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,30 +54,35 @@ class BaseQwen3VLGPTQ(BaseQModel):

require_load_processor = True

def _core_multimodal_model(self):
    """Return the object that holds ``language_model`` and ``visual``.

    If ``self.model`` exposes a nested ``model`` attribute, that inner
    object is returned; otherwise ``self.model`` itself is returned.
    """
    outer = self.model
    try:
        return outer.model
    except AttributeError:
        # No nested wrapper: the submodules live on ``self.model`` itself.
        return outer

def pre_quantize_generate_hook_start(self):
    """Move the embedding, rotary and visual modules to the quantize device.

    The modules are looked up on the object returned by
    ``_core_multimodal_model()`` instead of on ``self.model`` directly, so
    the hook also works when ``self.model`` keeps ``language_model`` and
    ``visual`` under a nested ``model`` attribute.
    """
    core_model = self._core_multimodal_model()
    language_model = core_model.language_model
    device = self.quantize_config.device

    # Re-assign each attribute with whatever ``move_to`` returns.
    language_model.embed_tokens = move_to(language_model.embed_tokens, device=device)
    language_model.rotary_emb = move_to(language_model.rotary_emb, device=device)
    core_model.visual = move_to(core_model.visual, device=device)

def pre_quantize_generate_hook_end(self):
    """Offload or move back the modules used by the start hook.

    When ``quantize_config.offload_to_disk`` is truthy, the token
    embeddings, rotary embedding and visual module are each passed to
    ``offload_to_disk`` with ``quantize_config.offload_to_disk_path``.
    Otherwise the three modules are moved to ``CPU``.

    All lookups go through ``_core_multimodal_model()`` so the hook works
    both when ``self.model`` holds the submodules directly and when they
    sit under a nested ``model`` attribute.
    """
    core_model = self._core_multimodal_model()
    language_model = core_model.language_model

    if self.quantize_config.offload_to_disk:
        disk_path = self.quantize_config.offload_to_disk_path
        offload_to_disk(
            model=language_model,
            module=language_model.embed_tokens,
            disk_path=disk_path,
        )
        offload_to_disk(
            model=language_model,
            module=language_model.rotary_emb,
            disk_path=disk_path,
        )
        offload_to_disk(
            model=core_model,
            module=core_model.visual,
            disk_path=disk_path,
        )
        return

    # No disk offload requested: move the modules to CPU instead.
    language_model.embed_tokens = move_to(language_model.embed_tokens, device=CPU)
    language_model.rotary_emb = move_to(language_model.rotary_emb, device=CPU)
    core_model.visual = move_to(core_model.visual, device=CPU)

@staticmethod
def process_vision_info(
Expand Down Expand Up @@ -126,6 +131,10 @@ def process_vision_info(

if return_video_kwargs:
return image_inputs, video_inputs, video_kwargs
if video_inputs is None and not return_video_metadata:
# Keep the image-only call contract aligned with the earlier VL
# adapters so processor(images=...) can use the return value directly.
return image_inputs
return image_inputs, video_inputs

def preprocess_dataset(self, sample: Dict) -> Dict:
Expand All @@ -141,7 +150,11 @@ def prepare_dataset(self, calibration_dataset, batch_size: int = 1, **kwargs):
text = processor.apply_chat_template(
batch, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = self.process_vision_info(batch)
vision_inputs = self.process_vision_info(batch)
if isinstance(vision_inputs, tuple):
image_inputs, video_inputs = vision_inputs
else:
image_inputs, video_inputs = vision_inputs, None
inputs = processor(
text=text,
images=image_inputs,
Expand Down
43 changes: 43 additions & 0 deletions tests/test_qwen3_vl_dependency.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,11 @@

import builtins
import sys
import types

import pytest
from PIL import Image
from torch import nn

from gptqmodel.models.definitions import base_qwen3_vl

Expand Down Expand Up @@ -35,3 +38,43 @@ def fail_qwen_vl_import(name, *args, **kwargs):

with pytest.raises(ImportError, match="pip install qwen-vl-utils"):
base_qwen3_vl.BaseQwen3VLGPTQ.process_vision_info(messages)


def test_qwen3_vl_image_only_process_vision_info_returns_image_list():
    """Image-only messages produce a plain list of images, not a tuple."""
    picture = Image.new("RGB", (2, 2), color="white")
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": picture},
            {"type": "text", "text": "Describe this image."},
        ],
    }

    result = base_qwen3_vl.BaseQwen3VLGPTQ.process_vision_info([user_turn])

    assert isinstance(result, list)
    assert result == [picture]


def test_qwen3_vl_pre_quantize_hooks_use_inner_model_layout():
    """Both pre-quantize hooks run when the submodules sit under ``model.model``."""
    language_model = types.SimpleNamespace(
        embed_tokens=nn.Embedding(4, 4),
        rotary_emb=nn.Identity(),
    )
    inner_model = types.SimpleNamespace(
        language_model=language_model,
        visual=nn.Identity(),
    )

    # Bypass __init__ and attach only the attributes the hooks read.
    instance = object.__new__(base_qwen3_vl.BaseQwen3VLGPTQ)
    instance.model = types.SimpleNamespace(model=inner_model)
    instance.quantize_config = types.SimpleNamespace(
        device="cpu",
        offload_to_disk=False,
        offload_to_disk_path="/tmp/unused",
    )

    instance.pre_quantize_generate_hook_start()
    instance.pre_quantize_generate_hook_end()

    embed_tokens = instance.model.model.language_model.embed_tokens
    assert embed_tokens.weight.device.type == "cpu"
Loading