diff --git a/defuser/model_registry.py b/defuser/model_registry.py
index 1e7a33d..31be712 100644
--- a/defuser/model_registry.py
+++ b/defuser/model_registry.py
@@ -4,6 +4,9 @@
 # Contact: qubitium@modelcloud.ai, x.com/qubitium
 
 MODEL_CONFIG = {
+    "mixtral": {
+        "min_transformers_version": "5.0.0",
+    },
     "qwen3_moe": {
         "min_transformers_version": "5.0.0",
     },
diff --git a/pyproject.toml b/pyproject.toml
index 8c0dcd2..1f45e93 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -9,7 +9,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "Defuser"
-version = "0.0.7"
+version = "0.0.8"
 description = "Model defuser helper for HF Transformers."
 readme = "README.md"
 requires-python = ">=3.9"
diff --git a/tests/test_convert_model.py b/tests/test_convert_model.py
index 718d5f6..c03f90b 100644
--- a/tests/test_convert_model.py
+++ b/tests/test_convert_model.py
@@ -71,3 +71,45 @@ def test_qwen3_5_moe():
     torch.testing.assert_close(expert0.gate_proj.weight, expected_gate)
     torch.testing.assert_close(expert0.up_proj.weight, expected_up)
     torch.testing.assert_close(expert0.down_proj.weight, expected_down)
+
+
+def test_mixtral():
+    from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
+
+    model_path = "/monster/data/model/Mixtral-8x7B-Instruct-v0.1"  # "mistralai/Mixtral-8x7B-Instruct-v0.1"
+    config = AutoConfig.from_pretrained(model_path)
+    config.num_hidden_layers = 1
+    model = AutoModelForCausalLM.from_pretrained(
+        model_path,
+        config=config,
+        ignore_mismatched_sizes=True,
+    )
+    assert model.config.model_type == "mixtral"
+
+    original_moe_block = model.model.layers[0].mlp
+    assert isinstance(original_moe_block, MixtralSparseMoeBlock)
+
+    hidden_dim = original_moe_block.experts.gate_up_proj.shape[-1]
+    intermediate_dim = original_moe_block.experts.gate_up_proj.shape[1] // 2
+
+    expected_gate = original_moe_block.experts.gate_up_proj[0, :intermediate_dim, :hidden_dim].contiguous().clone()
+    expected_up = original_moe_block.experts.gate_up_proj[0, intermediate_dim:, :hidden_dim].contiguous().clone()
+    expected_down = original_moe_block.experts.down_proj[0, :hidden_dim, :intermediate_dim].contiguous().clone()
+
+    converted = convert_model(model, cleanup_original=False, max_layers=1)
+    assert converted
+
+    moe_block = model.model.layers[0].mlp
+    experts = moe_block.experts
+
+    assert hasattr(experts, "0")
+    expert0 = getattr(experts, "0")
+    assert hasattr(expert0, "gate_proj")
+    assert hasattr(expert0, "up_proj")
+    assert hasattr(expert0, "down_proj")
+
+    materialize_model(model.model.layers[0])
+
+    torch.testing.assert_close(expert0.gate_proj.weight, expected_gate)
+    torch.testing.assert_close(expert0.up_proj.weight, expected_up)
+    torch.testing.assert_close(expert0.down_proj.weight, expected_down)