Commit 73a484c

[Model][Quantization] Fix / Add GGUF support for Qwen2 MoE models (#30307)
Signed-off-by: Tsukasa OI <floss_llm@irq.a4lg.com>
Parent: b37bf51

1 file changed: +8 −0

vllm/model_executor/models/qwen2_moe.py

Lines changed: 8 additions & 0 deletions
@@ -367,6 +367,8 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
         self.embed_tokens = VocabParallelEmbedding(
             config.vocab_size,
             config.hidden_size,
+            quant_config=quant_config,
+            prefix=f"{prefix}.embed_tokens",
         )
         self.start_layer, self.end_layer, self.layers = make_layers(
             config.num_hidden_layers,
@@ -512,6 +514,12 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
                         continue
                     else:
                         name = remapped_kv_scale_name
+                # GGUF: make sure that shared_expert_gate is a 2D tensor.
+                if (
+                    "mlp.shared_expert_gate" in name
+                    and len(loaded_weight.shape) == 1
+                ):
+                    loaded_weight = loaded_weight[None, :]
                 param = params_dict[name]
                 weight_loader = getattr(
                     param, "weight_loader", default_weight_loader
