Commit 1b29a6b

fix qwen2 pp model with aoa (#3127)
Co-authored-by: llbdyiu66 <llbdyiu66@users.noreply.github.com>
1 parent be9e7df commit 1b29a6b

File tree: 4 files changed, +64 −34 lines

paddleformers/transformers/qwen2/modeling.py
paddleformers/transformers/qwen2_moe/modeling.py
paddleformers/transformers/qwen3/modeling.py
paddleformers/transformers/qwen3_moe/modeling.py
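Every file in this commit applies the same set of fixes: the tied-embedding mapping to lm_head drops the extra ^T transpose, the per-layer ^T transpose statements are expanded over explicit layer (and expert) indices instead of relying on the $LAYER_ID placeholder inside a list comprehension, the Qwen3 variants gain q_norm/k_norm mappings, and the *ForCausalLMPipe classes reuse the AoA config generators from their non-pipeline counterparts. The sketch below is a hedged illustration of the expansion pattern only, with a toy layer count standing in for config.num_hidden_layers; it is not library code.

# Hedged sketch, not library code: it only illustrates the rewrite pattern in
# this commit for the q/k/v transpose statements. The toy value below stands
# in for config.num_hidden_layers.
num_hidden_layers = 2

# Before: one placeholder-based statement per projection, built by a comprehension.
old_statements = [
    f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight"
    for x in ("q", "k", "v")
]

# After: the layer index is expanded explicitly, one statement per layer and projection.
new_statements = []
for layer_id in range(num_hidden_layers):
    for x in ("q", "k", "v"):
        new_statements.append(
            f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight"
        )

print(len(old_statements))  # 3 placeholder statements
print(len(new_statements))  # 6 fully expanded statements (2 layers x 3 projections)

With the placeholder form a single statement had to cover every layer; the expanded form emits one concrete statement per layer index, which appears to be what the pipeline-parallel (pp) path needs here.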

paddleformers/transformers/qwen2/modeling.py

Lines changed: 13 additions & 7 deletions
@@ -402,7 +402,7 @@ def _gen_aoa_config(cls, config: Qwen2Config):
 
         # lm_head
         if config.tie_word_embeddings:
-            aoa_config["aoa_statements"] += ["model.embed_tokens.weight^T -> lm_head.weight"]
+            aoa_config["aoa_statements"] += ["model.embed_tokens.weight -> lm_head.weight"]
 
         return aoa_config
 
@@ -427,13 +427,14 @@ def _gen_inv_aoa_config(cls, config: Qwen2Config):
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}",
             ]
+            for layer_id in range(config.num_hidden_layers):
+                for x in ("q", "k", "v"):
+                    aoa_statements += [
+                        f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight"
+                    ]
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0",
             ]
-            aoa_statements += [
-                f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight"
-                for x in ("q", "k", "v")
-            ]
 
         if not config.fuse_attention_ffn:
             aoa_statements += [
@@ -443,9 +444,12 @@ def _gen_inv_aoa_config(cls, config: Qwen2Config):
         else:
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn",
-                "model.layers.$LAYER_ID.mlp.gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.gate_proj.weight",
-                "model.layers.$LAYER_ID.mlp.up_proj.weight^T -> model.layers.$LAYER_ID.mlp.up_proj.weight",
             ]
+            for layer_id in range(config.num_hidden_layers):
+                aoa_statements += [
+                    f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight",
+                    f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight",
+                ]
 
         if config.tie_word_embeddings:
             aoa_statements += ["lm_head.weight -> _"]
@@ -1003,6 +1007,8 @@ class Qwen2ForCausalLMPipe(GeneralModelForCausalLMPipe):
     _rotary_emb_cls = Qwen2RotaryEmbedding
     _tied_weights_keys = ["lm_head.weight"]
     transpose_weight_keys = Qwen2Model.transpose_weight_keys
+    _gen_aoa_config = Qwen2ForCausalLM._gen_aoa_config
+    _gen_inv_aoa_config = Qwen2ForCausalLM._gen_inv_aoa_config
 
 
 __all__ = [

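The two _gen_aoa_config / _gen_inv_aoa_config assignments added to Qwen2ForCausalLMPipe above delegate to the regular Qwen2ForCausalLM generators through plain class-level attribute assignment; the same pattern appears in the other Pipe classes below. A minimal hedged sketch of that delegation, using hypothetical toy classes rather than the PaddleFormers ones:

# Hedged sketch with hypothetical classes; only the delegation pattern matches the diff.
class ToyForCausalLM:
    @classmethod
    def _gen_aoa_config(cls, config):
        # Stand-in for the real generator, which builds the full AoA statement list.
        return {"aoa_statements": ["model.embed_tokens.weight -> lm_head.weight"]}

    @classmethod
    def _gen_inv_aoa_config(cls, config):
        return {"aoa_statements": ["lm_head.weight -> _"]}


class ToyForCausalLMPipe:
    # Class-level assignment, as in the diff: the attribute is already bound to
    # ToyForCausalLM, so the Pipe class simply reuses its generators.
    _gen_aoa_config = ToyForCausalLM._gen_aoa_config
    _gen_inv_aoa_config = ToyForCausalLM._gen_inv_aoa_config


print(ToyForCausalLMPipe._gen_aoa_config(config=None))
# {'aoa_statements': ['model.embed_tokens.weight -> lm_head.weight']}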
paddleformers/transformers/qwen2_moe/modeling.py

Lines changed: 18 additions & 12 deletions
@@ -641,7 +641,7 @@ def _gen_aoa_config(cls, config: Qwen2MoeConfig):
 
         # lm_head
         if config.tie_word_embeddings:
-            aoa_config["aoa_statements"] += ["model.embed_tokens.weight^T -> lm_head.weight"]
+            aoa_config["aoa_statements"] += ["model.embed_tokens.weight -> lm_head.weight"]
 
         return aoa_config
 
@@ -668,16 +668,16 @@ def _gen_inv_aoa_config(cls, config: Qwen2MoeConfig):
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}",
             ]
+            for layer_id in range(config.num_hidden_layers):
+                for x in ("q", "k", "v"):
+                    aoa_statements += [
+                        f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight"
+                    ]
             if config.qkv_bias:
                 aoa_statements += [
                     f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0",
                 ]
 
-            aoa_statements += [
-                f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight"
-                for x in ("q", "k", "v")
-            ]
-
         if not config.fuse_attention_ffn:
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_expert.{y}_proj.weight"
@@ -691,12 +691,16 @@ def _gen_inv_aoa_config(cls, config: Qwen2MoeConfig):
                 f"{model_prefix}layers.$LAYER_ID.mlp.shared_expert.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight, model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight, fused_ffn",
                 f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn",
             ]
-            aoa_statements += [
-                "model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_expert.gate_proj.weight",
-                "model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight^T -> model.layers.$LAYER_ID.mlp.shared_expert.up_proj.weight",
-                "model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight",
-                "model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight",
-            ]
+            for layer_id in range(config.num_hidden_layers):
+                aoa_statements += [
+                    f"model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.gate_proj.weight",
+                    f"model.layers.{layer_id}.mlp.shared_expert.up_proj.weight^T -> model.layers.{layer_id}.mlp.shared_expert.up_proj.weight",
+                ]
+                for expert_id in range(config.num_experts):
+                    aoa_statements += [
+                        f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight",
+                        f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight",
+                    ]
 
         if config.tie_word_embeddings:
             aoa_statements += ["lm_head.weight -> _"]
@@ -1077,6 +1081,8 @@ class Qwen2MoeForCausalLMPipe(GeneralModelForCausalLMPipe):
     _rotary_emb_cls = Qwen2MoeRotaryEmbedding
     _tied_weights_keys = ["lm_head.weight"]
     transpose_weight_keys = Qwen2MoeModel.transpose_weight_keys
+    _gen_aoa_config = Qwen2MoeForCausalLM._gen_aoa_config
+    _gen_inv_aoa_config = Qwen2MoeForCausalLM._gen_inv_aoa_config
 
 
 __all__ = [

paddleformers/transformers/qwen3/modeling.py

Lines changed: 17 additions & 7 deletions
@@ -392,6 +392,8 @@ def _gen_aoa_config(cls, config: Qwen3Config):
                 f"model.layers.$LAYER_ID.input_layernorm.weight -> {model_prefix}layers.$LAYER_ID.input_layernorm.weight",
                 f"model.layers.$LAYER_ID.post_attention_layernorm.weight -> {model_prefix}layers.$LAYER_ID.post_attention_layernorm.weight",
                 f"model.norm.weight -> {model_prefix}norm.weight",
+                f"model.layers.$LAYER_ID.self_attn.q_norm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.q_norm.weight",
+                f"model.layers.$LAYER_ID.self_attn.k_norm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight",
             ]
         }
 
@@ -423,7 +425,7 @@ def _gen_aoa_config(cls, config: Qwen3Config):
 
         # lm_head
         if config.tie_word_embeddings:
-            aoa_config["aoa_statements"] += ["model.embed_tokens.weight^T -> lm_head.weight"]
+            aoa_config["aoa_statements"] += ["model.embed_tokens.weight -> lm_head.weight"]
 
         return aoa_config
 
@@ -437,6 +439,8 @@ def _gen_inv_aoa_config(cls, config: Qwen3Config):
             f"{model_prefix}layers.$LAYER_ID.input_layernorm.weight -> model.layers.$LAYER_ID.input_layernorm.weight",
             f"{model_prefix}layers.$LAYER_ID.post_attention_layernorm.weight -> model.layers.$LAYER_ID.post_attention_layernorm.weight",
            f"{model_prefix}norm.weight -> model.norm.weight",
+            f"{model_prefix}layers.$LAYER_ID.self_attn.q_norm.weight -> model.layers.$LAYER_ID.self_attn.q_norm.weight",
+            f"{model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight -> model.layers.$LAYER_ID.self_attn.k_norm.weight",
         ]
 
         if not config.fuse_attention_qkv:
@@ -448,14 +452,15 @@ def _gen_inv_aoa_config(cls, config: Qwen3Config):
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}",
             ]
+            for layer_id in range(config.num_hidden_layers):
+                for x in ("q", "k", "v"):
+                    aoa_statements += [
+                        f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight"
+                    ]
             if config.attention_bias:
                 aoa_statements += [
                     f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0",
                 ]
-            aoa_statements += [
-                f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight"
-                for x in ("q", "k", "v")
-            ]
 
         if not config.fuse_attention_ffn:
             aoa_statements += [
@@ -465,9 +470,12 @@ def _gen_inv_aoa_config(cls, config: Qwen3Config):
         else:
            aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.mlp.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.gate_proj.weight, model.layers.$LAYER_ID.mlp.up_proj.weight, fused_ffn",
-                "model.layers.$LAYER_ID.mlp.gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.gate_proj.weight",
-                "model.layers.$LAYER_ID.mlp.up_proj.weight^T -> model.layers.$LAYER_ID.mlp.up_proj.weight",
             ]
+            for layer_id in range(config.num_hidden_layers):
+                aoa_statements += [
+                    f"model.layers.{layer_id}.mlp.gate_proj.weight^T -> model.layers.{layer_id}.mlp.gate_proj.weight",
+                    f"model.layers.{layer_id}.mlp.up_proj.weight^T -> model.layers.{layer_id}.mlp.up_proj.weight",
+                ]
 
         if config.tie_word_embeddings:
             aoa_statements += ["lm_head.weight -> _"]
@@ -1025,6 +1033,8 @@ class Qwen3ForCausalLMPipe(GeneralModelForCausalLMPipe):
     _rotary_emb_cls = Qwen3RotaryEmbedding
     _tied_weights_keys = ["lm_head.weight"]
     transpose_weight_keys = Qwen3Model.transpose_weight_keys
+    _gen_aoa_config = Qwen3ForCausalLM._gen_aoa_config
+    _gen_inv_aoa_config = Qwen3ForCausalLM._gen_inv_aoa_config
 
 
 __all__ = [

paddleformers/transformers/qwen3_moe/modeling.py

Lines changed: 16 additions & 8 deletions
@@ -638,6 +638,8 @@ def _gen_aoa_config(cls, config: Qwen3MoeConfig):
                 f"model.layers.$LAYER_ID.input_layernorm.weight -> {model_prefix}layers.$LAYER_ID.input_layernorm.weight",
                 f"model.layers.$LAYER_ID.post_attention_layernorm.weight -> {model_prefix}layers.$LAYER_ID.post_attention_layernorm.weight",
                 f"model.norm.weight -> {model_prefix}norm.weight",
+                f"model.layers.$LAYER_ID.self_attn.q_norm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.q_norm.weight",
+                f"model.layers.$LAYER_ID.self_attn.k_norm.weight -> {model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight",
             ]
         }
 
@@ -669,7 +671,7 @@ def _gen_aoa_config(cls, config: Qwen3MoeConfig):
 
         # lm_head
         if config.tie_word_embeddings:
-            aoa_config["aoa_statements"] += ["model.embed_tokens.weight^T -> lm_head.weight"]
+            aoa_config["aoa_statements"] += ["model.embed_tokens.weight -> lm_head.weight"]
 
         return aoa_config
 
@@ -684,6 +686,8 @@ def _gen_inv_aoa_config(cls, config: Qwen3MoeConfig):
             f"{model_prefix}layers.$LAYER_ID.input_layernorm.weight -> model.layers.$LAYER_ID.input_layernorm.weight",
             f"{model_prefix}layers.$LAYER_ID.post_attention_layernorm.weight -> model.layers.$LAYER_ID.post_attention_layernorm.weight",
             f"{model_prefix}norm.weight -> model.norm.weight",
+            f"{model_prefix}layers.$LAYER_ID.self_attn.q_norm.weight -> model.layers.$LAYER_ID.self_attn.q_norm.weight",
+            f"{model_prefix}layers.$LAYER_ID.self_attn.k_norm.weight -> model.layers.$LAYER_ID.self_attn.k_norm.weight",
         ]
 
         if not config.fuse_attention_qkv:
@@ -695,16 +699,16 @@ def _gen_inv_aoa_config(cls, config: Qwen3MoeConfig):
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.weight -> model.layers.$LAYER_ID.self_attn.q_proj.weight, model.layers.$LAYER_ID.self_attn.k_proj.weight, model.layers.$LAYER_ID.self_attn.v_proj.weight , fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups = {config.num_key_value_heads}",
             ]
+            for layer_id in range(config.num_hidden_layers):
+                for x in ("q", "k", "v"):
+                    aoa_statements += [
+                        f"model.layers.{layer_id}.self_attn.{x}_proj.weight^T -> model.layers.{layer_id}.self_attn.{x}_proj.weight"
+                    ]
             if config.attention_bias:
                 aoa_statements += [
                     f"{model_prefix}layers.$LAYER_ID.self_attn.qkv_proj.bias -> model.layers.$LAYER_ID.self_attn.q_proj.bias, model.layers.$LAYER_ID.self_attn.k_proj.bias, model.layers.$LAYER_ID.self_attn.v_proj.bias, fused_qkv, num_heads={config.num_attention_heads}, num_key_value_groups={config.num_key_value_heads}, axis=0",
                 ]
 
-            aoa_statements += [
-                f"model.layers.$LAYER_ID.self_attn.{x}_proj.weight^T -> model.layers.$LAYER_ID.self_attn.{x}_proj.weight"
-                for x in ("q", "k", "v")
-            ]
-
         if not config.fuse_attention_ffn:
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.{y}_proj.weight"
@@ -713,9 +717,13 @@ def _gen_inv_aoa_config(cls, config: Qwen3MoeConfig):
         else:
             aoa_statements += [
                 f"{model_prefix}layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_gate_proj.weight -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight, model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight, fused_ffn",
-                "model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.gate_proj.weight",
-                "model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight^T -> model.layers.$LAYER_ID.mlp.experts.$EXPERT_ID.up_proj.weight",
             ]
+            for layer_id in range(config.num_hidden_layers):
+                for expert_id in range(config.num_experts):
+                    aoa_statements += [
+                        f"model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.gate_proj.weight",
+                        f"model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight^T -> model.layers.{layer_id}.mlp.experts.{expert_id}.up_proj.weight",
+                    ]
 
         if config.tie_word_embeddings:
             aoa_statements += ["lm_head.weight -> _"]
