Commit d337a0b

refactor model in intermediate api mode

1 parent 26d3e2c commit d337a0b
7 files changed (+147 −2662 lines changed)

7 files changed

+147
-2662
lines changed

paddleformers/transformers/__init__.py

Lines changed: 0 additions & 35 deletions
@@ -214,41 +214,6 @@
         "LlamaPretrainingCriterion",
         "LlamaNTKScalingRotaryEmbedding",
     ],
-    "llama.modeling_auto": [
-        "enable_fuse_ffn_qkv_pass",
-        "LlamaDecoderLayerAuto",
-        "LlamaAttentionAuto",
-        "LlamaPretrainedModelAuto",
-        "LlamaLMHeadAuto",
-        "LlamaModelAuto",
-        "LlamaForCausalLM3DAuto",
-        "LlamaMLPAuto",
-        "get_mesh",
-        "LlamaRMSNormAuto",
-        "is_pp_enable",
-        "LlamaPretrainingCriterion3DAuto",
-        "global_mesh_starts_with_pp",
-        "scaled_dot_product_attention",
-    ],
-    "llama.modeling_network": [
-        "LlamaPretrainedModelNet",
-        "layer_input_parallel_row_and_col_hook",
-        "LlamaModelNet",
-        "LlamaPretrainingCriterionNet",
-        "layer_input_replicate_hook",
-        "LlamaLMHeadNet",
-        "LlamaForCausalLMNetDPO",
-        "GlobalOutputNet",
-        "layer_input_parallel_row_hook",
-        "LlamaRMSNormNet",
-        "LlamaAttentionNet",
-        "scaled_dot_product_attention",
-        "ReshardLayer",
-        "LlamaForCausalLMNet",
-        "enable_fuse_ffn_qkv_pass",
-        "LlamaMLPNet",
-        "LlamaDecoderLayerNet",
-    ],
     "llama.modeling_pp": ["LlamaForCausalLMPipe"],
     "llama.tokenizer": ["LlamaTokenizer", "Llama3Tokenizer"],
     "llama.tokenizer_fast": ["LlamaTokenizerFast"],

paddleformers/transformers/configuration_utils.py

Lines changed: 10 additions & 0 deletions
@@ -537,6 +537,9 @@ class PretrainedConfig:
             Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
             model has a output word embedding layer.
 
+        run_single_model (`bool`, *optional*, defaults to `False`):
+            Whether to run the model in single card mode. When enabled, all parallel degree configurations will be disabled.
+
         dtype (`str`, *optional*):
             The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
             (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
@@ -601,6 +604,13 @@ def __init__(self, **kwargs):
         self.use_cache = kwargs.pop("use_cache", False)
         self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)
 
+        # for run model in single card mode
+        self.run_single_model = kwargs.pop("run_single_model", False)
+        if self.run_single_model:
+            self.tensor_parallel_degree = 1
+            self.sep_parallel_degree = 1
+            self.context_parallel_degree = 1
+
         # for transformers fuse
         self.fuse_linear = kwargs.pop("fuse_linear", False)
         self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False)
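A minimal usage sketch of the new flag (not part of the commit), assuming PretrainedConfig can be constructed directly with keyword arguments, as the __init__(self, **kwargs) signature above suggests; the import path mirrors the file header:

# Hedged sketch: per the __init__ logic added above, run_single_model=True
# should pin all parallel degrees to 1.
from paddleformers.transformers.configuration_utils import PretrainedConfig

cfg = PretrainedConfig(run_single_model=True)
assert cfg.run_single_model is True
assert cfg.tensor_parallel_degree == 1
assert cfg.sep_parallel_degree == 1
assert cfg.context_parallel_degree == 1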

paddleformers/transformers/llama/__init__.py

Lines changed: 0 additions & 35 deletions
@@ -50,41 +50,6 @@
         "LlamaPretrainingCriterion",
         "LlamaNTKScalingRotaryEmbedding",
     ],
-    "modeling_auto": [
-        "enable_fuse_ffn_qkv_pass",
-        "LlamaDecoderLayerAuto",
-        "LlamaAttentionAuto",
-        "LlamaPretrainedModelAuto",
-        "LlamaLMHeadAuto",
-        "LlamaModelAuto",
-        "LlamaForCausalLM3DAuto",
-        "LlamaMLPAuto",
-        "get_mesh",
-        "LlamaRMSNormAuto",
-        "is_pp_enable",
-        "LlamaPretrainingCriterion3DAuto",
-        "global_mesh_starts_with_pp",
-        "scaled_dot_product_attention",
-    ],
-    "modeling_network": [
-        "LlamaPretrainedModelNet",
-        "layer_input_parallel_row_and_col_hook",
-        "LlamaModelNet",
-        "LlamaPretrainingCriterionNet",
-        "layer_input_replicate_hook",
-        "LlamaLMHeadNet",
-        "LlamaForCausalLMNetDPO",
-        "GlobalOutputNet",
-        "layer_input_parallel_row_hook",
-        "LlamaRMSNormNet",
-        "LlamaAttentionNet",
-        "scaled_dot_product_attention",
-        "ReshardLayer",
-        "LlamaForCausalLMNet",
-        "enable_fuse_ffn_qkv_pass",
-        "LlamaMLPNet",
-        "LlamaDecoderLayerNet",
-    ],
     "modeling_pp": ["LlamaForCausalLMPipe"],
     "tokenizer": ["LlamaTokenizer", "Llama3Tokenizer"],
     "tokenizer_fast": ["LlamaTokenizerFast"],
paddleformers/transformers/llama/auto_dist_config.py

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import paddle.distributed as dist
+from paddle.distributed.auto_parallel.intermediate.tensor_parallel import (
+    PrepareLayerInput,
+)
+
+
+def layer_input_parallel_row_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Shard(0), dist.Replicate()])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Shard(0), dist.Replicate()]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Shard(0), dist.Replicate()]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def layer_input_parallel_row_and_col_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Shard(0), dist.Shard(1)])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Shard(0), dist.Shard(1)]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Shard(0), dist.Shard(1)]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def layer_input_replicate_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Replicate(), dist.Replicate()])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Replicate(), dist.Replicate()]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Replicate(), dist.Replicate()]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def auto_dist_config(self, prefix=""):
+    if prefix != "":
+        assert prefix.endswith(".")
+    config = {
+        "sp_config": {
+            "parallelize_plan": {
+                f"{prefix}llama.embed_tokens": [
+                    dist.ColWiseParallel(),
+                    dist.SequenceParallelBegin(),
+                ],
+                f"{prefix}llama.reshard_row": PrepareLayerInput(layer_input_parallel_row_hook),
+                f"{prefix}llama.reshard_row_and_col": PrepareLayerInput(layer_input_parallel_row_and_col_hook),
+                f"{prefix}llama.global_layer.reshard_replicate": PrepareLayerInput(layer_input_replicate_hook),
+                f"{prefix}llama.layers.*.self_attn.qkv_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.q_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.k_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.v_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.o_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn": dist.SequenceParallelDisable(),
+                f"{prefix}llama.layers.*.mlp.gate_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.up_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_up_fused_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.down_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.mlp": dist.SequenceParallelDisable(need_transpose=False),
+                f"{prefix}lm_head.weight": dist.ColWiseParallel(),
+                f"{prefix}lm_head": dist.SequenceParallelEnd(),
+            }
+        },
+        "mp_config": {
+            "parallelize_plan": {
+                f"{prefix}llama.embed_tokens": dist.ColWiseParallel(gather_output=True),
+                f"{prefix}llama.reshard_row": PrepareLayerInput(layer_input_parallel_row_hook),
+                f"{prefix}llama.reshard_row_and_col": PrepareLayerInput(layer_input_parallel_row_and_col_hook),
+                f"{prefix}llama.global_layer.reshard_replicate": PrepareLayerInput(layer_input_replicate_hook),
+                f"{prefix}llama.layers.*.self_attn.qkv_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.q_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.k_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.v_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.o_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.up_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_up_fused_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.down_proj": dist.RowWiseParallel(),
+                f"{prefix}lm_head.weight": dist.ColWiseParallel(),
+            }
+        },
+        "pp_config": {"split_spec": f"{prefix}llama.layers", "global_spec": f"{prefix}llama.global_layer"},
+    }
+
+    return config
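The module-level auto_dist_config above ignores its self argument and only assembles a plan dictionary, so the plan's shape can be inspected without building a model. A hedged sketch (not part of the commit), assuming the new file lands at paddleformers/transformers/llama/auto_dist_config.py as inferred from the relative import in modeling.py below, and that the dist plan objects can be constructed outside a launched distributed job:

# Hedged sketch: inspect the generated parallelize plan; the import path is an assumption.
from paddleformers.transformers.llama.auto_dist_config import auto_dist_config

plan = auto_dist_config(None, prefix="model.")     # a non-empty prefix must end with "."
print(sorted(plan))                                # ['mp_config', 'pp_config', 'sp_config']
print(plan["pp_config"]["split_spec"])             # 'model.llama.layers'
print(len(plan["sp_config"]["parallelize_plan"]))  # one entry per wrapped sublayer pattern

In practice the plan is consumed through the auto_dist_config method added at the end of modeling.py below, which gates it behind config.run_single_model.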

paddleformers/transformers/llama/modeling.py

Lines changed: 24 additions & 9 deletions
@@ -39,6 +39,7 @@
     get_skip_recompute_ops,
 )
 from ..refined_recompute import recompute as rr_recompute
+from .auto_dist_config import get_dist_config
 
 try:
     from paddle.incubate.nn.functional import fused_rotary_position_embedding
@@ -178,15 +179,16 @@ def assign_kv_heads(num_kv_heads: int, num_gpus: int):
     return assignment_list
 
 
-def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_parallel_output=True):
+def parallel_matmul(x: Tensor, y: Tensor, transpose_y=False, tensor_parallel_output=True, args=None):
     is_fleet_init = True
     tensor_parallel_degree = 1
-    try:
-        hcg = fleet.get_hybrid_communicate_group()
-        model_parallel_group = hcg.get_model_parallel_group()
-        tensor_parallel_degree = hcg.get_model_parallel_world_size()
-    except:
-        is_fleet_init = False
+    if args is None or not args.run_single_model:
+        try:
+            hcg = fleet.get_hybrid_communicate_group()
+            model_parallel_group = hcg.get_model_parallel_group()
+            tensor_parallel_degree = hcg.get_model_parallel_world_size()
+        except:
+            is_fleet_init = False
 
     if paddle.in_dynamic_mode():
         y_is_distributed = y.is_distributed
@@ -1326,6 +1328,8 @@ def _get_hardware_flops(self):
 
     @classmethod
     def _get_name_mappings(cls, config: LlamaConfig) -> list[StateDictNameMapping]:
+        if config.run_single_model:
+            return cls._get_name_mappings()
         mappings: list[StateDictNameMapping] = []
         model_mappings = [
             ["embed_tokens.weight"],
@@ -1360,7 +1364,8 @@ def _get_name_mappings(cls, config: LlamaConfig) -> list[StateDictNameMapping]:
 
     @classmethod
     def _get_tensor_parallel_mappings(cls, config: LlamaConfig, is_split=True):
-
+        if config.run_single_model:
+            return {}
         from ..conversion_utils import split_or_merge_func
 
         fn = split_or_merge_func(
@@ -1420,6 +1425,8 @@ def get_tensor_parallel_split_mappings(num_layers):
 
     @classmethod
     def _get_fuse_or_split_param_mappings(cls, config: LlamaConfig, is_fuse=False):
+        if config.run_single_model:
+            return cls._get_fuse_or_split_param_mappings()
         # return parameter fuse utils
         from ..conversion_utils import split_or_fuse_func
 
@@ -1984,7 +1991,11 @@ def forward(self, hidden_states, tensor_parallel_output=None):
             )
         else:
             logits = parallel_matmul(
-                hidden_states, self.weight, transpose_y=self.transpose_y, tensor_parallel_output=tensor_parallel_output
+                hidden_states,
+                self.weight,
+                transpose_y=self.transpose_y,
+                tensor_parallel_output=tensor_parallel_output,
+                args=self.config,
             )
         return logits
 
@@ -2156,3 +2167,7 @@ def forward(
             hidden_states=outputs.hidden_states,
             attentions=outputs.attentions,
         )
+
+    def auto_dist_config(self, prefix=""):
+        assert self.config.run_single_model, "Use `get_dist_config` only in single card mode."
+        return get_dist_config(self, prefix)
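The parallel_matmul change above reduces to a guard: when the config requests single-card mode, the fleet hybrid-communicate-group lookup is skipped entirely, so nothing distributed is touched. A standalone sketch of that control flow (not the library function itself; the helper name is hypothetical):

from paddle.distributed import fleet

# Standalone sketch mirroring the guard added to parallel_matmul above;
# `args` stands in for the model config carrying run_single_model.
def resolve_tensor_parallel_degree(args=None):
    tensor_parallel_degree = 1
    if args is None or not getattr(args, "run_single_model", False):
        try:
            # Only consulted when single-card mode is off (or no config is passed).
            hcg = fleet.get_hybrid_communicate_group()
            tensor_parallel_degree = hcg.get_model_parallel_world_size()
        except Exception:
            pass  # fleet not initialized: fall back to a plain single-card matmul
    return tensor_parallel_degree

With run_single_model set on the config, the refactored call site now passes args=self.config, so the guard short-circuits before touching any distributed state.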
