
Commit ef7b36e

waliwali777 authored and sevenan2 committed

refactor model in intermediate api mode

1 parent e98380b commit ef7b36e

3 files changed (+131, -0 lines)

paddleformers/transformers/configuration_utils.py

Lines changed: 10 additions & 0 deletions
@@ -539,6 +539,9 @@ class PretrainedConfig:
             Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
             model has a output word embedding layer.
 
+        run_single_model (`bool`, *optional*, defaults to `False`):
+            Whether to run the model in single card mode. When enabled, all parallel degree configurations will be disabled.
+
         dtype (`str`, *optional*):
             The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
             (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
@@ -609,6 +612,13 @@ def __init__(self, **kwargs):
         self.use_cache = kwargs.pop("use_cache", False)
         self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)
 
+        # for run model in single card mode
+        self.run_single_model = kwargs.pop("run_single_model", False)
+        if self.run_single_model:
+            self.tensor_parallel_degree = 1
+            self.sep_parallel_degree = 1
+            self.context_parallel_degree = 1
+
         # for transformers fuse
         self.fuse_linear = kwargs.pop("fuse_linear", False)
         self.fuse_attention_qkv = kwargs.pop("fuse_attention_qkv", False)
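
The snippet below is a usage sketch, not part of the commit: it shows what the new `run_single_model` flag is expected to do, assuming `LlamaConfig` forwards its kwargs to `PretrainedConfig.__init__` the way other PaddleFormers configs do; the import path is likewise an assumption.

```python
# Hypothetical usage sketch (not part of this commit).
from paddleformers.transformers import LlamaConfig  # assumed import path

config = LlamaConfig(run_single_model=True)

# The branch added in __init__ above forces every parallel degree to 1,
# so downstream code sees a plain single-card configuration.
print(config.run_single_model)         # True
print(config.tensor_parallel_degree)   # 1
print(config.sep_parallel_degree)      # 1
print(config.context_parallel_degree)  # 1
```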
paddleformers/transformers/llama/auto_dist_config.py

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+# Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import paddle.distributed as dist
+from paddle.distributed.auto_parallel.intermediate.tensor_parallel import (
+    PrepareLayerInput,
+)
+
+
+def layer_input_parallel_row_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Shard(0), dist.Replicate()])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Shard(0), dist.Replicate()]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Shard(0), dist.Replicate()]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def layer_input_parallel_row_and_col_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Shard(0), dist.Shard(1)])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Shard(0), dist.Shard(1)]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Shard(0), dist.Shard(1)]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def layer_input_replicate_hook(process_mesh):
+    def hook(layer, inputs, output=None):
+        res_inputs = []
+        for input in inputs:
+            if not input.is_dist():
+                x = dist.shard_tensor(input, process_mesh, [dist.Replicate(), dist.Replicate()])
+                res_inputs.append(dist.reshard(x, process_mesh, [dist.Replicate(), dist.Replicate()]))
+            else:
+                res_inputs.append(dist.reshard(input, process_mesh, [dist.Replicate(), dist.Replicate()]))
+        return tuple(res_inputs)
+
+    return hook
+
+
+def auto_dist_config(self, prefix=""):
+    if prefix != "":
+        assert prefix.endswith(".")
+    config = {
+        "sp_config": {
+            "parallelize_plan": {
+                f"{prefix}llama.embed_tokens": [
+                    dist.ColWiseParallel(),
+                    dist.SequenceParallelBegin(),
+                ],
+                f"{prefix}llama.reshard_row": PrepareLayerInput(layer_input_parallel_row_hook),
+                f"{prefix}llama.reshard_row_and_col": PrepareLayerInput(layer_input_parallel_row_and_col_hook),
+                f"{prefix}llama.global_layer.reshard_replicate": PrepareLayerInput(layer_input_replicate_hook),
+                f"{prefix}llama.layers.*.self_attn.qkv_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.q_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.k_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.v_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.o_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn": dist.SequenceParallelDisable(),
+                f"{prefix}llama.layers.*.mlp.gate_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.up_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_up_fused_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.down_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.mlp": dist.SequenceParallelDisable(need_transpose=False),
+                f"{prefix}lm_head.weight": dist.ColWiseParallel(),
+                f"{prefix}lm_head": dist.SequenceParallelEnd(),
+            }
+        },
+        "mp_config": {
+            "parallelize_plan": {
+                f"{prefix}llama.embed_tokens": dist.ColWiseParallel(gather_output=True),
+                f"{prefix}llama.reshard_row": PrepareLayerInput(layer_input_parallel_row_hook),
+                f"{prefix}llama.reshard_row_and_col": PrepareLayerInput(layer_input_parallel_row_and_col_hook),
+                f"{prefix}llama.global_layer.reshard_replicate": PrepareLayerInput(layer_input_replicate_hook),
+                f"{prefix}llama.layers.*.self_attn.qkv_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.q_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.k_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.v_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.self_attn.o_proj": dist.RowWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.up_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.gate_up_fused_proj": dist.ColWiseParallel(),
+                f"{prefix}llama.layers.*.mlp.down_proj": dist.RowWiseParallel(),
+                f"{prefix}lm_head.weight": dist.ColWiseParallel(),
+            }
+        },
+        "pp_config": {"split_spec": f"{prefix}llama.layers", "global_spec": f"{prefix}llama.global_layer"},
+    }
+
+    return config
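
To make the reshard hooks above concrete, here is a standalone sketch of the placement change that `layer_input_parallel_row_hook` applies to one activation tensor. It is not part of the commit; the 2x2 `ProcessMesh` and the multi-card launch are assumptions, while the calls (`dist.shard_tensor`, `dist.reshard`, `dist.Shard`, `dist.Replicate`) are the same Paddle auto-parallel APIs the new file relies on.

```python
# Standalone sketch (assumption, not from the commit).
# Run under a multi-card launcher, e.g. `python -m paddle.distributed.launch`.
import paddle
import paddle.distributed as dist

# Assumed 4-card mesh: dim 0 for data parallel, dim 1 for model parallel.
mesh = dist.ProcessMesh([[0, 1], [2, 3]], dim_names=["dp", "mp"])

x = paddle.rand([8, 1024])  # a dense, non-distributed activation
if not x.is_dist():
    # Shard along dim 0 over the first mesh axis, replicate over the second --
    # the same placement the hook requests.
    x = dist.shard_tensor(x, mesh, [dist.Shard(0), dist.Replicate()])
# reshard is a no-op here, but it enforces the placement when the input
# arrives with a different one.
x = dist.reshard(x, mesh, [dist.Shard(0), dist.Replicate()])
```

The dict returned by `auto_dist_config` groups these per-layer rules into `sp_config`, `mp_config`, and `pp_config` entries keyed by sublayer name patterns; presumably this mirrors the plan format consumed by Paddle's intermediate auto-parallel entry points, though the consumer is not shown in this diff.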

paddleformers/transformers/llama/modeling.py

Lines changed: 8 additions & 0 deletions
@@ -28,6 +28,8 @@
 from ...nn.mlp import MLP
 from ...nn.norm import Norm as GeneralNorm
 from ...nn.pp_model import GeneralModelForCausalLMPipe
+from .auto_dist_config import get_dist_config
+
 from ...utils.log import logger
 from ..cache_utils import Cache, DynamicCache
 from ..masking_utils import create_causal_mask_and_row_indices
@@ -326,6 +328,8 @@ class LlamaPretrainedModel(PretrainedModel):
 
     @classmethod
     def _get_tensor_parallel_mappings(cls, config: LlamaConfig, is_split=True):
+        if config.run_single_model:
+            return {}
         from ..conversion_utils import split_or_merge_func
 
         fn = split_or_merge_func(
@@ -689,6 +693,10 @@ def forward(
             attentions=outputs.attentions,
         )
 
+    def auto_dist_config(self, prefix=""):
+        assert self.config.run_single_model, "Use `get_dist_config` only in single card mode."
+        return get_dist_config(self, prefix)
+
 
 class LlamaForCausalLMPipe(GeneralModelForCausalLMPipe):
     config_class = LlamaConfig
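
Taken together, the three files suggest the following flow. The sketch below is an assumption rather than something the commit demonstrates: import paths and the class that gains `auto_dist_config` are inferred from the diff, not confirmed by it.

```python
# Hypothetical end-to-end sketch (not part of the commit): intermediate-API
# flow implied by the diff. Import paths and class names are assumptions.
from paddleformers.transformers import LlamaConfig, LlamaForCausalLM

# run_single_model=True forces all parallel degrees to 1, and
# _get_tensor_parallel_mappings returns {}, so the model is built as a
# plain single-card network.
config = LlamaConfig(run_single_model=True)
model = LlamaForCausalLM(config)

# The new method asserts run_single_model and returns the sp/mp/pp
# parallelize plan defined in auto_dist_config.py, keyed by sublayer names.
dist_config = model.auto_dist_config()
print(sorted(dist_config.keys()))  # ['mp_config', 'pp_config', 'sp_config']
```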
