Commit 8d6ca3a

Author: root (committed)

rename run_single_model and remove redundant code

1 parent 8e83c57 · commit 8d6ca3a

File tree: 3 files changed (+5, -13 lines)


paddleformers/cli/train/auto_parallel/workflow.py

Lines changed: 1 addition & 9 deletions
@@ -270,7 +270,7 @@ def run_auto_parallel(model_args, data_args, generating_args, training_args):
     training_args.no_recompute_layers.sort()
 
     if training_args.use_intermediate_api:
-        config.run_single_model = True
+        config.use_single_model_implementation = True
         config.tensor_parallel_degree = 1
         config.sharding_parallel_degree = 1
         config.sep_parallel_degree = 1
@@ -290,13 +290,6 @@ def run_auto_parallel(model_args, data_args, generating_args, training_args):
 
     if not training_args.enable_auto_parallel and training_args.pipeline_parallel_degree > 1:
         model_class = AutoModelForCausalLMPipe
-        if "LLama" in str(config.architectures):
-            try:
-                from utils.register_reshard import register_pp_reshard_information
-
-                register_pp_reshard_information(config.num_hidden_layers)
-            except:
-                print("Not register llama pp reshard information.")
 
     architectures_to_check = {"Qwen2Moe", "DeepseekV2", "DeepseekV3"}
     if (
@@ -306,7 +299,6 @@ def run_auto_parallel(model_args, data_args, generating_args, training_args):
         training_args.use_expert_parallel = True
 
     if model_args.continue_training:
-        # NOTE(gongenlei): new add
         if training_args.autotuner_benchmark:
             model = model_class.from_config(config, dtype=dtype)
         else:
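The net effect of the first hunk: when training_args.use_intermediate_api is set, the workflow flags single-card execution under the new attribute name and pins every parallel degree to 1 before the model is built. A minimal sketch of that path, where the wrapper function is hypothetical and only the attribute names come from the hunk above:

    # Hypothetical helper; only the attribute names are taken from the diff above.
    def force_single_model_mode(config, training_args):
        if training_args.use_intermediate_api:
            # renamed from config.run_single_model in this commit
            config.use_single_model_implementation = True
            config.tensor_parallel_degree = 1
            config.sharding_parallel_degree = 1
            config.sep_parallel_degree = 1
        return config

After the rename, only the new attribute name triggers the single-card handling in PretrainedConfig.__init__ (see the next file).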

paddleformers/transformers/configuration_utils.py

Lines changed: 3 additions & 3 deletions
@@ -537,7 +537,7 @@ class PretrainedConfig:
             Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
             model has a output word embedding layer.
 
-        run_single_model (`bool`, *optional*, defaults to `False`):
+        use_single_model_implementation (`bool`, *optional*, defaults to `False`):
             Whether to run the model in single card mode. When enabled, all parallel degree configurations will be disabled.
 
         dtype (`str`, *optional*):
@@ -605,8 +605,8 @@ def __init__(self, **kwargs):
         self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)
 
         # for run model in single card mode
-        self.run_single_model = kwargs.pop("run_single_model", False)
-        if self.run_single_model:
+        self.use_single_model_implementation = kwargs.pop("use_single_model_implementation", False)
+        if self.use_single_model_implementation:
             self.tensor_parallel_degree = 1
             self.sep_parallel_degree = 1
             self.context_parallel_degree = 1
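A minimal usage sketch of the renamed constructor kwarg, assuming PretrainedConfig can be instantiated directly with keyword arguments only; the import path simply mirrors the file path shown above:

    from paddleformers.transformers.configuration_utils import PretrainedConfig

    # __init__ pops the renamed kwarg; when truthy, the parallel degrees shown
    # in the hunk are pinned to 1, i.e. single card mode.
    config = PretrainedConfig(use_single_model_implementation=True)
    assert config.tensor_parallel_degree == 1
    assert config.sep_parallel_degree == 1
    assert config.context_parallel_degree == 1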

paddleformers/transformers/llama/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -2160,5 +2160,5 @@ def forward(
         )
 
     def auto_dist_config(self, prefix=""):
-        assert self.config.run_single_model, "Use `get_dist_config` only in single card mode."
+        assert self.config.use_single_model_implementation, "Use `get_dist_config` only in single card mode."
        return get_dist_config(self, prefix)
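Because auto_dist_config now asserts the renamed flag, single-card mode has to be enabled on the config before asking the model for its distributed configuration. A rough sketch of the calling convention, where LlamaConfig and LlamaForCausalLM are assumed class names and not part of this diff:

    # Assumed class names; only auto_dist_config and the renamed flag appear in the diff.
    config = LlamaConfig(use_single_model_implementation=True)
    model = LlamaForCausalLM(config)
    dist_config = model.auto_dist_config()  # passes the assert, delegates to get_dist_config

    # Without the flag (it defaults to False) the same call raises AssertionError:
    # LlamaForCausalLM(LlamaConfig()).auto_dist_config()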
