Commit facbe9b

rename run_single_model and remove redundant code
1 parent 68fe785 commit facbe9b

File tree

3 files changed (+5, -13 lines)


paddleformers/cli/train/auto_parallel/workflow.py

Lines changed: 1 addition & 9 deletions
@@ -268,7 +268,7 @@ def run_auto_parallel(model_args, data_args, generating_args, training_args):
         training_args.no_recompute_layers.sort()
 
     if training_args.use_intermediate_api:
-        config.run_single_model = True
+        config.use_single_model_implementation = True
         config.tensor_parallel_degree = 1
         config.sharding_parallel_degree = 1
         config.sep_parallel_degree = 1
@@ -288,13 +288,6 @@ def run_auto_parallel(model_args, data_args, generating_args, training_args):
 
     if not training_args.enable_auto_parallel and training_args.pipeline_parallel_degree > 1:
         model_class = AutoModelForCausalLMPipe
-        if "LLama" in str(config.architectures):
-            try:
-                from utils.register_reshard import register_pp_reshard_information
-
-                register_pp_reshard_information(config.num_hidden_layers)
-            except:
-                print("Not register llama pp reshard information.")
 
     architectures_to_check = {"Qwen2Moe", "DeepseekV2", "DeepseekV3"}
     if (
@@ -304,7 +297,6 @@ def run_auto_parallel(model_args, data_args, generating_args, training_args):
         training_args.use_expert_parallel = True
 
     if model_args.continue_training:
-        # NOTE(gongenlei): new add
         if training_args.autotuner_benchmark:
             model = model_class.from_config(config, dtype=dtype)
         else:
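A minimal sketch of the behavior the first hunk encodes, using `types.SimpleNamespace` stand-ins for the real `training_args` and `config` objects (the stand-ins are assumptions for illustration, not paddleformers types): when the intermediate API is enabled, the config is switched to the single-model implementation and every parallel degree is forced back to 1.

    from types import SimpleNamespace

    # Hypothetical stand-ins for the real training_args/config objects.
    training_args = SimpleNamespace(use_intermediate_api=True)
    config = SimpleNamespace(
        use_single_model_implementation=False,
        tensor_parallel_degree=8,
        sharding_parallel_degree=2,
        sep_parallel_degree=2,
    )

    if training_args.use_intermediate_api:
        # Renamed flag: was `config.run_single_model` before this commit.
        config.use_single_model_implementation = True
        config.tensor_parallel_degree = 1
        config.sharding_parallel_degree = 1
        config.sep_parallel_degree = 1

    assert config.use_single_model_implementation is True
    assert config.tensor_parallel_degree == 1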

paddleformers/transformers/configuration_utils.py

Lines changed: 3 additions & 3 deletions
@@ -539,7 +539,7 @@ class PretrainedConfig:
             Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
             model has a output word embedding layer.
 
-        run_single_model (`bool`, *optional*, defaults to `False`):
+        use_single_model_implementation (`bool`, *optional*, defaults to `False`):
             Whether to run the model in single card mode. When enabled, all parallel degree configurations will be disabled.
 
         dtype (`str`, *optional*):
@@ -613,8 +613,8 @@ def __init__(self, **kwargs):
         self.tie_word_embeddings = kwargs.pop("tie_word_embeddings", True)
 
         # for run model in single card mode
-        self.run_single_model = kwargs.pop("run_single_model", False)
-        if self.run_single_model:
+        self.use_single_model_implementation = kwargs.pop("use_single_model_implementation", False)
+        if self.use_single_model_implementation:
             self.tensor_parallel_degree = 1
             self.sep_parallel_degree = 1
             self.context_parallel_degree = 1
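A short sketch of how a downstream caller passes the renamed kwarg, assuming `PretrainedConfig` can be constructed directly and that no later logic overrides the degrees pinned in the `__init__` hunk above:

    from paddleformers.transformers.configuration_utils import PretrainedConfig

    # The old `run_single_model` kwarg is gone after this commit; use the new name.
    config = PretrainedConfig(use_single_model_implementation=True)

    # Per the __init__ hunk, single-model mode pins these degrees to 1.
    print(config.tensor_parallel_degree)   # 1
    print(config.sep_parallel_degree)      # 1
    print(config.context_parallel_degree)  # 1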

paddleformers/transformers/llama/modeling.py

Lines changed: 1 addition & 1 deletion
@@ -693,7 +693,7 @@ def forward(
         )
 
     def auto_dist_config(self, prefix=""):
-        assert self.config.run_single_model, "Use `get_dist_config` only in single card mode."
+        assert self.config.use_single_model_implementation, "Use `get_dist_config` only in single card mode."
         return get_dist_config(self, prefix)
 
 
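A self-contained sketch that mirrors the renamed guard, using a hypothetical config stub and a placeholder return value instead of the real model class and `get_dist_config`: callers must enable single-model mode on the config before calling `auto_dist_config`, otherwise the assert fires.

    class _ConfigStub:
        # Hypothetical stand-in for PretrainedConfig; only the renamed flag matters here.
        use_single_model_implementation = False

    def auto_dist_config(config, prefix=""):
        # Mirrors the guard in the hunk above after the rename.
        assert config.use_single_model_implementation, "Use `get_dist_config` only in single card mode."
        return {"prefix": prefix}  # placeholder for get_dist_config(self, prefix)

    cfg = _ConfigStub()
    cfg.use_single_model_implementation = True   # required before calling
    print(auto_dist_config(cfg))                 # {'prefix': ''}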
