Commit be9e7df

【fleet】fix Fleet lora model (#2997)
1 parent e5b8ac1 · commit be9e7df

File tree (3 files changed, +203 -1 lines):

- paddleformers/peft/lora/lora_layers.py
- paddleformers/peft/lora/lora_model.py
- paddleformers/transformers/glm4_moe/modeling.py

paddleformers/peft/lora/lora_layers.py

Lines changed: 65 additions & 0 deletions

@@ -300,6 +300,19 @@ def extra_repr(self):
         return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}"


+class FleetLoRALinear(LoRALinear):
+    def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
+        super().__init__(in_features, out_features, **kwargs)
+        self.skip_bias_add = skip_bias_add
+
+    def forward(self, input: paddle.Tensor):
+        out_bias = self.bias if self.skip_bias_add else None
+        if self.skip_bias_add:
+            self.bias = None
+        output = super().forward(input)
+        return output, out_bias
+
+
 class RowParallelLoRALinear(RowParallelLinear):
     def __init__(
         self,

@@ -461,6 +474,19 @@ def extra_repr(self):
         return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}"


+class FleetRowParallelLoRALinear(RowParallelLoRALinear):
+    def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
+        super().__init__(in_features, out_features, **kwargs)
+        self.skip_bias_add = skip_bias_add
+
+    def forward(self, input: paddle.Tensor):
+        out_bias = self.bias if self.skip_bias_add else None
+        if self.skip_bias_add:
+            self.bias = None
+        output = super().forward(input)
+        return output, out_bias
+
+
 class RowSequenceParallelLoRALinear(RowSequenceParallelLinear):
     def __init__(
         self,

@@ -579,6 +605,19 @@ def extra_repr(self):
         return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}"


+class FleetRowSequenceParallelLoRALinear(RowSequenceParallelLoRALinear):
+    def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
+        super().__init__(in_features, out_features, **kwargs)
+        self.skip_bias_add = skip_bias_add
+
+    def forward(self, input: paddle.Tensor):
+        out_bias = self.bias if self.skip_bias_add else None
+        if self.skip_bias_add:
+            self.bias = None
+        output = super().forward(input)
+        return output, out_bias
+
+
 class ColumnParallelLoRALinear(ColumnParallelLinear):
     def __init__(
         self,

@@ -722,6 +761,19 @@ def extra_repr(self):
         return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}"


+class FleetColumnParallelLoRALinear(ColumnParallelLoRALinear):
+    def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
+        super().__init__(in_features, out_features, **kwargs)
+        self.skip_bias_add = skip_bias_add
+
+    def forward(self, input: paddle.Tensor):
+        out_bias = self.bias if self.skip_bias_add else None
+        if self.skip_bias_add:
+            self.bias = None
+        output = super().forward(input)
+        return output, out_bias
+
+
 class ColumnSequenceParallelLoRALinear(ColumnSequenceParallelLinear):
     def __init__(
         self,

@@ -843,6 +895,19 @@ def extra_repr(self):
         return f"in_features={self.weight.shape[0]}, out_features={self.weight.shape[1]}, rank={self.r}{name}"


+class FleetColumnSequenceParallelLoRALinear(ColumnSequenceParallelLoRALinear):
+    def __init__(self, in_features, out_features, skip_bias_add, **kwargs):
+        super().__init__(in_features, out_features, **kwargs)
+        self.skip_bias_add = skip_bias_add
+
+    def forward(self, input: paddle.Tensor):
+        out_bias = self.bias if self.skip_bias_add else None
+        if self.skip_bias_add:
+            self.bias = None
+        output = super().forward(input)
+        return output, out_bias
+
+
 class LoRAConv2D(nn.Conv2D):
     # LoRA implemented in a dense layer
     def __init__(
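
All five Fleet* wrappers added above follow the same pattern: they forward constructor kwargs to the existing LoRA layer and, when skip_bias_add is set, return an (output, bias) tuple instead of adding the bias inside forward, matching the calling convention of Fleet's tensor-parallel linears. Below is a minimal, self-contained sketch of that convention; SkipBiasLinear is a toy stand-in for illustration, not a class from this commit.

# Toy illustration of the skip_bias_add convention (not from this commit).
import paddle


class SkipBiasLinear(paddle.nn.Linear):
    def __init__(self, in_features, out_features, skip_bias_add=False):
        super().__init__(in_features, out_features)
        self.skip_bias_add = skip_bias_add

    def forward(self, x):
        out = paddle.matmul(x, self.weight)
        if self.skip_bias_add:
            # Defer the bias add to the caller, which may fuse it with a later op.
            return out, self.bias
        return out + self.bias, None


layer = SkipBiasLinear(8, 16, skip_bias_add=True)
out, bias = layer(paddle.randn([4, 8]))
y = out + bias  # the caller applies the bias explicitly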

paddleformers/peft/lora/lora_model.py

Lines changed: 135 additions & 1 deletion
@@ -33,6 +33,10 @@
     PipelineLayer,
     RowParallelLinear,
 )
+from paddlefleet.tensor_parallel import (
+    ColumnParallelLinear as FleetColumnParallelLinear,
+)
+from paddlefleet.tensor_parallel import RowParallelLinear as FleetRowParallelLinear

 from ...trainer.argparser import strtobool
 from ...transformers import linear_utils

@@ -92,6 +96,11 @@ def get_lora_layers():
     from .lora_layers import (
         ColumnParallelLoRALinear,
         ColumnSequenceParallelLoRALinear,
+        FleetColumnParallelLoRALinear,
+        FleetColumnSequenceParallelLoRALinear,
+        FleetLoRALinear,
+        FleetRowParallelLoRALinear,
+        FleetRowSequenceParallelLoRALinear,
         LoRAConv2D,
         LoRALinear,
         RowParallelLoRALinear,

@@ -105,6 +114,11 @@ def get_lora_layers():
         "LoRALinear": LoRALinear,
         "RowParallelLoRALinear": RowParallelLoRALinear,
         "RowSequenceParallelLoRALinear": RowSequenceParallelLoRALinear,
+        "FleetLoRALinear": FleetLoRALinear,
+        "FleetRowParallelLoRALinear": FleetRowParallelLoRALinear,
+        "FleetColumnParallelLoRALinear": FleetColumnParallelLoRALinear,
+        "FleetRowSequenceParallelLoRALinear": FleetRowSequenceParallelLoRALinear,
+        "FleetColumnSequenceParallelLoRALinear": FleetColumnSequenceParallelLoRALinear,
     }


@@ -115,6 +129,12 @@ def get_lora_layers():
 LoRALinear = lora_layers["LoRALinear"]
 RowParallelLoRALinear = lora_layers["RowParallelLoRALinear"]
 RowSequenceParallelLoRALinear = lora_layers["RowSequenceParallelLoRALinear"]
+FleetLoRALinear = lora_layers["FleetLoRALinear"]
+FleetRowParallelLoRALinear = lora_layers["FleetRowParallelLoRALinear"]
+FleetColumnParallelLoRALinear = lora_layers["FleetColumnParallelLoRALinear"]
+FleetRowSequenceParallelLoRALinear = lora_layers["FleetRowSequenceParallelLoRALinear"]
+FleetColumnSequenceParallelLoRALinear = lora_layers["FleetColumnSequenceParallelLoRALinear"]
+

 from ...quantization.quantization_linear import (
     ColumnParallelQuantizationLinear,
@@ -167,6 +187,8 @@ def __init__(self, model, lora_config: LoRAConfig) -> None:
             self.lora_config.lora_use_mixer or self.lora_config.use_mora
         ):
             raise NotImplementedError("lora_use_mixer or mora is not supported in tensor parallel mode.")
+        if hasattr(self.model.config, "tensor_model_parallel_size"):
+            self.model.config.tensor_parallel_degree = self.model.config.tensor_model_parallel_size
         if self.lora_config.tensor_parallel_degree != self.model.config.tensor_parallel_degree:
             self.lora_config.tensor_parallel_degree = self.model.config.tensor_parallel_degree
             logger.warning(

@@ -566,7 +588,10 @@ def replace_name_and_gen_index_lora(path):
         if is_main_process:
             lora_config_to_save.save_pretrained(save_directory)
             if save_model_config:
-                model_config_to_save = copy.deepcopy(self.model.config)
+                if hasattr(self.model, "config_to_save"):
+                    model_config_to_save = copy.deepcopy(self.model.config_to_save)
+                else:
+                    model_config_to_save = copy.deepcopy(self.model.config)
                 if merge_tensor_parallel:
                     model_config_to_save.tensor_parallel_degree = -1
                 model_config_to_save.save_pretrained(save_directory)
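
The two hunks above bridge naming differences between a Fleet model config and the existing LoRA code path: a tensor_model_parallel_size attribute, if present, is mirrored into tensor_parallel_degree so the downstream degree checks keep working, and saving prefers the model's config_to_save attribute when one exists. A rough sketch of the aliasing idea follows; DummyConfig is illustrative only, not the real config class.

# Illustrative only: DummyConfig stands in for the real model config.
class DummyConfig:
    tensor_model_parallel_size = 2  # Fleet-style attribute name


config = DummyConfig()
if hasattr(config, "tensor_model_parallel_size"):
    # Expose the value under the name the LoRA code already reads.
    config.tensor_parallel_degree = config.tensor_model_parallel_size

assert config.tensor_parallel_degree == 2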
@@ -712,6 +737,115 @@ def _find_and_replace_module(self, model, module_name, lora_config):
                     self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False)
                     self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False)
                     self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False)
+        elif isinstance(module, FleetColumnParallelLinear) or isinstance(module, FleetRowParallelLinear):
+            if module.world_size == 1:
+                lora_module = FleetLoRALinear(
+                    in_features=module.weight.shape[0],
+                    out_features=module.weight.shape[1],
+                    skip_bias_add=module.skip_bias_add,
+                    r=lora_config.r,
+                    lora_alpha=lora_config.lora_alpha,
+                    lora_dropout=lora_config.lora_dropout,
+                    rslora=lora_config.rslora,
+                    lora_plus_scale=lora_config.lora_plus_scale,
+                    pissa=lora_config.pissa,
+                    bias_attr=False if module.bias is None else None,
+                    use_quick_lora=lora_config.use_quick_lora,
+                    lora_use_mixer=lora_config.lora_use_mixer,
+                    use_mora=lora_config.use_mora,
+                    mp_moe=getattr(module.weight, "mp_moe", False),
+                    is_distributed=getattr(module.weight, "is_distributed", False),
+                    lorapro=lora_config.lorapro,
+                )
+            elif isinstance(module, FleetRowParallelLinear):
+                # recover the original output_features
+                if module.sequence_parallel:
+                    lora_module = FleetRowSequenceParallelLoRALinear(
+                        in_features=module.weight.shape[0] * module.world_size,
+                        out_features=module.weight.shape[1],
+                        skip_bias_add=module.skip_bias_add,
+                        has_bias=module.bias is not None,
+                        input_is_parallel=module.input_is_parallel,
+                        r=lora_config.r,
+                        lora_alpha=lora_config.lora_alpha,
+                        lora_dropout=lora_config.lora_dropout,
+                        rslora=lora_config.rslora,
+                        lora_plus_scale=lora_config.lora_plus_scale,
+                        use_quick_lora=lora_config.use_quick_lora,
+                    )
+                else:
+                    lora_module = FleetRowParallelLoRALinear(
+                        in_features=module.weight.shape[0] * module.world_size,
+                        out_features=module.weight.shape[1],
+                        skip_bias_add=module.skip_bias_add,
+                        has_bias=module.bias is not None,
+                        input_is_parallel=module.input_is_parallel,
+                        r=lora_config.r,
+                        lora_alpha=lora_config.lora_alpha,
+                        lora_dropout=lora_config.lora_dropout,
+                        rslora=lora_config.rslora,
+                        lora_plus_scale=lora_config.lora_plus_scale,
+                        pissa=lora_config.pissa,
+                        use_quick_lora=lora_config.use_quick_lora,
+                    )
+                # Lora column parallel will spilt lora A matrix
+                self.add_lora_split_mapping(module_name + ".lora_A", is_column=False)
+
+                # for lora qat
+                if self.lora_config.do_qat:
+                    self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=False)
+                    self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False)
+                    self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False)
+            elif isinstance(module, FleetColumnParallelLinear):
+                # recover the original output_features
+                output_features = module.weight.shape[1] * module.world_size
+                if module.sequence_parallel:
+                    lora_module = FleetColumnSequenceParallelLoRALinear(
+                        in_features=module.weight.shape[0],
+                        out_features=output_features,
+                        skip_bias_add=module.skip_bias_add,
+                        gather_output=module.gather_output,
+                        has_bias=module.bias is not None,
+                        r=lora_config.r,
+                        lora_alpha=lora_config.lora_alpha,
+                        lora_dropout=lora_config.lora_dropout,
+                        rslora=lora_config.rslora,
+                        lora_plus_scale=lora_config.lora_plus_scale,
+                        lora_A_weight_attr=paddle.ParamAttr(
+                            initializer=nn.initializer.KaimingUniform(
+                                negative_slope=math.sqrt(5), nonlinearity="leaky_relu"
+                            )
+                        ),
+                        use_quick_lora=lora_config.use_quick_lora,
+                    )
+                else:
+                    lora_module = FleetColumnParallelLoRALinear(
+                        in_features=module.weight.shape[0],
+                        out_features=output_features,
+                        skip_bias_add=module.skip_bias_add,
+                        gather_output=module.gather_output,
+                        has_bias=module.bias is not None,
+                        r=lora_config.r,
+                        lora_alpha=lora_config.lora_alpha,
+                        lora_dropout=lora_config.lora_dropout,
+                        rslora=lora_config.rslora,
+                        lora_plus_scale=lora_config.lora_plus_scale,
+                        pissa=lora_config.pissa,
+                        lora_A_weight_attr=paddle.ParamAttr(
+                            initializer=nn.initializer.KaimingUniform(
+                                negative_slope=math.sqrt(5), nonlinearity="leaky_relu"
+                            )
+                        ),
+                        use_quick_lora=lora_config.use_quick_lora,
+                    )
+                # Lora column parallel will spilt lora B matrix
+                self.add_lora_split_mapping(module_name + ".lora_B", is_column=True)
+
+                # for lora qat
+                if self.lora_config.do_qat:
+                    self.add_lora_split_mapping(module_name + ".weight_quanter._scale", is_column=True)
+                    self.add_lora_split_mapping(module_name + ".activation_quanter._scale", is_column=False)
+                    self.add_lora_split_mapping(module_name + ".activation_quanter.quanter._scale", is_column=False)
         elif isinstance(module, QuantizationLinear):
             lora_module = QuantizationLoRALinear(module, lora_config)
         elif isinstance(module, ColumnParallelQuantizationLinear):
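
The new branch above selects a replacement class per Fleet module: a Fleet parallel linear running with world_size == 1 degrades to the plain FleetLoRALinear, otherwise the row- or column-parallel variant (or its sequence-parallel version) is chosen, and the corresponding lora_A / lora_B tensor-parallel split mappings are registered. A condensed, illustrative summary of that selection is sketched below; the helper is not the actual implementation, though the attribute names are taken from the diff.

# Illustrative summary of the dispatch above, not the real code path.
def pick_fleet_lora_class(module, is_row_parallel: bool) -> str:
    if module.world_size == 1:
        return "FleetLoRALinear"  # no parallel split needed
    if is_row_parallel:
        # row parallel: lora_A is split (is_column=False in the mapping)
        return ("FleetRowSequenceParallelLoRALinear"
                if module.sequence_parallel else "FleetRowParallelLoRALinear")
    # column parallel: lora_B is split (is_column=True in the mapping)
    return ("FleetColumnSequenceParallelLoRALinear"
            if module.sequence_parallel else "FleetColumnParallelLoRALinear")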

paddleformers/transformers/glm4_moe/modeling.py

Lines changed: 3 additions & 0 deletions

@@ -58,6 +58,8 @@ class GLMMoEModelProvider(GPTModelProvider):

     bias_activation_fusion: bool = True

+    transform_rules = {"tensor_parallel_degree": "tensor_model_parallel_size", "dtype": "params_dtype"}
+

 def eager_attention_forward(
     module: nn.Layer,

@@ -1494,6 +1496,7 @@ def __new__(cls, config):
         gpt_model = model_provider.provide()
         gpt_model._gen_aoa_config = cls._gen_aoa_config
         gpt_model._gen_inv_aoa_config = cls._gen_inv_aoa_config
+        gpt_model._get_tensor_parallel_mappings = cls._get_tensor_parallel_mappings
         gpt_model.config_to_save = config
         return gpt_model
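
The transform_rules entry appears to map PaddleFormers config attribute names to their Fleet-side counterparts (tensor_parallel_degree -> tensor_model_parallel_size, dtype -> params_dtype). A hedged sketch of what applying such a rename table could look like follows; apply_transform_rules is a hypothetical helper, not a function from this repository.

# Hypothetical helper illustrating a key-rename table like transform_rules.
def apply_transform_rules(src: dict, rules: dict) -> dict:
    out = dict(src)
    for old_key, new_key in rules.items():
        if old_key in out:
            out[new_key] = out.pop(old_key)
    return out


rules = {"tensor_parallel_degree": "tensor_model_parallel_size", "dtype": "params_dtype"}
print(apply_transform_rules({"tensor_parallel_degree": 2, "dtype": "bfloat16"}, rules))
# {'tensor_model_parallel_size': 2, 'params_dtype': 'bfloat16'}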
