From 228b3b7c058205ca6299726f155cf2b1532bf5d3 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Mon, 26 Jan 2026 06:30:30 +0000 Subject: [PATCH 1/3] autotune --- ...num=1,use_fp8_w8a8=false}_NVIDIA_H200.json | 83 +++++++++++++++++++ ...num=8,use_fp8_w8a8=false}_NVIDIA_H200.json | 83 +++++++++++++++++++ .../{topk_num=8}_NVIDIA_H200.json | 38 +++++++++ ...orch.bfloat16,topk_num=8}_NVIDIA_H200.json | 56 +++++++++++++ ...out_dtype=torch.bfloat16}_NVIDIA_H200.json | 56 +++++++++++++ 5 files changed, 316 insertions(+) create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=8}_NVIDIA_H200.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_H200.json create mode 100644 lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H200.json diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json new file mode 100644 index 
000000000..c8100c676 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=192,N=2048,expert_num=128,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=false}_NVIDIA_H200.json @@ -0,0 +1,83 @@ +{ + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "800": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "8192": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + } +} \ No newline at end of file diff --git 
a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json new file mode 100644 index 000000000..4142ee983 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/grouped_matmul:v1/{K=2048,N=384,expert_num=128,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=false}_NVIDIA_H200.json @@ -0,0 +1,83 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 4, + "num_warps": 4 + }, + "100": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 64, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 8 + }, + "16": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 
32, + "NEED_TRANS": false, + "num_stages": 2, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "NEED_TRANS": false, + "num_stages": 3, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=8}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=8}_NVIDIA_H200.json new file mode 100644 index 000000000..002b842cb --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_align_fused:v1/{topk_num=8}_NVIDIA_H200.json @@ -0,0 +1,38 @@ +{ + "1": { + "BLOCK_SIZE": 256, + "num_warps": 4 + }, + "100": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "1024": { + "BLOCK_SIZE": 256, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE": 256, + "num_warps": 8 + }, + "16": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "256": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "32": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "64": { + "BLOCK_SIZE": 128, + "num_warps": 8 + }, + "8": { + "BLOCK_SIZE": 128, + "num_warps": 8 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_H200.json new file mode 100644 index 000000000..bc904bb7f --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/moe_sum_reduce:v1/{hidden_dim=2048,out_dtype=torch.bfloat16,topk_num=8}_NVIDIA_H200.json @@ -0,0 +1,56 @@ +{ + "1": { + "BLOCK_DIM": 256, + "BLOCK_M": 2, + "NUM_STAGE": 2, + "num_warps": 8 + }, + "100": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + 
"NUM_STAGE": 1, + "num_warps": 8 + }, + "1024": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 1 + }, + "128": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 16 + }, + "16": { + "BLOCK_DIM": 128, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 2 + }, + "256": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 2 + }, + "32": { + "BLOCK_DIM": 128, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + }, + "64": { + "BLOCK_DIM": 128, + "BLOCK_M": 1, + "NUM_STAGE": 4, + "num_warps": 4 + }, + "8": { + "BLOCK_DIM": 1024, + "BLOCK_M": 1, + "NUM_STAGE": 1, + "num_warps": 16 + } +} \ No newline at end of file diff --git a/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H200.json b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H200.json new file mode 100644 index 000000000..50499a3e7 --- /dev/null +++ b/lightllm/common/triton_utils/autotune_kernel_configs/triton_3.5.1/NVIDIA_H200/silu_and_mul_fwd:v1/{N=192,out_dtype=torch.bfloat16}_NVIDIA_H200.json @@ -0,0 +1,56 @@ +{ + "1024": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "128": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 8 + }, + "2048": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 1 + }, + "256": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 1, + "num_warps": 8 + }, + "512": { + "BLOCK_M": 1, + "BLOCK_N": 128, + "NUM_STAGES": 2, + "num_warps": 4 + }, + "64": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "8": { + "BLOCK_M": 1, + "BLOCK_N": 64, + "NUM_STAGES": 4, + "num_warps": 1 + }, + "800": { + "BLOCK_M": 1, + "BLOCK_N": 256, + "NUM_STAGES": 2, + "num_warps": 1 + }, + "8192": { + "BLOCK_M": 8, + "BLOCK_N": 256, + "NUM_STAGES": 4, + "num_warps": 1 + } +} 
\ No newline at end of file From 3c1313c36c521a5bbcdb0408451ab7f179421386 Mon Sep 17 00:00:00 2001 From: WANDY666 <1060304770@qq.com> Date: Thu, 29 Jan 2026 06:49:31 +0000 Subject: [PATCH 2/3] merge qkv --- .../layer_weights/meta_weights/__init__.py | 1 + .../meta_weights/mm_weight/__init__.py | 2 +- .../meta_weights/mm_weight/rowmm_weight.py | 46 +++++++++++++++++++ .../layer_infer/transformer_layer_infer.py | 6 ++- .../layer_weights/transformer_layer_weight.py | 15 +++++- 5 files changed, 66 insertions(+), 4 deletions(-) diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py index 8e884012d..edf7fe21b 100644 --- a/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py +++ b/lightllm/common/basemodel/layer_weights/meta_weights/__init__.py @@ -4,6 +4,7 @@ ROWMMWeight, KVROWNMMWeight, ROWBMMWeight, + QKVROWNMMWeight, COLMMWeight, ) from .norm_weight import TpRMSNormWeight, RMSNormWeight, LayerNormWeight, NoTpGEMMANormWeight, QKRMSNORMWeight diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py index e9ae4f30a..958c688d4 100644 --- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py +++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/__init__.py @@ -1,5 +1,5 @@ from .mm_weight import ( MMWeightTpl, ) -from .rowmm_weight import ROWMMWeight, KVROWNMMWeight, ROWBMMWeight +from .rowmm_weight import ROWMMWeight, KVROWNMMWeight, ROWBMMWeight, QKVROWNMMWeight from .colmm_weight import COLMMWeight diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py index 30a699bb6..4977c2971 100644 --- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py +++ 
b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py @@ -92,6 +92,52 @@ def _get_tp_padded_head_num(self, head_num: int): ) +class QKVROWNMMWeight(MMWeightTpl): + def __init__( + self, + in_dim: int, + q_head_num: int, + kv_head_num: int, + head_dim: int, + weight_names: Union[str, List[str]], + data_type: torch.dtype, + bias_names: Optional[Union[str, List[str]]] = None, + quant_method: QuantizationMethod = None, + tp_rank: int = None, + tp_world_size: int = None, + ) -> None: + self.tp_rank_ = tp_rank if tp_rank is not None else get_current_rank_in_dp() + self.tp_world_size_ = tp_world_size if tp_world_size is not None else get_dp_world_size() + self.repeat_times = 1 + assert q_head_num % self.tp_world_size_ == 0, ( + f"q_head_num must be divisible by tp_world_size_, " f"but found: {q_head_num} % {self.tp_world_size_}" + ) + assert kv_head_num % self.tp_world_size_ == 0 or self.tp_world_size_ % kv_head_num == 0, ( + f"kv_head_num must be divisible by tp_world_size_ or " + f"tp_world_size_ must be divisible by kv_head_num, " + f"but found: {kv_head_num} % {self.tp_world_size_}" + ) + q_hidden_size = (q_head_num // self.tp_world_size_) * head_dim + kv_hidden_size = (kv_head_num // self.tp_world_size_) * head_dim + out_dims = [q_hidden_size, kv_hidden_size, kv_hidden_size] + super().__init__( + in_dim=in_dim, + out_dims=out_dims, + weight_names=weight_names, + data_type=data_type, + bias_names=bias_names, + quant_method=quant_method, + tp_rank=self.tp_rank_, + tp_world_size=self.tp_world_size_, + ) + self.param_slicer = get_row_slice_mixin( + self.quant_method.method_name, + tp_rank=self.tp_rank_, + tp_world_size=self.tp_world_size_, + repeat_times=self.repeat_times, + ) + + class ROWBMMWeight(BMMWeightTpl): def __init__( self, diff --git a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py index 71b16cb34..9eccddffc 100644 --- 
a/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py +++ b/lightllm/models/qwen3_moe/layer_infer/transformer_layer_infer.py @@ -60,8 +60,10 @@ def _get_qkv( layer_weight: Qwen3MOETransformerLayerWeight, ) -> Tuple[torch.Tensor, torch.Tensor]: input = input.view(-1, self.embed_dim_) - q = layer_weight.q_proj.mm(input) - cache_kv = layer_weight.kv_proj.mm(input) + qkv = layer_weight.qkv_proj.mm(input) + q, cache_kv = qkv.split( + [self.tp_q_head_num_ * self.head_dim_, (self.tp_k_head_num_ + self.tp_v_head_num_) * self.head_dim_], dim=-1 + ) layer_weight.q_norm_weight_(q, eps=self.eps_) layer_weight.k_norm_weight_( cache_kv[:, : self.tp_k_head_num_ * self.head_dim_], diff --git a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py index a889609d7..13ba6cbe0 100644 --- a/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py +++ b/lightllm/models/qwen3_moe/layer_weights/transformer_layer_weight.py @@ -1,6 +1,6 @@ import os from lightllm.models.qwen3.layer_weights.transformer_layer_weight import Qwen3TransformerLayerWeight -from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, FusedMoeWeight +from lightllm.common.basemodel.layer_weights.meta_weights import ROWMMWeight, FusedMoeWeight, QKVROWNMMWeight class Qwen3MOETransformerLayerWeight(Qwen3TransformerLayerWeight): @@ -66,3 +66,16 @@ def _init_moe(self): layer_num=self.layer_num_, network_config=self.network_config_, ) + + def _init_qkv(self): + in_dim = self.n_embed + self.qkv_proj = QKVROWNMMWeight( + in_dim=in_dim, + q_head_num=self.q_head_num_, + kv_head_num=self.k_head_num_, + head_dim=self.head_dim, + weight_names=[self._q_weight_name, self._k_weight_name, self._v_weight_name], + data_type=self.data_type_, + bias_names=[self._q_bias_name, self._k_bias_name, self._v_bias_name], + quant_method=self.get_quant_method("qkv_proj"), + ) From 45ce36a214cab9f937c8be1285438ee51f1da214 Mon 
Sep 17 00:00:00 2001 From: wangzaijun Date: Tue, 3 Feb 2026 03:13:34 +0000 Subject: [PATCH 3/3] fix assert. --- .../layer_weights/meta_weights/mm_weight/rowmm_weight.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py index 4977c2971..0b8b6ac56 100644 --- a/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py +++ b/lightllm/common/basemodel/layer_weights/meta_weights/mm_weight/rowmm_weight.py @@ -112,10 +112,8 @@ def __init__( assert q_head_num % self.tp_world_size_ == 0, ( f"q_head_num must be divisible by tp_world_size_, " f"but found: {q_head_num} % {self.tp_world_size_}" ) - assert kv_head_num % self.tp_world_size_ == 0 or self.tp_world_size_ % kv_head_num == 0, ( - f"kv_head_num must be divisible by tp_world_size_ or " - f"tp_world_size_ must be divisible by kv_head_num, " - f"but found: {kv_head_num} % {self.tp_world_size_}" + assert kv_head_num % self.tp_world_size_ == 0, ( + f"kv_head_num must be divisible by tp_world_size_, " f"but found: {kv_head_num} % {self.tp_world_size_}" ) q_hidden_size = (q_head_num // self.tp_world_size_) * head_dim kv_hidden_size = (kv_head_num // self.tp_world_size_) * head_dim