From b5f2a50fe97c34ef2804c9fbc217ed88327e5a71 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Tue, 10 Feb 2026 17:30:05 +0000 Subject: [PATCH 01/12] Added ability to accumulate in FP16 for GEMM. Widens once at the end of loops. --- kernel/riscv64/shgemm_kernel_16x8_zvl256b.c | 472 +++++++++++++++++++- 1 file changed, 462 insertions(+), 10 deletions(-) diff --git a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c index 746e07d55f..217b6dbfca 100644 --- a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c @@ -1,6 +1,8 @@ - #include "common.h" #include + +#define FP16_NARROW + int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { BLASLONG gvl = 0; @@ -28,7 +30,17 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl ); ai += 16; - + +#ifdef FP16_NARROW + vfloat16m1_t result0 = __riscv_vfmul_vf_f16m1( A0, B0, gvl); + vfloat16m1_t result1 = __riscv_vfmul_vf_f16m1( A0, B1, gvl); + vfloat16m1_t result2 = __riscv_vfmul_vf_f16m1( A0, B2, gvl); + vfloat16m1_t result3 = __riscv_vfmul_vf_f16m1( A0, B3, gvl); + vfloat16m1_t result4 = __riscv_vfmul_vf_f16m1( A0, B4, gvl); + vfloat16m1_t result5 = __riscv_vfmul_vf_f16m1( A0, B5, gvl); + vfloat16m1_t result6 = __riscv_vfmul_vf_f16m1( A0, B6, gvl); + vfloat16m1_t result7 = __riscv_vfmul_vf_f16m1( A0, B7, gvl); +#else vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl); vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl); vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl); @@ -37,6 +49,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, vfloat32m2_t result5 = __riscv_vfwmul_vf_f32m2( A0, B5, gvl); vfloat32m2_t result6 = __riscv_vfwmul_vf_f32m2( A0, B6, gvl); vfloat32m2_t result7 = __riscv_vfwmul_vf_f32m2( A0, B7, gvl); +#endif for(BLASLONG k=1; k Date: Tue, 10 Feb 2026 18:30:02 +0000 Subject: [PATCH 02/12] 128-bit versions. --- kernel/riscv64/shgemm_kernel_8x8_zvl128b.c | 372 ++++++++++++++++++++- 1 file changed, 370 insertions(+), 2 deletions(-) diff --git a/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c index 3197f7c005..9e720fd844 100644 --- a/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c +++ b/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c @@ -1,4 +1,3 @@ - #include "common.h" #include @@ -30,6 +29,16 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, vfloat16m1_t A0 = __riscv_vle16_v_f16m1( &A[ai+0*gvl], gvl ); ai += 8; +#ifdef FP16_NARROW + vfloat16m1_t result0 = __riscv_vfmul_vf_f16m1( A0, B0, gvl); + vfloat16m1_t result1 = __riscv_vfmul_vf_f16m1( A0, B1, gvl); + vfloat16m1_t result2 = __riscv_vfmul_vf_f16m1( A0, B2, gvl); + vfloat16m1_t result3 = __riscv_vfmul_vf_f16m1( A0, B3, gvl); + vfloat16m1_t result4 = __riscv_vfmul_vf_f16m1( A0, B4, gvl); + vfloat16m1_t result5 = __riscv_vfmul_vf_f16m1( A0, B5, gvl); + vfloat16m1_t result6 = __riscv_vfmul_vf_f16m1( A0, B6, gvl); + vfloat16m1_t result7 = __riscv_vfmul_vf_f16m1( A0, B7, gvl); +#else vfloat32m2_t result0 = __riscv_vfwmul_vf_f32m2( A0, B0, gvl); vfloat32m2_t result1 = __riscv_vfwmul_vf_f32m2( A0, B1, gvl); vfloat32m2_t result2 = __riscv_vfwmul_vf_f32m2( A0, B2, gvl); @@ -38,6 +47,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, vfloat32m2_t result5 = __riscv_vfwmul_vf_f32m2( A0, B5, gvl); vfloat32m2_t result6 = __riscv_vfwmul_vf_f32m2( A0, B6, gvl); vfloat32m2_t result7 = __riscv_vfwmul_vf_f32m2( A0, B7, gvl); +#endif for(BLASLONG k=1; k Date: Tue, 10 Feb 2026 19:00:26 +0000 Subject: [PATCH 03/12] Forget to add defintion. --- kernel/riscv64/shgemm_kernel_8x8_zvl128b.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c index 9e720fd844..1173a4a0fe 100644 --- a/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c +++ b/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c @@ -1,6 +1,8 @@ #include "common.h" #include +#define FP16_NARROW + int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { BLASLONG gvl = 0; From e3cb067bf482a0cea305d09ffc388f9da969bc93 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Wed, 11 Feb 2026 00:27:27 +0000 Subject: [PATCH 04/12] Fixed MADD to use float16 values. Use LMUL = 2 in main loop. Now 1.85X faster on BananaPi. --- kernel/riscv64/shgemm_kernel_16x8_zvl256b.c | 448 ++++++++++++++++---- kernel/riscv64/shgemm_kernel_8x8_zvl128b.c | 153 +++---- 2 files changed, 433 insertions(+), 168 deletions(-) diff --git a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c index 217b6dbfca..4ec59f5e02 100644 --- a/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/shgemm_kernel_16x8_zvl256b.c @@ -8,13 +8,114 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, BLASLONG gvl = 0; BLASLONG m_top = 0; BLASLONG n_top = 0; +#ifdef FP16_NARROW + IFLOAT alpha16 = (IFLOAT)(alpha); +#endif // -- MAIN PASS for (BLASLONG j=0; j Date: Wed, 11 Feb 2026 19:50:48 +0000 Subject: [PATCH 05/12] Convert inputs from BF16 to FP32 and use FP32 vector madds. 18% faster. --- kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c | 128 ++++++++++++++++++++ kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c | 86 +++++++++++++ 2 files changed, 214 insertions(+) diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c index 6e7b06884d..6a0844017c 100644 --- a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -1,6 +1,8 @@ #include "common.h" #include +#define BF16_WIDEN_ONE + int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { BLASLONG gvl = 0; @@ -28,6 +30,30 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, vfloat32m2_t result7 = __riscv_vfmv_v_f_f32m2(0.0f, gvl); for (BLASLONG k=0; k +#define BF16_WIDEN_ONE + int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { BLASLONG gvl = 0; @@ -28,6 +30,30 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, vfloat32m2_t result7 = __riscv_vfmv_v_f_f32m2(0.0f, gvl); for (BLASLONG k=0; k Date: Thu, 12 Feb 2026 18:45:39 +0000 Subject: [PATCH 06/12] Convert BF16 values once (and vectorized). --- kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c | 150 ++++++++++++++------ kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c | 116 +++++++++++---- 2 files changed, 195 insertions(+), 71 deletions(-) diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c index 6a0844017c..68f8aefcb3 100644 --- a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -11,14 +11,41 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, __bf16 *BB = (__bf16 *)(B); __bf16 *AA = (__bf16 *)(A); +#ifdef BF16_WIDEN_ONE + FLOAT *B_CONV = NULL; + if ((M >= 4) && (N >= 4) && (K > 0)) { + B_CONV = (FLOAT *)(malloc(K * 8 * sizeof(FLOAT))); + if (!B_CONV) return 1; + } +#endif + // -- MAIN PASS for (BLASLONG j=0; j= 4) && (N >= 4) && (K > 0)) { + B_CONV = (FLOAT *)(malloc(K * 8 * sizeof(FLOAT))); + if (!B_CONV) return 1; + } +#endif + // -- MAIN PASS for (BLASLONG j=0; j Date: Thu, 12 Feb 2026 20:35:41 +0000 Subject: [PATCH 07/12] One small change. --- kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c | 4 ++-- kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c index 68f8aefcb3..8aa114a10e 100644 --- a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -25,7 +25,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, BLASLONG gvl = __riscv_vsetvl_e16m1(16); #ifdef BF16_WIDEN_ONE BLASLONG bi2; - if (B_CONV) { + { BLASLONG bi3 = 0; BLASLONG gvl2; bi2 = K * 8; @@ -460,7 +460,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2; - if (B_CONV) { + { BLASLONG bi3 = 0; BLASLONG gvl2; bi2 = K * 4; diff --git a/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c index 53099bc9ee..f251f90608 100644 --- a/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c +++ b/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c @@ -26,7 +26,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2; - if (B_CONV) { + { BLASLONG bi3 = 0; BLASLONG gvl2; bi2 = K * 8; @@ -362,7 +362,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2; - if (B_CONV) { + { BLASLONG bi3 = 0; BLASLONG gvl2; bi2 = K * 4; From 0acb60aab3c0134e879a68292904d8346dcd50ef Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 13 Feb 2026 17:55:15 +0000 Subject: [PATCH 08/12] Conversion from BF16 to FP32 only once. --- kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c | 181 ++++++++++---------- kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c | 143 ++++++++-------- 2 files changed, 164 insertions(+), 160 deletions(-) diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c index 8aa114a10e..1e2bf369e7 100644 --- a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -3,6 +3,32 @@ #define BF16_WIDEN_ONE +#ifdef BF16_WIDEN_ONE +#define FORCEINLINE inline __attribute__((always_inline)) +#define B_UNROLL 64 + +// Convert from BF16 to FP32 +static void FORCEINLINE B_CONV(__bf16 *BB, FLOAT *CONV, BLASLONG count) +{ + BLASLONG count2 = (count & (B_UNROLL - 1)); + count &= -B_UNROLL; + while (count) { + vbfloat16m4_t B00 = __riscv_vle16_v_bf16m4(BB, B_UNROLL); + vfloat32m8_t B0 = __riscv_vfwcvtbf16_f_f_v_f32m8(B00, B_UNROLL); + __riscv_vse32_v_f32m8(CONV, B0, B_UNROLL); + BB += B_UNROLL; + CONV += B_UNROLL; + count -= B_UNROLL; + } + if (count2) { + BLASLONG gvl2 = __riscv_vsetvl_e16m4(count2); + vbfloat16m4_t B00 = __riscv_vle16_v_bf16m4(BB, gvl2); + vfloat32m8_t B0 = __riscv_vfwcvtbf16_f_f_v_f32m8(B00, gvl2); + __riscv_vse32_v_f32m8(CONV, B0, gvl2); + } +} +#endif + int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { BLASLONG gvl = 0; @@ -12,10 +38,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, __bf16 *AA = (__bf16 *)(A); #ifdef BF16_WIDEN_ONE - FLOAT *B_CONV = NULL; + FLOAT *CONV = NULL; if ((M >= 4) && (N >= 4) && (K > 0)) { - B_CONV = (FLOAT *)(malloc(K * 8 * sizeof(FLOAT))); - if (!B_CONV) return 1; + CONV = (FLOAT *)(malloc((K * (8 + (M & -4))) * sizeof(FLOAT))); + if (!CONV) return 1; + B_CONV(AA, CONV + (K * 8), (M & -4) * K); } #endif @@ -24,26 +51,16 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, m_top = 0; BLASLONG gvl = __riscv_vsetvl_e16m1(16); #ifdef BF16_WIDEN_ONE - BLASLONG bi2; - { - BLASLONG bi3 = 0; - BLASLONG gvl2; - bi2 = K * 8; - do { - gvl2 = __riscv_vsetvl_e16m4(bi2); - vbfloat16m4_t A00 = __riscv_vle16_v_bf16m4(&BB[bi3 + (n_top*K)], gvl2); - vfloat32m8_t A0 = __riscv_vfwcvtbf16_f_f_v_f32m8(A00, gvl2); - __riscv_vse32_v_f32m8(&B_CONV[bi3], A0, gvl2); - bi3 += gvl2; - } while (bi2 -= gvl2); - } + BLASLONG bi2 = K * 8; + B_CONV(BB + (n_top*K), CONV, bi2); + BLASLONG ai2 = K * 8; #endif for (BLASLONG i=0; i= 4) && (N >= 4) && (K > 0)) { - B_CONV = (FLOAT *)(malloc(K * 8 * sizeof(FLOAT))); - if (!B_CONV) return 1; + CONV = (FLOAT *)(malloc((K * (8 + (M & -4))) * sizeof(FLOAT))); + if (!CONV) return 1; + B_CONV(AA, CONV + (K * 8), (M & -4) * K); } #endif @@ -25,26 +52,16 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, BLASLONG gvl = __riscv_vsetvl_e16m1(8); #ifdef BF16_WIDEN_ONE - BLASLONG bi2; - { - BLASLONG bi3 = 0; - BLASLONG gvl2; - bi2 = K * 8; - do { - gvl2 = __riscv_vsetvl_e16m4(bi2); - vbfloat16m4_t A00 = __riscv_vle16_v_bf16m4(&BB[bi3 + (n_top*K)], gvl2); - vfloat32m8_t A0 = __riscv_vfwcvtbf16_f_f_v_f32m8(A00, gvl2); - __riscv_vse32_v_f32m8(&B_CONV[bi3], A0, gvl2); - bi3 += gvl2; - } while (bi2 -= gvl2); - } + BLASLONG bi2 = K * 8; + B_CONV(BB + (n_top*K), CONV, bi2); + BLASLONG ai2 = K * 8; #endif for (BLASLONG i=0; i Date: Fri, 13 Feb 2026 18:14:11 +0000 Subject: [PATCH 09/12] Only convert B if M is greater or equal to 4. --- kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c | 8 ++++++-- kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c index 1e2bf369e7..fc1a170a82 100644 --- a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -52,7 +52,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, BLASLONG gvl = __riscv_vsetvl_e16m1(16); #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 8; - B_CONV(BB + (n_top*K), CONV, bi2); + if (M >= 4) { + B_CONV(BB + (n_top*K), CONV, bi2); + } BLASLONG ai2 = K * 8; #endif @@ -474,7 +476,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 4; - B_CONV(BB + (n_top*K), CONV, bi2); + if (M >= 4) { + B_CONV(BB + (n_top*K), CONV, bi2); + } BLASLONG ai2 = K * 8; #endif diff --git a/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c index 4caa3241e3..29521a286d 100644 --- a/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c +++ b/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c @@ -53,7 +53,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 8; - B_CONV(BB + (n_top*K), CONV, bi2); + if (M >= 4) { + B_CONV(BB + (n_top*K), CONV, bi2); + } BLASLONG ai2 = K * 8; #endif @@ -377,7 +379,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 4; - B_CONV(BB + (n_top*K), CONV, bi2); + if (M >= 4) { + B_CONV(BB + (n_top*K), CONV, bi2); + } BLASLONG ai2 = K * 8; #endif From 7a1d23400fef84f04c4cb61c6cf0f4fd94a64dc4 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 13 Feb 2026 19:00:41 +0000 Subject: [PATCH 10/12] Add flag for not converting A & B - will be used in future to do conversion during packing. --- kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c | 6 ++++++ kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c index fc1a170a82..9abdf689b7 100644 --- a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -42,7 +42,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, if ((M >= 4) && (N >= 4) && (K > 0)) { CONV = (FLOAT *)(malloc((K * (8 + (M & -4))) * sizeof(FLOAT))); if (!CONV) return 1; +#ifndef BF16_DONT_CONV B_CONV(AA, CONV + (K * 8), (M & -4) * K); +#endif } #endif @@ -52,9 +54,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, BLASLONG gvl = __riscv_vsetvl_e16m1(16); #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 8; +#ifndef BF16_DONT_CONV if (M >= 4) { B_CONV(BB + (n_top*K), CONV, bi2); } +#endif BLASLONG ai2 = K * 8; #endif @@ -476,9 +480,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 4; +#ifndef BF16_DONT_CONV if (M >= 4) { B_CONV(BB + (n_top*K), CONV, bi2); } +#endif BLASLONG ai2 = K * 8; #endif diff --git a/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c index 29521a286d..7a9f17e607 100644 --- a/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c +++ b/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c @@ -42,7 +42,9 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, if ((M >= 4) && (N >= 4) && (K > 0)) { CONV = (FLOAT *)(malloc((K * (8 + (M & -4))) * sizeof(FLOAT))); if (!CONV) return 1; +#ifndef BF16_DONT_CONV B_CONV(AA, CONV + (K * 8), (M & -4) * K); +#endif } #endif @@ -53,9 +55,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 8; +#ifndef BF16_DONT_CONV if (M >= 4) { B_CONV(BB + (n_top*K), CONV, bi2); } +#endif BLASLONG ai2 = K * 8; #endif @@ -379,9 +383,11 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 4; +#ifndef BF16_DONT_CONV if (M >= 4) { B_CONV(BB + (n_top*K), CONV, bi2); } +#endif BLASLONG ai2 = K * 8; #endif From 1d6aa0dc3115485de5ca44e05d5bbaaeeee6d194 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Fri, 13 Feb 2026 20:03:35 +0000 Subject: [PATCH 11/12] Add dummy memsets - just in case. --- kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c | 8 ++++++++ kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c | 8 ++++++++ 2 files changed, 16 insertions(+) diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c index 9abdf689b7..dab6361911 100644 --- a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -29,6 +29,12 @@ static void FORCEINLINE B_CONV(__bf16 *BB, FLOAT *CONV, BLASLONG count) } #endif +#ifndef VECTORIZE_MEMSET +#define memset_zero(ptr, size, dir) memset(ptr, 0, size) +#else +void memset_zero(void *input, BLASLONG size, bool dir); +#endif + int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { BLASLONG gvl = 0; @@ -44,6 +50,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, if (!CONV) return 1; #ifndef BF16_DONT_CONV B_CONV(AA, CONV + (K * 8), (M & -4) * K); +#else + memset_zero(CONV, (K * (8 + (M & -4))) * sizeof(FLOAT), false); #endif } #endif diff --git a/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c index 7a9f17e607..c062da045a 100644 --- a/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c +++ b/kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c @@ -29,6 +29,12 @@ static void FORCEINLINE B_CONV(__bf16 *BB, FLOAT *CONV, BLASLONG count) } #endif +#ifndef VECTORIZE_MEMSET +#define memset_zero(ptr, size, dir) memset(ptr, 0, size) +#else +void memset_zero(void *input, BLASLONG size, bool dir); +#endif + int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { BLASLONG gvl = 0; @@ -44,6 +50,8 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, if (!CONV) return 1; #ifndef BF16_DONT_CONV B_CONV(AA, CONV + (K * 8), (M & -4) * K); +#else + memset_zero(CONV, (K * (8 + (M & -4))) * sizeof(FLOAT), false); #endif } #endif From efe63e797011e009b364e0ea47dd3e116b474eb6 Mon Sep 17 00:00:00 2001 From: Chip Kerchner Date: Sun, 15 Feb 2026 15:49:59 +0000 Subject: [PATCH 12/12] Add pre-RVA23 to BF16 GEMM. --- kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c | 290 +++++++++++++++++--- kernel/riscv64/sbgemm_kernel_8x8_zvl128b.c | 263 +++++++++++++++--- kernel/riscv64/shgemm_kernel_16x8_zvl256b.c | 2 +- kernel/riscv64/shgemm_kernel_8x8_zvl128b.c | 2 +- 4 files changed, 491 insertions(+), 66 deletions(-) diff --git a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c index dab6361911..f83abb2339 100644 --- a/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c +++ b/kernel/riscv64/sbgemm_kernel_16x8_zvl256b.c @@ -1,7 +1,8 @@ #include "common.h" #include -#define BF16_WIDEN_ONE +#define BF16_WIDEN_ONE // Convert pre-hand and do operations in FP32 +#define USE_BF16_CVT // Comment out for pre-RVA23 systems like BananaPi #ifdef BF16_WIDEN_ONE #define FORCEINLINE inline __attribute__((always_inline)) @@ -14,7 +15,12 @@ static void FORCEINLINE B_CONV(__bf16 *BB, FLOAT *CONV, BLASLONG count) count &= -B_UNROLL; while (count) { vbfloat16m4_t B00 = __riscv_vle16_v_bf16m4(BB, B_UNROLL); +#ifdef USE_BF16_CVT vfloat32m8_t B0 = __riscv_vfwcvtbf16_f_f_v_f32m8(B00, B_UNROLL); +#else + vfloat32m8_t B0 = __riscv_vreinterpret_v_u32m8_f32m8(__riscv_vsll_vx_u32m8( + __riscv_vwcvtu_x_x_v_u32m8(__riscv_vreinterpret_v_bf16m4_u16m4(B00), B_UNROLL), 16, B_UNROLL)); +#endif __riscv_vse32_v_f32m8(CONV, B0, B_UNROLL); BB += B_UNROLL; CONV += B_UNROLL; @@ -23,7 +29,12 @@ static void FORCEINLINE B_CONV(__bf16 *BB, FLOAT *CONV, BLASLONG count) if (count2) { BLASLONG gvl2 = __riscv_vsetvl_e16m4(count2); vbfloat16m4_t B00 = __riscv_vle16_v_bf16m4(BB, gvl2); +#ifdef USE_BF16_CVT vfloat32m8_t B0 = __riscv_vfwcvtbf16_f_f_v_f32m8(B00, gvl2); +#else + vfloat32m8_t B0 = __riscv_vreinterpret_v_u32m8_f32m8(__riscv_vsll_vx_u32m8( + __riscv_vwcvtu_x_x_v_u32m8(__riscv_vreinterpret_v_bf16m4_u16m4(B00), gvl2), 16, gvl2)); +#endif __riscv_vse32_v_f32m8(CONV, B0, gvl2); } } @@ -40,20 +51,19 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, BLASLONG gvl = 0; BLASLONG m_top = 0; BLASLONG n_top = 0; +#if !defined(BF16_WIDEN_ONE) || !defined(BF16_DONT_CONV) __bf16 *BB = (__bf16 *)(B); __bf16 *AA = (__bf16 *)(A); +#endif #ifdef BF16_WIDEN_ONE - FLOAT *CONV = NULL; - if ((M >= 4) && (N >= 4) && (K > 0)) { - CONV = (FLOAT *)(malloc((K * (8 + (M & -4))) * sizeof(FLOAT))); - if (!CONV) return 1; + FLOAT *CONV = (FLOAT *)(malloc((K * (8 + M)) * sizeof(FLOAT))); + if (!CONV) return 1; #ifndef BF16_DONT_CONV - B_CONV(AA, CONV + (K * 8), (M & -4) * K); + B_CONV(AA, CONV + (K * 8), K * M); #else - memset_zero(CONV, (K * (8 + (M & -4))) * sizeof(FLOAT), false); + memset_zero(CONV, (K * (8 + M)) * sizeof(FLOAT), false); #endif - } #endif // -- MAIN PASS @@ -63,9 +73,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 8; #ifndef BF16_DONT_CONV - if (M >= 4) { - B_CONV(BB + (n_top*K), CONV, bi2); - } + B_CONV(BB + (n_top*K), CONV, bi2); #endif BLASLONG ai2 = K * 8; #endif @@ -394,10 +402,34 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, float result13 = 0; float result14 = 0; float result15 = 0; - BLASLONG ai = m_top * K; - BLASLONG bi = n_top * K; +#ifdef BF16_WIDEN_ONE + bi2 = 0; +#else + BLASLONG ai=m_top*K; + BLASLONG bi=n_top*K; +#endif for (BLASLONG k=0; k= 4) { - B_CONV(BB + (n_top*K), CONV, bi2); - } + B_CONV(BB + (n_top*K), CONV, bi2); #endif BLASLONG ai2 = K * 8; #endif @@ -709,10 +757,26 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, float result5 = 0; float result6 = 0; float result7 = 0; - BLASLONG ai = m_top * K; - BLASLONG bi = n_top * K; +#ifdef BF16_WIDEN_ONE + bi2 = 0; +#else + BLASLONG ai=m_top*K; + BLASLONG bi=n_top*K; +#endif for (BLASLONG k=0; k -#define BF16_WIDEN_ONE +#define BF16_WIDEN_ONE // Convert pre-hand and do operations in FP32 +#define USE_BF16_CVT // Comment out for pre-RVA23 systems #ifdef BF16_WIDEN_ONE #define FORCEINLINE inline __attribute__((always_inline)) @@ -14,7 +15,12 @@ static void FORCEINLINE B_CONV(__bf16 *BB, FLOAT *CONV, BLASLONG count) count &= -B_UNROLL; while (count) { vbfloat16m4_t B00 = __riscv_vle16_v_bf16m4(BB, B_UNROLL); +#ifdef USE_BF16_CVT vfloat32m8_t B0 = __riscv_vfwcvtbf16_f_f_v_f32m8(B00, B_UNROLL); +#else + vfloat32m8_t B0 = __riscv_vreinterpret_v_u32m8_f32m8(__riscv_vsll_vx_u32m8( + __riscv_vwcvtu_x_x_v_u32m8(__riscv_vreinterpret_v_bf16m4_u16m4(B00), B_UNROLL), 16, B_UNROLL)); +#endif __riscv_vse32_v_f32m8(CONV, B0, B_UNROLL); BB += B_UNROLL; CONV += B_UNROLL; @@ -23,7 +29,12 @@ static void FORCEINLINE B_CONV(__bf16 *BB, FLOAT *CONV, BLASLONG count) if (count2) { BLASLONG gvl2 = __riscv_vsetvl_e16m4(count2); vbfloat16m4_t B00 = __riscv_vle16_v_bf16m4(BB, gvl2); +#ifdef USE_BF16_CVT vfloat32m8_t B0 = __riscv_vfwcvtbf16_f_f_v_f32m8(B00, gvl2); +#else + vfloat32m8_t B0 = __riscv_vreinterpret_v_u32m8_f32m8(__riscv_vsll_vx_u32m8( + __riscv_vwcvtu_x_x_v_u32m8(__riscv_vreinterpret_v_bf16m4_u16m4(B00), gvl2), 16, gvl2)); +#endif __riscv_vse32_v_f32m8(CONV, B0, gvl2); } } @@ -40,20 +51,19 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, BLASLONG gvl = 0; BLASLONG m_top = 0; BLASLONG n_top = 0; +#if !defined(BF16_WIDEN_ONE) || !defined(BF16_DONT_CONV) __bf16 *BB = (__bf16 *)(B); __bf16 *AA = (__bf16 *)(A); +#endif #ifdef BF16_WIDEN_ONE - FLOAT *CONV = NULL; - if ((M >= 4) && (N >= 4) && (K > 0)) { - CONV = (FLOAT *)(malloc((K * (8 + (M & -4))) * sizeof(FLOAT))); - if (!CONV) return 1; + FLOAT *CONV = (FLOAT *)(malloc((K * (8 + M)) * sizeof(FLOAT))); + if (!CONV) return 1; #ifndef BF16_DONT_CONV - B_CONV(AA, CONV + (K * 8), (M & -4) * K); + B_CONV(AA, CONV + (K * 8), K * M); #else - memset_zero(CONV, (K * (8 + (M & -4))) * sizeof(FLOAT), false); + memset_zero(CONV, (K * (8 + M)) * sizeof(FLOAT), false); #endif - } #endif // -- MAIN PASS @@ -64,9 +74,7 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, #ifdef BF16_WIDEN_ONE BLASLONG bi2 = K * 8; #ifndef BF16_DONT_CONV - if (M >= 4) { - B_CONV(BB + (n_top*K), CONV, bi2); - } + B_CONV(BB + (n_top*K), CONV, bi2); #endif BLASLONG ai2 = K * 8; #endif @@ -280,8 +288,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, if ( M & 2 ) { - BLASLONG ai = m_top * K; - BLASLONG bi = n_top * K; +#ifdef BF16_WIDEN_ONE + bi2 = 0; +#else + BLASLONG ai=m_top*K; + BLASLONG bi=n_top*K; +#endif float result0 = 0; float result1 = 0; @@ -301,6 +313,26 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, float result15 = 0; for (BLASLONG k=0; k= 4) { - B_CONV(BB + (n_top*K), CONV, bi2); - } + B_CONV(BB + (n_top*K), CONV, bi2); #endif BLASLONG ai2 = K * 8; #endif @@ -538,8 +586,12 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, if ( M & 2 ) { - BLASLONG ai = m_top * K; - BLASLONG bi = n_top * K; +#ifdef BF16_WIDEN_ONE + bi2 = 0; +#else + BLASLONG ai=m_top*K; + BLASLONG bi=n_top*K; +#endif float result0 = 0; float result1 = 0; @@ -551,6 +603,18 @@ int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, float result7 = 0; for (BLASLONG k=0; k -#define FP16_NARROW +#define FP16_NARROW // Accumulate in FP16 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) { diff --git a/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c b/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c index 915e1032ad..03bdcc240e 100644 --- a/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c +++ b/kernel/riscv64/shgemm_kernel_8x8_zvl128b.c @@ -1,7 +1,7 @@ #include "common.h" #include -#define FP16_NARROW +#define FP16_NARROW // Accumulate in FP16 int CNAME(BLASLONG M, BLASLONG N, BLASLONG K, FLOAT alpha, IFLOAT *A, IFLOAT *B, FLOAT *C, BLASLONG ldc) {