@@ -224,13 +224,71 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
     }
     GGML_F16x_VEC_REDUCE(sumf[0], sum_00, sum_01, sum_02, sum_03);
     GGML_F16x_VEC_REDUCE(sumf[1], sum_10, sum_11, sum_12, sum_13);
-#elif defined(__riscv_v_intrinsic)
-    // todo: RVV impl
-    for (int i = 0; i < n; ++i) {
-        for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
-            sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
-        }
-    }
+
+#elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
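+    // dot products of the two unrolled rows x[0] and x[1] against y: fp16 inputs are
+    // loaded at LMUL=2 and accumulated into LMUL=4 fp32 sums via widening multiply-add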
+    size_t vl = __riscv_vsetvlmax_e32m4();
+
+    // initialize accumulators to all zeroes
+    vfloat32m4_t vsum0_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum0_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+    vfloat32m4_t vsum1_1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+    // calculate step size
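+    // (epr = fp16 elements per e16/m2 register group; the main loop consumes two groups
+    //  per iteration, and np rounds n down to a multiple of that step)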
+    const size_t epr = __riscv_vsetvlmax_e16m2();
+    const size_t step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2 along the row dimension
+    for (int i = 0; i < np; i += step) {
+        vfloat16m2_t ay0 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), epr);
+        vfloat16m2_t ax0_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), epr);
+        vfloat16m2_t ax1_0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), epr);
+        vsum0_0 = __riscv_vfwmacc_vv_f32m4(vsum0_0, ax0_0, ay0, epr);
+        vsum1_0 = __riscv_vfwmacc_vv_f32m4(vsum1_0, ax1_0, ay0, epr);
+
+        vfloat16m2_t ay1 = __riscv_vle16_v_f16m2((const _Float16 *)(y + i + epr), epr);
+        vfloat16m2_t ax0_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i + epr), epr);
+        vfloat16m2_t ax1_1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i + epr), epr);
+        vsum0_1 = __riscv_vfwmacc_vv_f32m4(vsum0_1, ax0_1, ay1, epr);
+        vsum1_1 = __riscv_vfwmacc_vv_f32m4(vsum1_1, ax1_1, ay1, epr);
+    }
+
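+    // combine the two partial sums of each row before handling the tail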
+    vfloat32m4_t vsum0 = __riscv_vfadd_vv_f32m4(vsum0_0, vsum0_1, vl);
+    vfloat32m4_t vsum1 = __riscv_vfadd_vv_f32m4(vsum1_0, vsum1_1, vl);
+
+    // leftovers
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m2(n - i);
+        vfloat16m2_t ay = __riscv_vle16_v_f16m2((const _Float16 *)(y + i), vl);
+        vfloat16m2_t ax0 = __riscv_vle16_v_f16m2((const _Float16 *)(x[0] + i), vl);
+        vfloat16m2_t ax1 = __riscv_vle16_v_f16m2((const _Float16 *)(x[1] + i), vl);
+
+        vsum0 = __riscv_vfwmacc_vv_f32m4(vsum0, ax0, ay, vl);
+        vsum1 = __riscv_vfwmacc_vv_f32m4(vsum1, ax1, ay, vl);
+    }
+
+    // reduce
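+    // (split each m4 accumulator into its two m2 halves and add them, repeat at m1, then
+    //  finish with an unordered fp32 sum reduction and extract the scalar)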
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc0_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum0, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum0, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc0_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc0_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc0_0, 1), vl);
+    vfloat32m1_t redsum0 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc0_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+
+    vl = __riscv_vsetvlmax_e32m2();
+    vfloat32m2_t acc1_0 = __riscv_vfadd_vv_f32m2(__riscv_vget_v_f32m4_f32m2(vsum1, 0),
+                                                 __riscv_vget_v_f32m4_f32m2(vsum1, 1), vl);
+    vl = __riscv_vsetvlmax_e32m1();
+    vfloat32m1_t acc1_1 = __riscv_vfadd_vv_f32m1(__riscv_vget_v_f32m2_f32m1(acc1_0, 0),
+                                                 __riscv_vget_v_f32m2_f32m1(acc1_0, 1), vl);
+    vfloat32m1_t redsum1 = __riscv_vfredusum_vs_f32m1_f32m1(
+        acc1_1, __riscv_vfmv_v_f_f32m1(0.0f, 1), vl);
+    sumf[0] = __riscv_vfmv_f_s_f32m1_f32(redsum0);
+    sumf[1] = __riscv_vfmv_f_s_f32m1_f32(redsum1);
+
 #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
@@ -475,15 +533,39 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     }
     np = n;
 #elif defined(__riscv_zvfh) // implies __riscv_v_intrinsic
-    const int np = n;
-    _Float16 hv = (_Float16)v;
-    for (int i = 0, avl; i < n; i += avl) {
-        avl = __riscv_vsetvl_e16m8(n - i);
-        vfloat16m8_t ax = __riscv_vle16_v_f16m8((const _Float16 *)&x[i], avl);
-        vfloat16m8_t ay = __riscv_vle16_v_f16m8((_Float16 *)&y[i], avl);
-        vfloat16m8_t ny = __riscv_vfmadd_vf_f16m8(ax, hv, ay, avl);
-        __riscv_vse16_v_f16m8((_Float16 *)&y[i], ny, avl);
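+    // convert the fp32 factor to fp16 once, reinterpreting its bits as _Float16 so the
+    // multiply-add can run entirely in fp16 via vfmacc with a scalar operand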
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16 *)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16 *)x + i, epr);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, epr);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, epr);
+        __asm__ __volatile__("" ::: "memory");
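+        // (the empty asm statements are compiler-only memory barriers; presumably they keep
+        //  the compiler from reordering or merging the two unrolled load/FMA/store sequences)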
+
+        vfloat16m4_t ax1 = __riscv_vle16_v_f16m4((const _Float16 *)x + i + epr, epr);
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16 *)y + i + epr, epr);
+        ay1 = __riscv_vfmacc_vf_f16m4(ay1, scale, ax1, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i + epr, ay1, epr);
+        __asm__ __volatile__("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ax0 = __riscv_vle16_v_f16m4((const _Float16 *)x + i, vl);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, vl);
+        ay0 = __riscv_vfmacc_vf_f16m4(ay0, scale, ax0, vl);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, vl);
     }
+    np = n;
 #elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));
 
@@ -724,13 +806,34 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
         svst1_f16(pg, (__fp16 *)(y + np), out);
     }
 #elif defined(__riscv_v_intrinsic) && defined(__riscv_zvfh)
-    for (int i = 0, vl; i < n; i += vl) {
-        vl = __riscv_vsetvl_e16m2(n - i);
-        vfloat16m2_t vy = __riscv_vle16_v_f16m2((_Float16 *)&y[i], vl);
-        vfloat32m4_t vy32 = __riscv_vfwcvt_f_f_v_f32m4(vy, vl);
-        vy32 = __riscv_vfmul_vf_f32m4(vy32, v, vl);
-        vy = __riscv_vfncvt_f_f_w_f16m2(vy32, vl);
-        __riscv_vse16_v_f16m2((_Float16 *)&y[i], vy, vl);
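+    // as in ggml_vec_mad_f16: convert the fp32 factor to fp16 once and scale directly in
+    // fp16, instead of widening every element to fp32 as the previous RVV code did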
+    const ggml_fp16_t s = GGML_CPU_FP32_TO_FP16(v);
+    const _Float16 scale = *(const _Float16 *)(&s);
+
+    // calculate step size
+    const int epr = __riscv_vsetvlmax_e16m4();
+    const int step = epr * 2;
+    const int np = (n & ~(step - 1));
+
+    // unroll by 2
+    for (int i = 0; i < np; i += step) {
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, epr);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, epr);
+        __asm__ __volatile__("" ::: "memory");
+
+        vfloat16m4_t ay1 = __riscv_vle16_v_f16m4((const _Float16 *)y + i + epr, epr);
+        ay1 = __riscv_vfmul_vf_f16m4(ay1, scale, epr);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i + epr, ay1, epr);
+        __asm__ __volatile__("" ::: "memory");
+    }
+
+    // leftovers
+    int vl;
+    for (int i = np; i < n; i += vl) {
+        vl = __riscv_vsetvl_e16m4(n - i);
+        vfloat16m4_t ay0 = __riscv_vle16_v_f16m4((const _Float16 *)y + i, vl);
+        ay0 = __riscv_vfmul_vf_f16m4(ay0, scale, vl);
+        __riscv_vse16_v_f16m4((_Float16 *)y + i, ay0, vl);
     }
 #elif defined(GGML_SIMD)
     const int np = (n & ~(GGML_F16_STEP - 1));