From 8f1e0e03858458a973da9cb6a4b7e1aa70d7c794 Mon Sep 17 00:00:00 2001 From: shouyud Date: Wed, 17 Dec 2025 16:07:45 -0500 Subject: [PATCH 1/5] feat: working gelu with src0 put on vtcm --- ggml/src/ggml-hexagon/htp/act-ops.c | 193 ++++++++++++++++++++++------ ggml/src/ggml-hexagon/htp/main.c | 2 +- 2 files changed, 153 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 59c0a70963d..cb4be01a5dc 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -263,7 +263,9 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, struct htp_spad * dst_spad, uint32_t nth, uint32_t ith, - uint32_t src0_nrows_per_thread) { + uint32_t src0_nrows_per_thread, + dma_queue * dma_queue + ) { htp_act_preamble2; uint64_t t1, t2; @@ -271,6 +273,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; + const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); const uint32_t src0_nrows = ne01 * ne02 * ne03; @@ -292,37 +296,126 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, opt_path = 1; } - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + const uint8_t * data_src0 = (const uint8_t *) src0->data; + uint8_t * data_dst = (uint8_t *) dst->data; - uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); - uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - const int BLOCK = 8; - for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { - const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - // Prefetch next block - if (block_end < src0_end_row) { - const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); - htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size); - } - // Process rows in current block + + + + // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 + size_t src0_size_per_pingpong = src0_spad->size_per_thread / 2; + + uint8_t * src0_spad_data_ping = src0_spad->data + (ith * (src0_spad->size_per_thread)); + uint8_t * src0_spad_data_pong = src0_spad_data_ping + src0_size_per_pingpong; + uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + + // const int BLOCK = 8; + // for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + // const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); + + // // Prefetch next block + // if (block_end < src0_end_row) { + // const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); + // htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size); + // } + + // // Process rows in current block + // for (uint32_t ib = ir; ib < block_end; ib++) { + // const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); + // float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); + + // // gelu = x * sigmoid(1.702 * x) // current implementation + // if (1 == opt_path) { + // hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); + // hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + // hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + // } 
else { + // hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + // hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + // hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + // } + // } + // } + + + // Maybe should always go to the optimal path? + // In gelu = x*sigmoid(x*1.702) + // Although we have src0_size_per_pingpong + const int BLOCK = src0_size_per_pingpong / src0_row_size_aligned; // How many rows can we process in one block + + // TODO: + if(BLOCK == 0){ + FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", src0_spad->size_per_thread, + src0_row_size_aligned ); + return; + } + // Do the inital dma fecth + // fetch src0 + dma_queue_push(dma_queue, + src0_spad_data_ping, + data_src0 + (src0_start_row * src0_row_size), + src0_row_size_aligned, + src0_row_size, + MIN(BLOCK, src0_end_row - src0_start_row) + + ); + bool ping_pong_flag = true; // true means the program use ping data to compute, false means use pong data to compute + + + + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); // The start index of next block + + + const uint32_t next_block_size = MIN(BLOCK, src0_end_row - block_end); + + // prefetch next loop iteration if any + + if (next_block_size > 0) { + if(ping_pong_flag == false){ + + dma_queue_push(dma_queue, + src0_spad_data_ping, + data_src0 + (block_end * src0_row_size), + src0_row_size_aligned, + src0_row_size, + next_block_size + ); + + + }else{ + dma_queue_push(dma_queue, + src0_spad_data_pong, + data_src0 + (block_end * src0_row_size), + src0_row_size_aligned, + src0_row_size, + next_block_size + ); + } + ping_pong_flag=!ping_pong_flag; + } + const float * src0 = (float*)dma_queue_pop(dma_queue); for (uint32_t ib = ir; ib < block_end; ib++) { - const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); + + float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); - // gelu = x * sigmoid(1.702 * x) // current implementation - if (1 == opt_path) { - hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); - hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } + // // gelu = x * sigmoid(1.702 * x) // current implementation + // if (1 == opt_path) { + // hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) dst_spad_data, ne0); + // hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_data, (uint8_t *) dst_spad_data, ne0); + // hvx_mul_f32_opt((const uint8_t *) src0, dst_spad_data, (uint8_t *) dst, ne0); //TODO: can dma push dst_spad_data back? 
+ // } else { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) dst_spad_data, ne0); + hvx_sigmoid_f32((const uint8_t *) dst_spad_data, (uint8_t *) dst_spad_data, ne0); + hvx_mul_f32((const uint8_t *) src0, dst_spad_data, (uint8_t *) dst, ne0); + //} + + src0 += src0_row_size_aligned/sizeof(float); // Move to next row } } @@ -335,7 +428,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, - octx->src0_nrows_per_thread); + octx->src0_nrows_per_thread, octx->ctx->dma[i]); } @@ -469,16 +562,44 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; const size_t src0_row_size = src0->nb[1]; - const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1]; + const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : 0; // zero bytes if src1 is not used const size_t dst_row_size = dst->nb[1]; + + const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); + const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN); + const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads; - octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads; - size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + + size_t spad_size_per_row = (src0_row_size_aligned + + src1_row_size_aligned) + dst_row_size_aligned; + + size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row); + + + // Make sure the reserved vtcm size is sufficient + if(vtcm_row_per_thread ==0){ + FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size, + spad_size_per_row * n_threads); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + + + octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread; + octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread; + octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread; + + octx->dst_spad.size = n_threads* octx->dst_spad.size_per_thread; + octx->src0_spad.size = n_threads* octx->src0_spad.size_per_thread; + octx->src1_spad.size = n_threads* octx->src1_spad.size_per_thread; + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + if (src1->ne[0]) { FARF(HIGH, @@ -492,16 +613,6 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); } - // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { - FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, - spad_size); - return HTP_STATUS_VTCM_TOO_SMALL; - } - - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = 
octx->src1_spad.data + octx->src1_spad.size; if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index e30ae695022..d047ef8a356 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -299,7 +299,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que ctx->n_threads = n_hvx; for (int i = 0; i < ctx->n_threads; i++) { - ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2); + ctx->dma[i] = dma_queue_create(ctx->vtcm_size); //NOTE: for now, whole vtcm size } // init worker pool From 8fd05731b053e5b372778e6dfe65c6a5f322e88a Mon Sep 17 00:00:00 2001 From: shouyud Date: Wed, 17 Dec 2025 17:27:33 -0500 Subject: [PATCH 2/5] feat: gelu ping-pong for both in and out --- ggml/src/ggml-hexagon/htp/act-ops.c | 131 +++++++++++++------------- ggml/src/ggml-hexagon/htp/htp-dma.h | 14 ++- ggml/src/ggml-hexagon/htp/hvx-utils.h | 11 ++- 3 files changed, 90 insertions(+), 66 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index cb4be01a5dc..4872dc02925 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -286,60 +286,20 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, return; } - int is_aligned = 1; - int opt_path = 0; - if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { - is_aligned = 0; - FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); - } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } const uint8_t * data_src0 = (const uint8_t *) src0->data; uint8_t * data_dst = (uint8_t *) dst->data; - - - - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 size_t src0_size_per_pingpong = src0_spad->size_per_thread / 2; + size_t dst_size_per_pingpong = dst_spad->size_per_thread / 2; uint8_t * src0_spad_data_ping = src0_spad->data + (ith * (src0_spad->size_per_thread)); uint8_t * src0_spad_data_pong = src0_spad_data_ping + src0_size_per_pingpong; - uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); - - // const int BLOCK = 8; - // for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { - // const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - - // // Prefetch next block - // if (block_end < src0_end_row) { - // const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); - // htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size); - // } - - // // Process rows in current block - // for (uint32_t ib = ir; ib < block_end; ib++) { - // const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); - // float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); - - // // gelu = x * sigmoid(1.702 * x) // current implementation - // if (1 == opt_path) { - // hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); - // hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - // hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - // } else { - // hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - // hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - // hvx_mul_f32((const uint8_t *) src0, 
src0_spad_data, (uint8_t *) dst, ne0); - // } - // } - // } - + uint8_t * dst_spad_data_ping = dst_spad->data + (ith * dst_spad->size_per_thread); + uint8_t * dst_spad_data_pong = dst_spad_data_ping + dst_size_per_pingpong; // Maybe should always go to the optimal path? // In gelu = x*sigmoid(x*1.702) @@ -362,10 +322,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, MIN(BLOCK, src0_end_row - src0_start_row) ); - bool ping_pong_flag = true; // true means the program use ping data to compute, false means use pong data to compute - - - + bool src0_ping_pong_flag = true; // true means the program use ping data to compute, false means use pong data to compute + bool dst_ping_pong_flag = true; // true means the program use ping data to compute, false means use pong data to compute for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); // The start index of next block @@ -373,10 +331,32 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const uint32_t next_block_size = MIN(BLOCK, src0_end_row - block_end); + + float* cur_dst_spad_ptr; + const float * src0; + if(dst_ping_pong_flag){ + cur_dst_spad_ptr = (float*)dst_spad_data_ping; + }else{ + cur_dst_spad_ptr = (float*)dst_spad_data_pong; + } + + if(src0_ping_pong_flag){ + src0 = (float*)src0_spad_data_ping; + }else{ + src0 = (float*)src0_spad_data_pong; + } + + dma_queue_pop(dma_queue); // wait for dma done for the previous src0 fetch + + // Wait for the previous dst push to complete before we can reuse the dst buffer + if(ir != src0_start_row){ + dma_queue_pop(dma_queue); // wait for dma done for the previous dst push + } + // prefetch next loop iteration if any if (next_block_size > 0) { - if(ping_pong_flag == false){ + if(src0_ping_pong_flag == false){ dma_queue_push(dma_queue, src0_spad_data_ping, @@ -396,29 +376,54 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, next_block_size ); } - ping_pong_flag=!ping_pong_flag; + src0_ping_pong_flag=!src0_ping_pong_flag; } - const float * src0 = (float*)dma_queue_pop(dma_queue); + for (uint32_t ib = ir; ib < block_end; ib++) { + // gelu = x * sigmoid(1.702 * x) // current implementation + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) cur_dst_spad_ptr, ne0); + hvx_fast_sigmoid_f32((const uint8_t *) cur_dst_spad_ptr, (uint8_t *) cur_dst_spad_ptr, ne0); + hvx_mul_f32_opt((const uint8_t *) src0, (uint8_t *) cur_dst_spad_ptr, (uint8_t *) cur_dst_spad_ptr, ne0); + + + src0 += src0_row_size_aligned/sizeof(float); // Move to next row + cur_dst_spad_ptr+= dst_row_size_aligned/sizeof(float); + } - float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); + float * restrict out_dst = (float *) (data_dst + (ir * dst_row_size)); + + if(dst_ping_pong_flag){ + dma_queue_push_width(dma_queue, + out_dst, + dst_spad_data_ping, + dst_row_size, // dst stride in DDR (actual row size) + dst_row_size_aligned, // src stride in VTCM (aligned) + dst_row_size, // width + (block_end - ir) + ); + }else{ + dma_queue_push_width(dma_queue, + out_dst, + dst_spad_data_pong, + dst_row_size, // dst stride in DDR (actual row size) + dst_row_size_aligned, // src stride in VTCM (aligned) + dst_row_size, // width + (block_end - ir) + ); + } - // // gelu = x * sigmoid(1.702 * x) // current implementation - // if (1 == opt_path) { - // hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) dst_spad_data, ne0); - // hvx_fast_sigmoid_f32((const uint8_t *) 
dst_spad_data, (uint8_t *) dst_spad_data, ne0); - // hvx_mul_f32_opt((const uint8_t *) src0, dst_spad_data, (uint8_t *) dst, ne0); //TODO: can dma push dst_spad_data back? - // } else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) dst_spad_data, ne0); - hvx_sigmoid_f32((const uint8_t *) dst_spad_data, (uint8_t *) dst_spad_data, ne0); - hvx_mul_f32((const uint8_t *) src0, dst_spad_data, (uint8_t *) dst, ne0); - //} - src0 += src0_row_size_aligned/sizeof(float); // Move to next row + if(ir != src0_start_row){ + dma_queue_pop(dma_queue); // wait for dma done for the previous dst push } + // else is the first block,nothing to wait for dst push + + dst_ping_pong_flag = !dst_ping_pong_flag; } + dma_queue_pop(dma_queue); // wait for dma done for the last dst push + t2 = HAP_perf_get_qtimer_count(); FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02, @@ -562,7 +567,7 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; const size_t src0_row_size = src0->nb[1]; - const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : 0; // zero bytes if src1 is not used + const size_t src1_row_size = src1->ne[0] > 0 ? src1->nb[1] : 0; // zero bytes if src1 is not used const size_t dst_row_size = dst->nb[1]; diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.h b/ggml/src/ggml-hexagon/htp/htp-dma.h index 7d3fc4078cc..6f96f2f4f12 100644 --- a/ggml/src/ggml-hexagon/htp/htp-dma.h +++ b/ggml/src/ggml-hexagon/htp/htp-dma.h @@ -49,11 +49,12 @@ static inline unsigned int dmwait(void) { return ret; } -static inline bool dma_queue_push(dma_queue * q, +static inline bool dma_queue_push_width(dma_queue * q, void * dst, const void * src, size_t dst_row_size, size_t src_row_size, + size_t width, size_t nrows) { if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { return false; @@ -79,7 +80,7 @@ static inline bool dma_queue_push(dma_queue * q, desc->dst = (void *) dst; desc->allocation = 0; desc->padding = 0; - desc->roiwidth = src_row_size; + desc->roiwidth = width; desc->roiheight = nrows; desc->srcstride = src_row_size; desc->dststride = dst_row_size; @@ -96,6 +97,15 @@ static inline bool dma_queue_push(dma_queue * q, return true; } +static inline bool dma_queue_push(dma_queue * q, + void * dst, + const void * src, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + return dma_queue_push_width(q, dst, src, dst_row_size, src_row_size, src_row_size, nrows); +} + static inline uint8_t * dma_queue_pop(dma_queue * q) { if (q->push_idx == q->pop_idx) { return NULL; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 566048297d3..d8f2130e31e 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -980,7 +980,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * int step_of_1 = num_elems >> 5; int remaining = num_elems - step_of_1 * VLEN_FP32; - assert(remaining == 0); + const HVX_Vector * restrict v_src = (HVX_Vector *) src; HVX_Vector * restrict v_dst = (HVX_Vector *) dst; @@ -996,6 +996,15 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); } + + if(remaining > 0) { + const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32; + float * dstf = (float *) dst + 
step_of_1*VLEN_FP32;
+
+        HVX_Vector in  = *(HVX_UVector *) srcf;
+        HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
+        hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
+    }
 }
 

From e875809ec7565384d45da758666e1910e63422b7 Mon Sep 17 00:00:00 2001
From: shouyud
Date: Wed, 17 Dec 2025 17:34:08 -0500
Subject: [PATCH 3/5] fix: fix compile error

---
 ggml/src/ggml-hexagon/htp/act-ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 4872dc02925..6d205408c1d 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -426,7 +426,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
+    FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
          ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 

From 54f7bbdf952601572c6ddf3134eae9c41bd6c6b3 Mon Sep 17 00:00:00 2001
From: shouyud
Date: Thu, 18 Dec 2025 11:13:27 -0500
Subject: [PATCH 4/5] break: distinguish dma ddr->vtcm and vtcm->ddr operations

---
 ggml/src/ggml-hexagon/htp/act-ops.c    | 12 +++++-------
 ggml/src/ggml-hexagon/htp/htp-dma.h    | 18 ++++++++++++++----
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 24 ++++++++++++------------
 3 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 6d205408c1d..1d2459d9fcb 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -314,7 +314,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
     }
     // Do the inital dma fecth
     // fetch src0
-    dma_queue_push(dma_queue,
+    dma_queue_push_ddr_to_vtcm(dma_queue,
                    src0_spad_data_ping,
                    data_src0 + (src0_start_row * src0_row_size),
                    src0_row_size_aligned,
@@ -358,7 +358,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
         if (next_block_size > 0) {
             if(src0_ping_pong_flag == false){
-                dma_queue_push(dma_queue,
+                dma_queue_push_ddr_to_vtcm(dma_queue,
                                src0_spad_data_ping,
                                data_src0 + (block_end * src0_row_size),
                                src0_row_size_aligned,
@@ -368,7 +368,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
 
             }else{
-                dma_queue_push(dma_queue,
+                dma_queue_push_ddr_to_vtcm(dma_queue,
                                src0_spad_data_pong,
                                data_src0 + (block_end * src0_row_size),
                                src0_row_size_aligned,
@@ -394,21 +394,19 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
         float * restrict out_dst = (float *) (data_dst + (ir * dst_row_size));
 
         if(dst_ping_pong_flag){
-            dma_queue_push_width(dma_queue,
+            dma_queue_push_vtcm_to_ddr(dma_queue,
                             out_dst,
                             dst_spad_data_ping,
                             dst_row_size,          // dst stride in DDR (actual row size)
                             dst_row_size_aligned,  // src stride in VTCM (aligned)
-                            dst_row_size,          // width
                             (block_end - ir)
             );
         }else{
-            dma_queue_push_width(dma_queue,
+            dma_queue_push_vtcm_to_ddr(dma_queue,
                             out_dst,
                             dst_spad_data_pong,
                             dst_row_size,          // dst stride in DDR (actual row size)
                             dst_row_size_aligned,  // src stride in VTCM (aligned)
-                            dst_row_size,          // width
                             (block_end - ir)
             );
         }
diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.h b/ggml/src/ggml-hexagon/htp/htp-dma.h
index 6f96f2f4f12..e1095c4a967 100644
--- a/ggml/src/ggml-hexagon/htp/htp-dma.h
+++ b/ggml/src/ggml-hexagon/htp/htp-dma.h
@@
-49,12 +49,12 @@ static inline unsigned int dmwait(void) { return ret; } -static inline bool dma_queue_push_width(dma_queue * q, +static inline bool dma_queue_push(dma_queue * q, void * dst, const void * src, size_t dst_row_size, size_t src_row_size, - size_t width, + size_t width, // width in bytes. number of bytes to transfer per row size_t nrows) { if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { return false; @@ -97,13 +97,23 @@ static inline bool dma_queue_push_width(dma_queue * q, return true; } -static inline bool dma_queue_push(dma_queue * q, +static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, + void * dst, + const void * src, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + return dma_queue_push(q, dst, src, dst_row_size, src_row_size, src_row_size, nrows); +} + + +static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, void * dst, const void * src, size_t dst_row_size, size_t src_row_size, size_t nrows) { - return dma_queue_push_width(q, dst, src, dst_row_size, src_row_size, src_row_size, nrows); + return dma_queue_push(q, dst, src, dst_row_size, src_row_size, dst_row_size, nrows); } static inline uint8_t * dma_queue_pop(dma_queue * q) { diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 346f0bd3398..d617da2cc4c 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1115,7 +1115,7 @@ static void matmul(struct htp_matmul_type * mt, if (is0 >= HTP_SPAD_SRC0_NROWS) { break; } - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } @@ -1134,7 +1134,7 @@ static void matmul(struct htp_matmul_type * mt, const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; if (pr0 < src0_end_row_x2) { - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } } @@ -1143,7 +1143,7 @@ static void matmul(struct htp_matmul_type * mt, if (src0_end_row != src0_end_row_x2) { uint32_t ir0 = src0_end_row_x2; const int is0 = (ir0 - src0_start_row); - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); @@ -1217,7 +1217,7 @@ static void matvec(struct htp_matmul_type * mt, if (is0 >= HTP_SPAD_SRC0_NROWS) { break; } - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } @@ -1230,7 +1230,7 @@ static void matvec(struct htp_matmul_type * mt, const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; if (pr0 < src0_end_row_x2) { - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, 
src0_row_size_padded, src0_row_size, 2); } } @@ -1239,7 +1239,7 @@ static void matvec(struct htp_matmul_type * mt, if (src0_end_row != src0_end_row_x2) { const uint32_t ir0 = src0_end_row_x2; const uint32_t is0 = (ir0 - src0_start_row); - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); @@ -1331,7 +1331,7 @@ static void matmul_id(struct htp_matmul_type * mt, if (is0 >= HTP_SPAD_SRC0_NROWS) { break; } - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } @@ -1356,7 +1356,7 @@ static void matmul_id(struct htp_matmul_type * mt, const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; if (pr0 < src0_end_row_x2) { - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } } @@ -1365,7 +1365,7 @@ static void matmul_id(struct htp_matmul_type * mt, if (src0_end_row != src0_end_row_x2) { uint32_t ir0 = src0_end_row_x2; const uint32_t is0 = (ir0 - src0_start_row); - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); @@ -1455,7 +1455,7 @@ static void matvec_id(struct htp_matmul_type * mt, if (is0 >= HTP_SPAD_SRC0_NROWS) { break; } - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } @@ -1468,7 +1468,7 @@ static void matvec_id(struct htp_matmul_type * mt, const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; if (pr0 < src0_end_row_x2) { - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } } @@ -1477,7 +1477,7 @@ static void matvec_id(struct htp_matmul_type * mt, if (src0_end_row != src0_end_row_x2) { uint32_t ir0 = src0_end_row_x2; const uint32_t is0 = (ir0 - src0_start_row); - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col); From f3e9f6ac7f69c0db9427d8cb4cd8a634f4a4c6d4 Mon Sep 17 00:00:00 2001 From: shouyud Date: Thu, 18 Dec 2025 14:45:55 -0500 Subject: [PATCH 5/5] fix: fix dma queue size --- ggml/src/ggml-hexagon/htp/htp-dma.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 2 
+- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.h b/ggml/src/ggml-hexagon/htp/htp-dma.h index e1095c4a967..ad871e6518e 100644 --- a/ggml/src/ggml-hexagon/htp/htp-dma.h +++ b/ggml/src/ggml-hexagon/htp/htp-dma.h @@ -57,6 +57,7 @@ static inline bool dma_queue_push(dma_queue * q, size_t width, // width in bytes. number of bytes to transfer per row size_t nrows) { if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { + FARF(ERROR, "dma-push: queue full\n"); return false; } diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index d047ef8a356..c355adc8beb 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -299,7 +299,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que ctx->n_threads = n_hvx; for (int i = 0; i < ctx->n_threads; i++) { - ctx->dma[i] = dma_queue_create(ctx->vtcm_size); //NOTE: for now, whole vtcm size + ctx->dma[i] = dma_queue_create(64); //see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541 } // init worker pool
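
For reference, the gelu changes above are an instance of a standard double-buffered ("ping-pong") DMA streaming pattern: queue the fetch for the next block of rows into one VTCM buffer before waiting on the current one, so the DMA engine overlaps with the HVX compute. Below is a minimal sketch of just that pattern. It assumes the dma_queue_push_ddr_to_vtcm() / dma_queue_pop() helpers behave as declared in htp-dma.h after PATCH 4/5; stream_rows_pingpong, process_rows and the MIN fallback are illustrative placeholders and are not part of these patches.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "htp-dma.h"   // dma_queue, dma_queue_push_ddr_to_vtcm(), dma_queue_pop()

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

// Stream rows from DDR through two VTCM buffers, overlapping DMA with compute.
// process_rows() stands in for the per-block HVX kernel (e.g. the gelu math).
static void stream_rows_pingpong(dma_queue *     q,
                                 const uint8_t * src_ddr,
                                 uint8_t *       spad_ping,
                                 uint8_t *       spad_pong,
                                 size_t          row_size,          // actual row size in DDR
                                 size_t          row_size_aligned,  // VLEN-aligned row stride in VTCM
                                 uint32_t        start_row,
                                 uint32_t        end_row,
                                 uint32_t        block,             // rows per ping/pong buffer
                                 void (*process_rows)(const uint8_t * rows, uint32_t nrows)) {
    // Prime the pipeline: fetch the first block into the ping buffer.
    dma_queue_push_ddr_to_vtcm(q, spad_ping, src_ddr + (size_t) start_row * row_size,
                               row_size_aligned, row_size, MIN(block, end_row - start_row));

    bool next_is_pong = true;  // which buffer the *next* fetch should fill

    for (uint32_t ir = start_row; ir < end_row; ir += block) {
        const uint32_t block_end = MIN(ir + block, end_row);
        const uint32_t next_rows = MIN(block, end_row - block_end);

        // Queue the next fetch into the other buffer before waiting on the
        // current one, so the DMA transfer runs while this block is computed.
        if (next_rows > 0) {
            uint8_t * next_buf = next_is_pong ? spad_pong : spad_ping;
            dma_queue_push_ddr_to_vtcm(q, next_buf, src_ddr + (size_t) block_end * row_size,
                                       row_size_aligned, row_size, next_rows);
            next_is_pong = !next_is_pong;
        }

        // Wait for the oldest outstanding fetch (the current block) and use it.
        const uint8_t * cur = dma_queue_pop(q);
        process_rows(cur, block_end - ir);
    }
}

The same shape applies in reverse for the dst writeback in the gelu path, using dma_queue_push_vtcm_to_ddr() with a second pair of VTCM buffers.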