From 8f1e0e03858458a973da9cb6a4b7e1aa70d7c794 Mon Sep 17 00:00:00 2001 From: shouyud Date: Wed, 17 Dec 2025 16:07:45 -0500 Subject: [PATCH 1/5] feat: working gelu with src0 put on vtcm --- ggml/src/ggml-hexagon/htp/act-ops.c | 193 ++++++++++++++++++++++------ ggml/src/ggml-hexagon/htp/main.c | 2 +- 2 files changed, 153 insertions(+), 42 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index 59c0a70963d..cb4be01a5dc 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -263,7 +263,9 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, struct htp_spad * dst_spad, uint32_t nth, uint32_t ith, - uint32_t src0_nrows_per_thread) { + uint32_t src0_nrows_per_thread, + dma_queue * dma_queue + ) { htp_act_preamble2; uint64_t t1, t2; @@ -271,6 +273,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const size_t src0_row_size = nb01; const size_t dst_row_size = nb1; + const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); + const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); const uint32_t src0_nrows = ne01 * ne02 * ne03; @@ -292,37 +296,126 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, opt_path = 1; } - const uint8_t * restrict data_src0 = (const uint8_t *) src0->data; - uint8_t * restrict data_dst = (uint8_t *) dst->data; + const uint8_t * data_src0 = (const uint8_t *) src0->data; + uint8_t * data_dst = (uint8_t *) dst->data; - uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size); - uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size); - const int BLOCK = 8; - for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { - const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - // Prefetch next block - if (block_end < src0_end_row) { - const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); - htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size); - } - // Process rows in current block + + + + // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 + size_t src0_size_per_pingpong = src0_spad->size_per_thread / 2; + + uint8_t * src0_spad_data_ping = src0_spad->data + (ith * (src0_spad->size_per_thread)); + uint8_t * src0_spad_data_pong = src0_spad_data_ping + src0_size_per_pingpong; + uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); + + // const int BLOCK = 8; + // for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + // const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); + + // // Prefetch next block + // if (block_end < src0_end_row) { + // const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); + // htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size); + // } + + // // Process rows in current block + // for (uint32_t ib = ir; ib < block_end; ib++) { + // const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); + // float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); + + // // gelu = x * sigmoid(1.702 * x) // current implementation + // if (1 == opt_path) { + // hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); + // hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + // hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + // } 
else { + // hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); + // hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); + // hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); + // } + // } + // } + + + // Maybe should always go to the optimal path? + // In gelu = x*sigmoid(x*1.702) + // Although we have src0_size_per_pingpong + const int BLOCK = src0_size_per_pingpong / src0_row_size_aligned; // How many rows can we process in one block + + // TODO: + if(BLOCK == 0){ + FARF(ERROR, "gelu-f32 : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", src0_spad->size_per_thread, + src0_row_size_aligned ); + return; + } + // Do the inital dma fecth + // fetch src0 + dma_queue_push(dma_queue, + src0_spad_data_ping, + data_src0 + (src0_start_row * src0_row_size), + src0_row_size_aligned, + src0_row_size, + MIN(BLOCK, src0_end_row - src0_start_row) + + ); + bool ping_pong_flag = true; // true means the program use ping data to compute, false means use pong data to compute + + + + + for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { + const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); // The start index of next block + + + const uint32_t next_block_size = MIN(BLOCK, src0_end_row - block_end); + + // prefetch next loop iteration if any + + if (next_block_size > 0) { + if(ping_pong_flag == false){ + + dma_queue_push(dma_queue, + src0_spad_data_ping, + data_src0 + (block_end * src0_row_size), + src0_row_size_aligned, + src0_row_size, + next_block_size + ); + + + }else{ + dma_queue_push(dma_queue, + src0_spad_data_pong, + data_src0 + (block_end * src0_row_size), + src0_row_size_aligned, + src0_row_size, + next_block_size + ); + } + ping_pong_flag=!ping_pong_flag; + } + const float * src0 = (float*)dma_queue_pop(dma_queue); for (uint32_t ib = ir; ib < block_end; ib++) { - const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); + + float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); - // gelu = x * sigmoid(1.702 * x) // current implementation - if (1 == opt_path) { - hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); - hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - } + // // gelu = x * sigmoid(1.702 * x) // current implementation + // if (1 == opt_path) { + // hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) dst_spad_data, ne0); + // hvx_fast_sigmoid_f32((const uint8_t *) dst_spad_data, (uint8_t *) dst_spad_data, ne0); + // hvx_mul_f32_opt((const uint8_t *) src0, dst_spad_data, (uint8_t *) dst, ne0); //TODO: can dma push dst_spad_data back? 
+ // } else { + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) dst_spad_data, ne0); + hvx_sigmoid_f32((const uint8_t *) dst_spad_data, (uint8_t *) dst_spad_data, ne0); + hvx_mul_f32((const uint8_t *) src0, dst_spad_data, (uint8_t *) dst, ne0); + //} + + src0 += src0_row_size_aligned/sizeof(float); // Move to next row } } @@ -335,7 +428,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) { struct htp_ops_context * octx = (struct htp_ops_context *) data; unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i, - octx->src0_nrows_per_thread); + octx->src0_nrows_per_thread, octx->ctx->dma[i]); } @@ -469,16 +562,44 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; const size_t src0_row_size = src0->nb[1]; - const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1]; + const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : 0; // zero bytes if src1 is not used const size_t dst_row_size = dst->nb[1]; + + const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN); + const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN); + const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN); // VTCM scratchpads for all tensors // N rows per thread, padded to HVX vector size - octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads; - octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads; - octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads; - size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size; + + size_t spad_size_per_row = (src0_row_size_aligned + + src1_row_size_aligned) + dst_row_size_aligned; + + size_t vtcm_row_per_thread = (octx->ctx->vtcm_size)/ (n_threads* spad_size_per_row); + + + // Make sure the reserved vtcm size is sufficient + if(vtcm_row_per_thread ==0){ + FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size, + spad_size_per_row * n_threads); + return HTP_STATUS_VTCM_TOO_SMALL; + } + + + + octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread; + octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread; + octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread; + + octx->dst_spad.size = n_threads* octx->dst_spad.size_per_thread; + octx->src0_spad.size = n_threads* octx->src0_spad.size_per_thread; + octx->src1_spad.size = n_threads* octx->src1_spad.size_per_thread; + + octx->src0_spad.data = octx->ctx->vtcm_base; + octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; + octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size; + if (src1->ne[0]) { FARF(HIGH, @@ -492,16 +613,6 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size); } - // Make sure the reserved vtcm size is sufficient - if (octx->ctx->vtcm_size < spad_size) { - FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size, - spad_size); - return HTP_STATUS_VTCM_TOO_SMALL; - } - - octx->src0_spad.data = octx->ctx->vtcm_base; - octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size; - octx->dst_spad.data = 
octx->src1_spad.data + octx->src1_spad.size; if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) { uint32_t n_jobs = MIN(n_threads, src0_nrows); diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index e30ae695022..d047ef8a356 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -299,7 +299,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que ctx->n_threads = n_hvx; for (int i = 0; i < ctx->n_threads; i++) { - ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2); + ctx->dma[i] = dma_queue_create(ctx->vtcm_size); //NOTE: for now, whole vtcm size } // init worker pool From 8fd05731b053e5b372778e6dfe65c6a5f322e88a Mon Sep 17 00:00:00 2001 From: shouyud Date: Wed, 17 Dec 2025 17:27:33 -0500 Subject: [PATCH 2/5] feat: gelu ping-pong for both in and out --- ggml/src/ggml-hexagon/htp/act-ops.c | 131 +++++++++++++------------- ggml/src/ggml-hexagon/htp/htp-dma.h | 14 ++- ggml/src/ggml-hexagon/htp/hvx-utils.h | 11 ++- 3 files changed, 90 insertions(+), 66 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c index cb4be01a5dc..4872dc02925 100644 --- a/ggml/src/ggml-hexagon/htp/act-ops.c +++ b/ggml/src/ggml-hexagon/htp/act-ops.c @@ -286,60 +286,20 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, return; } - int is_aligned = 1; - int opt_path = 0; - if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) { - is_aligned = 0; - FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n"); - } - if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) { - opt_path = 1; - } const uint8_t * data_src0 = (const uint8_t *) src0->data; uint8_t * data_dst = (uint8_t *) dst->data; - - - - // While given src0_spad->size_per_thread, divide it to two ping-pong buffer for src0 size_t src0_size_per_pingpong = src0_spad->size_per_thread / 2; + size_t dst_size_per_pingpong = dst_spad->size_per_thread / 2; uint8_t * src0_spad_data_ping = src0_spad->data + (ith * (src0_spad->size_per_thread)); uint8_t * src0_spad_data_pong = src0_spad_data_ping + src0_size_per_pingpong; - uint8_t * dst_spad_data = dst_spad->data + (ith * dst_spad->size_per_thread); - - // const int BLOCK = 8; - // for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { - // const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); - - // // Prefetch next block - // if (block_end < src0_end_row) { - // const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size)); - // htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size); - // } - - // // Process rows in current block - // for (uint32_t ib = ir; ib < block_end; ib++) { - // const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size)); - // float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); - - // // gelu = x * sigmoid(1.702 * x) // current implementation - // if (1 == opt_path) { - // hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0); - // hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - // hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0); - // } else { - // hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0); - // hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0); - // hvx_mul_f32((const uint8_t *) src0, 
src0_spad_data, (uint8_t *) dst, ne0); - // } - // } - // } - + uint8_t * dst_spad_data_ping = dst_spad->data + (ith * dst_spad->size_per_thread); + uint8_t * dst_spad_data_pong = dst_spad_data_ping + dst_size_per_pingpong; // Maybe should always go to the optimal path? // In gelu = x*sigmoid(x*1.702) @@ -362,10 +322,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, MIN(BLOCK, src0_end_row - src0_start_row) ); - bool ping_pong_flag = true; // true means the program use ping data to compute, false means use pong data to compute - - - + bool src0_ping_pong_flag = true; // true means the program use ping data to compute, false means use pong data to compute + bool dst_ping_pong_flag = true; // true means the program use ping data to compute, false means use pong data to compute for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) { const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); // The start index of next block @@ -373,10 +331,32 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, const uint32_t next_block_size = MIN(BLOCK, src0_end_row - block_end); + + float* cur_dst_spad_ptr; + const float * src0; + if(dst_ping_pong_flag){ + cur_dst_spad_ptr = (float*)dst_spad_data_ping; + }else{ + cur_dst_spad_ptr = (float*)dst_spad_data_pong; + } + + if(src0_ping_pong_flag){ + src0 = (float*)src0_spad_data_ping; + }else{ + src0 = (float*)src0_spad_data_pong; + } + + dma_queue_pop(dma_queue); // wait for dma done for the previous src0 fetch + + // Wait for the previous dst push to complete before we can reuse the dst buffer + if(ir != src0_start_row){ + dma_queue_pop(dma_queue); // wait for dma done for the previous dst push + } + // prefetch next loop iteration if any if (next_block_size > 0) { - if(ping_pong_flag == false){ + if(src0_ping_pong_flag == false){ dma_queue_push(dma_queue, src0_spad_data_ping, @@ -396,29 +376,54 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0, next_block_size ); } - ping_pong_flag=!ping_pong_flag; + src0_ping_pong_flag=!src0_ping_pong_flag; } - const float * src0 = (float*)dma_queue_pop(dma_queue); + for (uint32_t ib = ir; ib < block_end; ib++) { + // gelu = x * sigmoid(1.702 * x) // current implementation + hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) cur_dst_spad_ptr, ne0); + hvx_fast_sigmoid_f32((const uint8_t *) cur_dst_spad_ptr, (uint8_t *) cur_dst_spad_ptr, ne0); + hvx_mul_f32_opt((const uint8_t *) src0, (uint8_t *) cur_dst_spad_ptr, (uint8_t *) cur_dst_spad_ptr, ne0); + + + src0 += src0_row_size_aligned/sizeof(float); // Move to next row + cur_dst_spad_ptr+= dst_row_size_aligned/sizeof(float); + } - float * restrict dst = (float *) (data_dst + (ib * dst_row_size)); + float * restrict out_dst = (float *) (data_dst + (ir * dst_row_size)); + + if(dst_ping_pong_flag){ + dma_queue_push_width(dma_queue, + out_dst, + dst_spad_data_ping, + dst_row_size, // dst stride in DDR (actual row size) + dst_row_size_aligned, // src stride in VTCM (aligned) + dst_row_size, // width + (block_end - ir) + ); + }else{ + dma_queue_push_width(dma_queue, + out_dst, + dst_spad_data_pong, + dst_row_size, // dst stride in DDR (actual row size) + dst_row_size_aligned, // src stride in VTCM (aligned) + dst_row_size, // width + (block_end - ir) + ); + } - // // gelu = x * sigmoid(1.702 * x) // current implementation - // if (1 == opt_path) { - // hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) dst_spad_data, ne0); - // hvx_fast_sigmoid_f32((const uint8_t *) 
dst_spad_data, (uint8_t *) dst_spad_data, ne0); - // hvx_mul_f32_opt((const uint8_t *) src0, dst_spad_data, (uint8_t *) dst, ne0); //TODO: can dma push dst_spad_data back? - // } else { - hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) dst_spad_data, ne0); - hvx_sigmoid_f32((const uint8_t *) dst_spad_data, (uint8_t *) dst_spad_data, ne0); - hvx_mul_f32((const uint8_t *) src0, dst_spad_data, (uint8_t *) dst, ne0); - //} - src0 += src0_row_size_aligned/sizeof(float); // Move to next row + if(ir != src0_start_row){ + dma_queue_pop(dma_queue); // wait for dma done for the previous dst push } + // else is the first block,nothing to wait for dst push + + dst_ping_pong_flag = !dst_ping_pong_flag; } + dma_queue_pop(dma_queue); // wait for dma done for the last dst push + t2 = HAP_perf_get_qtimer_count(); FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02, @@ -562,7 +567,7 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) { const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3]; const size_t src0_row_size = src0->nb[1]; - const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : 0; // zero bytes if src1 is not used + const size_t src1_row_size = src1->ne[0] > 0 ? src1->nb[1] : 0; // zero bytes if src1 is not used const size_t dst_row_size = dst->nb[1]; diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.h b/ggml/src/ggml-hexagon/htp/htp-dma.h index 7d3fc4078cc..6f96f2f4f12 100644 --- a/ggml/src/ggml-hexagon/htp/htp-dma.h +++ b/ggml/src/ggml-hexagon/htp/htp-dma.h @@ -49,11 +49,12 @@ static inline unsigned int dmwait(void) { return ret; } -static inline bool dma_queue_push(dma_queue * q, +static inline bool dma_queue_push_width(dma_queue * q, void * dst, const void * src, size_t dst_row_size, size_t src_row_size, + size_t width, size_t nrows) { if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { return false; @@ -79,7 +80,7 @@ static inline bool dma_queue_push(dma_queue * q, desc->dst = (void *) dst; desc->allocation = 0; desc->padding = 0; - desc->roiwidth = src_row_size; + desc->roiwidth = width; desc->roiheight = nrows; desc->srcstride = src_row_size; desc->dststride = dst_row_size; @@ -96,6 +97,15 @@ static inline bool dma_queue_push(dma_queue * q, return true; } +static inline bool dma_queue_push(dma_queue * q, + void * dst, + const void * src, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + return dma_queue_push_width(q, dst, src, dst_row_size, src_row_size, src_row_size, nrows); +} + static inline uint8_t * dma_queue_pop(dma_queue * q) { if (q->push_idx == q->pop_idx) { return NULL; diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h index 566048297d3..d8f2130e31e 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h @@ -980,7 +980,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * int step_of_1 = num_elems >> 5; int remaining = num_elems - step_of_1 * VLEN_FP32; - assert(remaining == 0); + const HVX_Vector * restrict v_src = (HVX_Vector *) src; HVX_Vector * restrict v_dst = (HVX_Vector *) dst; @@ -996,6 +996,15 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t * for (int i = 0; i < step_of_1; i++) { v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp); } + + if(remaining > 0) { + const float * srcf = ((const float *) src) + step_of_1* VLEN_FP32; + float * dstf = (float *) dst + 
step_of_1*VLEN_FP32;
+
+        HVX_Vector in  = *(HVX_UVector *) srcf;
+        HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
+        hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
+    }
 }
 

From e875809ec7565384d45da758666e1910e63422b7 Mon Sep 17 00:00:00 2001
From: shouyud
Date: Wed, 17 Dec 2025 17:34:08 -0500
Subject: [PATCH 3/5] fix: fix compile error

---
 ggml/src/ggml-hexagon/htp/act-ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 4872dc02925..6d205408c1d 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -426,7 +426,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
+    FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
          ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 

From 54f7bbdf952601572c6ddf3134eae9c41bd6c6b3 Mon Sep 17 00:00:00 2001
From: shouyud
Date: Thu, 18 Dec 2025 11:13:27 -0500
Subject: [PATCH 4/5] break: distinguish dma ddr->vtcm and vtcm->ddr operations

---
 ggml/src/ggml-hexagon/htp/act-ops.c    | 12 +++++-------
 ggml/src/ggml-hexagon/htp/htp-dma.h    | 18 ++++++++++++++----
 ggml/src/ggml-hexagon/htp/matmul-ops.c | 24 ++++++++++++------------
 3 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 6d205408c1d..1d2459d9fcb 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -314,7 +314,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
     }
     // Do the inital dma fecth
     // fetch src0
-    dma_queue_push(dma_queue,
+    dma_queue_push_ddr_to_vtcm(dma_queue,
                    src0_spad_data_ping,
                    data_src0 + (src0_start_row * src0_row_size),
                    src0_row_size_aligned,
@@ -358,7 +358,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
         if (next_block_size > 0) {
             if(src0_ping_pong_flag == false){
-                dma_queue_push(dma_queue,
+                dma_queue_push_ddr_to_vtcm(dma_queue,
                                src0_spad_data_ping,
                                data_src0 + (block_end * src0_row_size),
                                src0_row_size_aligned,
@@ -368,7 +368,7 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
 
             }else{
-                dma_queue_push(dma_queue,
+                dma_queue_push_ddr_to_vtcm(dma_queue,
                                src0_spad_data_pong,
                                data_src0 + (block_end * src0_row_size),
                                src0_row_size_aligned,
@@ -394,21 +394,19 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
         float * restrict out_dst = (float *) (data_dst + (ir * dst_row_size));
 
         if(dst_ping_pong_flag){
-            dma_queue_push_width(dma_queue,
+            dma_queue_push_vtcm_to_ddr(dma_queue,
                             out_dst,
                             dst_spad_data_ping,
                             dst_row_size,          // dst stride in DDR (actual row size)
                             dst_row_size_aligned,  // src stride in VTCM (aligned)
-                            dst_row_size,          // width
                             (block_end - ir)
             );
         }else{
-            dma_queue_push_width(dma_queue,
+            dma_queue_push_vtcm_to_ddr(dma_queue,
                             out_dst,
                             dst_spad_data_pong,
                             dst_row_size,          // dst stride in DDR (actual row size)
                             dst_row_size_aligned,  // src stride in VTCM (aligned)
-                            dst_row_size,          // width
                             (block_end - ir)
             );
         }
diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.h b/ggml/src/ggml-hexagon/htp/htp-dma.h
index 6f96f2f4f12..e1095c4a967 100644
--- a/ggml/src/ggml-hexagon/htp/htp-dma.h
+++ b/ggml/src/ggml-hexagon/htp/htp-dma.h
@@
-49,12 +49,12 @@ static inline unsigned int dmwait(void) { return ret; } -static inline bool dma_queue_push_width(dma_queue * q, +static inline bool dma_queue_push(dma_queue * q, void * dst, const void * src, size_t dst_row_size, size_t src_row_size, - size_t width, + size_t width, // width in bytes. number of bytes to transfer per row size_t nrows) { if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { return false; @@ -97,13 +97,23 @@ static inline bool dma_queue_push_width(dma_queue * q, return true; } -static inline bool dma_queue_push(dma_queue * q, +static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q, + void * dst, + const void * src, + size_t dst_row_size, + size_t src_row_size, + size_t nrows) { + return dma_queue_push(q, dst, src, dst_row_size, src_row_size, src_row_size, nrows); +} + + +static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q, void * dst, const void * src, size_t dst_row_size, size_t src_row_size, size_t nrows) { - return dma_queue_push_width(q, dst, src, dst_row_size, src_row_size, src_row_size, nrows); + return dma_queue_push(q, dst, src, dst_row_size, src_row_size, dst_row_size, nrows); } static inline uint8_t * dma_queue_pop(dma_queue * q) { diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c index 346f0bd3398..d617da2cc4c 100644 --- a/ggml/src/ggml-hexagon/htp/matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c @@ -1115,7 +1115,7 @@ static void matmul(struct htp_matmul_type * mt, if (is0 >= HTP_SPAD_SRC0_NROWS) { break; } - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } @@ -1134,7 +1134,7 @@ static void matmul(struct htp_matmul_type * mt, const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; if (pr0 < src0_end_row_x2) { - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } } @@ -1143,7 +1143,7 @@ static void matmul(struct htp_matmul_type * mt, if (src0_end_row != src0_end_row_x2) { uint32_t ir0 = src0_end_row_x2; const int is0 = (ir0 - src0_start_row); - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); @@ -1217,7 +1217,7 @@ static void matvec(struct htp_matmul_type * mt, if (is0 >= HTP_SPAD_SRC0_NROWS) { break; } - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } @@ -1230,7 +1230,7 @@ static void matvec(struct htp_matmul_type * mt, const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; if (pr0 < src0_end_row_x2) { - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, 
src0_row_size_padded, src0_row_size, 2); } } @@ -1239,7 +1239,7 @@ static void matvec(struct htp_matmul_type * mt, if (src0_end_row != src0_end_row_x2) { const uint32_t ir0 = src0_end_row_x2; const uint32_t is0 = (ir0 - src0_start_row); - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col); @@ -1331,7 +1331,7 @@ static void matmul_id(struct htp_matmul_type * mt, if (is0 >= HTP_SPAD_SRC0_NROWS) { break; } - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } @@ -1356,7 +1356,7 @@ static void matmul_id(struct htp_matmul_type * mt, const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; if (pr0 < src0_end_row_x2) { - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } } @@ -1365,7 +1365,7 @@ static void matmul_id(struct htp_matmul_type * mt, if (src0_end_row != src0_end_row_x2) { uint32_t ir0 = src0_end_row_x2; const uint32_t is0 = (ir0 - src0_start_row); - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); @@ -1455,7 +1455,7 @@ static void matvec_id(struct htp_matmul_type * mt, if (is0 >= HTP_SPAD_SRC0_NROWS) { break; } - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } @@ -1468,7 +1468,7 @@ static void matvec_id(struct htp_matmul_type * mt, const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS); const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS; if (pr0 < src0_end_row_x2) { - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size, src0_row_size_padded, src0_row_size, 2); } } @@ -1477,7 +1477,7 @@ static void matvec_id(struct htp_matmul_type * mt, if (src0_end_row != src0_end_row_x2) { uint32_t ir0 = src0_end_row_x2; const uint32_t is0 = (ir0 - src0_start_row); - dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, + dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size, src0_row_size_padded, src0_row_size, 1); const uint8_t * ss0 = dma_queue_pop(dma_queue); mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col); From f3e9f6ac7f69c0db9427d8cb4cd8a634f4a4c6d4 Mon Sep 17 00:00:00 2001 From: shouyud Date: Thu, 18 Dec 2025 14:45:55 -0500 Subject: [PATCH 5/5] fix: fix dma queue size --- ggml/src/ggml-hexagon/htp/htp-dma.h | 1 + ggml/src/ggml-hexagon/htp/main.c | 2 
+- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.h b/ggml/src/ggml-hexagon/htp/htp-dma.h index e1095c4a967..ad871e6518e 100644 --- a/ggml/src/ggml-hexagon/htp/htp-dma.h +++ b/ggml/src/ggml-hexagon/htp/htp-dma.h @@ -57,6 +57,7 @@ static inline bool dma_queue_push(dma_queue * q, size_t width, // width in bytes. number of bytes to transfer per row size_t nrows) { if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) { + FARF(ERROR, "dma-push: queue full\n"); return false; } diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index d047ef8a356..c355adc8beb 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -299,7 +299,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que ctx->n_threads = n_hvx; for (int i = 0; i < ctx->n_threads; i++) { - ctx->dma[i] = dma_queue_create(ctx->vtcm_size); //NOTE: for now, whole vtcm size + ctx->dma[i] = dma_queue_create(64); //see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541 } // init worker pool
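
For reference, the gelu changes above are an instance of a standard double-buffered ("ping-pong") DMA streaming pattern: queue the fetch for the next block of rows into one VTCM buffer before waiting on the current one, so the DMA engine overlaps with the HVX compute. Below is a minimal sketch of just that pattern. It assumes the dma_queue_push_ddr_to_vtcm() / dma_queue_pop() helpers behave as declared in htp-dma.h after PATCH 4/5; stream_rows_pingpong, process_rows and the MIN fallback are illustrative placeholders and are not part of these patches.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "htp-dma.h"   // dma_queue, dma_queue_push_ddr_to_vtcm(), dma_queue_pop()

#ifndef MIN
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#endif

// Stream rows from DDR through two VTCM buffers, overlapping DMA with compute.
// process_rows() stands in for the per-block HVX kernel (e.g. the gelu math).
static void stream_rows_pingpong(dma_queue *     q,
                                 const uint8_t * src_ddr,
                                 uint8_t *       spad_ping,
                                 uint8_t *       spad_pong,
                                 size_t          row_size,          // actual row size in DDR
                                 size_t          row_size_aligned,  // VLEN-aligned row stride in VTCM
                                 uint32_t        start_row,
                                 uint32_t        end_row,
                                 uint32_t        block,             // rows per ping/pong buffer
                                 void (*process_rows)(const uint8_t * rows, uint32_t nrows)) {
    // Prime the pipeline: fetch the first block into the ping buffer.
    dma_queue_push_ddr_to_vtcm(q, spad_ping, src_ddr + (size_t) start_row * row_size,
                               row_size_aligned, row_size, MIN(block, end_row - start_row));

    bool next_is_pong = true;  // which buffer the *next* fetch should fill

    for (uint32_t ir = start_row; ir < end_row; ir += block) {
        const uint32_t block_end = MIN(ir + block, end_row);
        const uint32_t next_rows = MIN(block, end_row - block_end);

        // Queue the next fetch into the other buffer before waiting on the
        // current one, so the DMA transfer runs while this block is computed.
        if (next_rows > 0) {
            uint8_t * next_buf = next_is_pong ? spad_pong : spad_ping;
            dma_queue_push_ddr_to_vtcm(q, next_buf, src_ddr + (size_t) block_end * row_size,
                                       row_size_aligned, row_size, next_rows);
            next_is_pong = !next_is_pong;
        }

        // Wait for the oldest outstanding fetch (the current block) and use it.
        const uint8_t * cur = dma_queue_pop(q);
        process_rows(cur, block_end - ir);
    }
}

The same shape applies in reverse for the dst writeback in the gelu path, using dma_queue_push_vtcm_to_ddr() with a second pair of VTCM buffers.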