214 changes: 164 additions & 50 deletions ggml/src/ggml-hexagon/htp/act-ops.c
@@ -263,14 +263,18 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
struct htp_spad * dst_spad,
uint32_t nth,
uint32_t ith,
uint32_t src0_nrows_per_thread) {
uint32_t src0_nrows_per_thread,
dma_queue * dma_queue
) {
htp_act_preamble2;

uint64_t t1, t2;
t1 = HAP_perf_get_qtimer_count();

const size_t src0_row_size = nb01;
const size_t dst_row_size = nb1;
const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);

const uint32_t src0_nrows = ne01 * ne02 * ne03;

@@ -282,60 +286,152 @@
return;
}

int is_aligned = 1;
int opt_path = 0;
if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
is_aligned = 0;
FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
}
if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
opt_path = 1;
}

const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
uint8_t * restrict data_dst = (uint8_t *) dst->data;
const uint8_t * data_src0 = (const uint8_t *) src0->data;
uint8_t * data_dst = (uint8_t *) dst->data;

uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
uint8_t * restrict dst_spad_data = dst_spad->data + (ith * dst_row_size);

const int BLOCK = 8;

// Split each thread's src0_spad->size_per_thread into two halves, forming a ping-pong buffer pair for src0 (dst likewise)
size_t src0_size_per_pingpong = src0_spad->size_per_thread / 2;
size_t dst_size_per_pingpong = dst_spad->size_per_thread / 2;

uint8_t * src0_spad_data_ping = src0_spad->data + (ith * (src0_spad->size_per_thread));
uint8_t * src0_spad_data_pong = src0_spad_data_ping + src0_size_per_pingpong;
uint8_t * dst_spad_data_ping = dst_spad->data + (ith * dst_spad->size_per_thread);
uint8_t * dst_spad_data_pong = dst_spad_data_ping + dst_size_per_pingpong;

// gelu(x) = x * sigmoid(1.702 * x); the VTCM-staged fast path is now taken unconditionally
const int BLOCK = src0_size_per_pingpong / src0_row_size_aligned; // how many rows fit in one ping-pong half

// TODO: handle this case gracefully instead of failing the op
if (BLOCK == 0) {
FARF(ERROR, "gelu-f32: per-thread VTCM reservation %zu is too small to double-buffer even 1 row, need at least %zu\n", src0_spad->size_per_thread,
2 * src0_row_size_aligned);
return;
}
// Issue the initial DMA fetch for src0
dma_queue_push_ddr_to_vtcm(dma_queue, src0_spad_data_ping, data_src0 + (src0_start_row * src0_row_size),
src0_row_size_aligned, src0_row_size, MIN(BLOCK, src0_end_row - src0_start_row));
bool src0_ping_pong_flag = true; // true: compute reads src0 from the ping half, false: from the pong half
bool dst_ping_pong_flag = true; // true: compute writes dst into the ping half, false: into the pong half

for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
const uint32_t block_end = MIN(ir + BLOCK, src0_end_row); // exclusive end of this block (== start of the next block)


// Prefetch next block
if (block_end < src0_end_row) {
const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
const uint32_t next_block_size = MIN(BLOCK, src0_end_row - block_end);


float * cur_dst_spad_ptr;
const float * src0;
if (dst_ping_pong_flag) {
cur_dst_spad_ptr = (float *) dst_spad_data_ping;
} else {
cur_dst_spad_ptr = (float *) dst_spad_data_pong;
}

// Process rows in current block
for (uint32_t ib = ir; ib < block_end; ib++) {
const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
float * restrict dst = (float *) (data_dst + (ib * dst_row_size));
if (src0_ping_pong_flag) {
src0 = (float *) src0_spad_data_ping;
} else {
src0 = (float *) src0_spad_data_pong;
}

// gelu = x * sigmoid(1.702 * x) // current implementation
if (1 == opt_path) {
hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
} else {
hvx_mul_scalar_f32( (const uint8_t *) src0, (float)1.702, (uint8_t *) src0_spad_data, ne0);
hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
dma_queue_pop(dma_queue); // wait for dma done for the previous src0 fetch

// Wait for the previous dst push to complete; completion is in order, so the
// half we are about to overwrite (pushed two blocks ago) is also free
if (ir != src0_start_row) {
dma_queue_pop(dma_queue); // wait for dma done for the previous dst push
}

// Prefetch the next block, if any, into the other half
if (next_block_size > 0) {
if (src0_ping_pong_flag == false) {
dma_queue_push_ddr_to_vtcm(dma_queue, src0_spad_data_ping, data_src0 + (block_end * src0_row_size),
src0_row_size_aligned, src0_row_size, next_block_size);
} else {
dma_queue_push_ddr_to_vtcm(dma_queue, src0_spad_data_pong, data_src0 + (block_end * src0_row_size),
src0_row_size_aligned, src0_row_size, next_block_size);
}
src0_ping_pong_flag = !src0_ping_pong_flag;
}

for (uint32_t ib = ir; ib < block_end; ib++) {
// gelu = x * sigmoid(1.702 * x) // current implementation
hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) cur_dst_spad_ptr, ne0);
hvx_fast_sigmoid_f32((const uint8_t *) cur_dst_spad_ptr, (uint8_t *) cur_dst_spad_ptr, ne0);
hvx_mul_f32_opt((const uint8_t *) src0, (uint8_t *) cur_dst_spad_ptr, (uint8_t *) cur_dst_spad_ptr, ne0);

src0 += src0_row_size_aligned / sizeof(float); // move to the next row
cur_dst_spad_ptr += dst_row_size_aligned / sizeof(float);
}


float * restrict out_dst = (float *) (data_dst + (ir * dst_row_size));

if (dst_ping_pong_flag) {
dma_queue_push_vtcm_to_ddr(dma_queue, out_dst, dst_spad_data_ping,
dst_row_size, // dst stride in DDR (actual row size)
dst_row_size_aligned, // src stride in VTCM (aligned)
block_end - ir);
} else {
dma_queue_push_vtcm_to_ddr(dma_queue, out_dst, dst_spad_data_pong,
dst_row_size, // dst stride in DDR (actual row size)
dst_row_size_aligned, // src stride in VTCM (aligned)
block_end - ir);
}


dst_ping_pong_flag = !dst_ping_pong_flag;
}

dma_queue_pop(dma_queue); // wait for dma done for the last dst push

t2 = HAP_perf_get_qtimer_count();

FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
}
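Stripped of the op-specific details, the new loop is a conventional double-buffered DMA pipeline: wait for the current input, retire the previous output, kick off the next load, compute, kick off the writeback. A minimal sketch with hypothetical fetch/flush/dma_wait/compute stand-ins for the dma_queue_* calls (the real code keeps separate ping-pong flags for src0 and dst, but they advance in lockstep):

// Sketch of the ping-pong pipeline above (hypothetical helpers, not the real API).
static void fetch(int row, int half) { /* dma_queue_push_ddr_to_vtcm into this half */ (void) row; (void) half; }
static void flush(int row, int half) { /* dma_queue_push_vtcm_to_ddr from this half */ (void) row; (void) half; }
static void dma_wait(void) { /* dma_queue_pop: wait for the oldest descriptor */ }
static void compute(int half) { /* HVX gelu over the rows resident in this half */ (void) half; }

static void pipeline(int first, int last, int block) {
    int half = 0;
    fetch(first, half);                                  // prime the pipe: first block -> ping
    for (int ir = first; ir < last; ir += block) {
        dma_wait();                                      // current input block is resident in VTCM
        if (ir != first) dma_wait();                     // previous output block reached DDR
        if (ir + block < last) fetch(ir + block, half ^ 1); // overlap the next load with compute
        compute(half);                                   // consume input, produce output in VTCM
        flush(ir, half);                                 // start writing results back to DDR
        half ^= 1;
    }
    dma_wait();                                          // drain the final writeback
}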

static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
struct htp_ops_context * octx = (struct htp_ops_context *) data;
unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
octx->src0_nrows_per_thread);
octx->src0_nrows_per_thread, octx->ctx->dma[i]);
}
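For sanity-checking the HVX path, the approximation used above reduces to a one-line scalar reference (a sketch; expf stands in for the guarded HVX sigmoid approximation, which may differ in the last few ULPs):

#include <math.h>

// Scalar reference for gelu(x) ~= x * sigmoid(1.702 * x), as computed above.
static float gelu_quick_ref(float x) {
    return x / (1.0f + expf(-1.702f * x));
}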


@@ -469,16 +565,44 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];

const size_t src0_row_size = src0->nb[1];
const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
const size_t src1_row_size = src1->ne[0] > 0 ? src1->nb[1] : 0; // zero bytes if src1 is not used
const size_t dst_row_size = dst->nb[1];


const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
const size_t dst_row_size_aligned = htp_round_up(dst_row_size, VLEN);
// VTCM scratchpads for all tensors
// N rows per thread, padded to HVX vector size
octx->dst_spad.size = htp_round_up(dst_row_size, 128) * octx->n_threads;
octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;

size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;

size_t spad_size_per_row = src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned;

size_t vtcm_row_per_thread = octx->ctx->vtcm_size / (n_threads * spad_size_per_row);


// Make sure the reserved VTCM size is sufficient
if (vtcm_row_per_thread == 0) {
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n", op_type, octx->ctx->vtcm_size,
spad_size_per_row * n_threads);
return HTP_STATUS_VTCM_TOO_SMALL;
}



octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
octx->dst_spad.size_per_thread = dst_row_size_aligned * vtcm_row_per_thread;

octx->dst_spad.size = n_threads * octx->dst_spad.size_per_thread;
octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;

octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;


if (src1->ne[0]) {
FARF(HIGH,
@@ -492,16 +616,6 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
}

// Make sure the reserved vtcm size is sufficient
if (octx->ctx->vtcm_size < spad_size) {
FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
spad_size);
return HTP_STATUS_VTCM_TOO_SMALL;
}

octx->src0_spad.data = octx->ctx->vtcm_base;
octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
octx->dst_spad.data = octx->src1_spad.data + octx->src1_spad.size;

if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
uint32_t n_jobs = MIN(n_threads, src0_nrows);
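To make the new sizing arithmetic concrete, a worked example with illustrative numbers (hypothetical, not taken from this PR):

#include <stdio.h>

// Hypothetical numbers: 4096-element fp32 rows, unary op (no src1), 4 MiB VTCM, 4 threads.
int main(void) {
    const size_t row = 4096 * sizeof(float);                       // 16384 B, already VLEN-aligned
    const size_t per_row = row /*src0*/ + 0 /*src1*/ + row /*dst*/; // 32768 B staged per row
    const size_t vtcm = 4u << 20, n_threads = 4;
    const size_t rows_per_thread = vtcm / (n_threads * per_row);   // 32 rows
    const size_t block = (rows_per_thread * row / 2) / row;        // 16 rows per ping-pong half
    printf("rows/thread=%zu BLOCK=%zu\n", rows_per_thread, block);
    return 0;
}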
23 changes: 22 additions & 1 deletion ggml/src/ggml-hexagon/htp/htp-dma.h
@@ -54,8 +54,10 @@ static inline bool dma_queue_push(dma_queue * q,
const void * src,
size_t dst_row_size,
size_t src_row_size,
size_t width, // ROI width: number of bytes to transfer per row
size_t nrows) {
if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
FARF(ERROR, "dma-push: queue full\n");
return false;
}

@@ -79,7 +81,7 @@
desc->dst = (void *) dst;
desc->allocation = 0;
desc->padding = 0;
desc->roiwidth = src_row_size;
desc->roiwidth = width;
desc->roiheight = nrows;
desc->srcstride = src_row_size;
desc->dststride = dst_row_size;
@@ -96,6 +98,25 @@
return true;
}

// DDR -> VTCM: copy full (unpadded) source rows; the VTCM destination rows may
// carry alignment padding, so the ROI width is the source row size.
static inline bool dma_queue_push_ddr_to_vtcm(dma_queue * q,
void * dst,
const void * src,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dst, src, dst_row_size, src_row_size, src_row_size, nrows);
}


// VTCM -> DDR: write back only the (unpadded) destination row size per row; the
// VTCM source rows may carry alignment padding, so the ROI width is dst_row_size.
static inline bool dma_queue_push_vtcm_to_ddr(dma_queue * q,
void * dst,
const void * src,
size_t dst_row_size,
size_t src_row_size,
size_t nrows) {
return dma_queue_push(q, dst, src, dst_row_size, src_row_size, dst_row_size, nrows);
}
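The two wrappers differ only in which stride doubles as the ROI width. In plain C, the 2D descriptor built by dma_queue_push (roiwidth, roiheight, srcstride, dststride) behaves roughly like this reference model (a sketch, not the DMA engine):

#include <stdint.h>
#include <string.h>

// Reference semantics of the 2D transfer: `width` bytes per row, each side
// advancing by its own stride. Padding bytes are never touched.
static void dma2d_ref(uint8_t * dst, const uint8_t * src,
                      size_t dst_stride, size_t src_stride,
                      size_t width, size_t nrows) {
    for (size_t r = 0; r < nrows; r++) {
        memcpy(dst + r * dst_stride, src + r * src_stride, width);
    }
}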

static inline uint8_t * dma_queue_pop(dma_queue * q) {
if (q->push_idx == q->pop_idx) {
return NULL;
11 changes: 10 additions & 1 deletion ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -980,7 +980,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
int step_of_1 = num_elems >> 5;
int remaining = num_elems - step_of_1 * VLEN_FP32;

assert(remaining == 0);


const HVX_Vector * restrict v_src = (HVX_Vector *) src;
HVX_Vector * restrict v_dst = (HVX_Vector *) dst;
@@ -996,6 +996,15 @@
for (int i = 0; i < step_of_1; i++) {
v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
}

if (remaining > 0) {
const float * srcf = ((const float *) src) + step_of_1 * VLEN_FP32;
float * dstf = ((float *) dst) + step_of_1 * VLEN_FP32;

// Unaligned vector load covers the tail; only the valid `remaining` lanes are stored back
HVX_Vector in = *(HVX_UVector *) srcf;
HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
}
}
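The new tail path is the usual HVX idiom: one extra unaligned full-vector load, compute on all 32 lanes, then store only the remaining * 4 valid bytes. What those tail lanes must compute, as a scalar sketch (expf stands in for the guarded approximation):

#include <math.h>

// Scalar equivalent of the tail lanes handled by hvx_vec_store_u above.
static void sigmoid_tail_ref(const float * src, float * dst, int remaining) {
    for (int i = 0; i < remaining; i++) {
        dst[i] = 1.0f / (1.0f + expf(-src[i]));
    }
}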


2 changes: 1 addition & 1 deletion ggml/src/ggml-hexagon/htp/main.c
@@ -299,7 +299,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que

ctx->n_threads = n_hvx;
for (int i = 0; i < ctx->n_threads; i++) {
ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
ctx->dma[i] = dma_queue_create(64); // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
}
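Given the mask-based full check in dma_queue_push (((push_idx + 1) & idx_mask) == pop_idx), the depth passed to dma_queue_create presumably has to be a power of two, with one slot reserved to distinguish full from empty. A sketch of that assumed invariant (an assumption about dma_queue_create, not its actual internals):

#include <assert.h>
#include <stdint.h>

// Assumed ring-buffer invariant behind dma_queue_create(n) (sketch only).
static uint32_t ring_capacity(uint32_t n) {
    assert(n != 0 && (n & (n - 1)) == 0); // idx_mask = n - 1 only works for powers of two
    return n - 1;                         // one slot reserved so full != empty
}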

// init worker pool