diff --git a/ggml/src/ggml-hexagon/htp/act-ops.c b/ggml/src/ggml-hexagon/htp/act-ops.c
index 59c0a70963..1d2459d9fc 100644
--- a/ggml/src/ggml-hexagon/htp/act-ops.c
+++ b/ggml/src/ggml-hexagon/htp/act-ops.c
@@ -263,7 +263,9 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
                                        struct htp_spad *         dst_spad,
                                        uint32_t                  nth,
                                        uint32_t                  ith,
-                                       uint32_t                  src0_nrows_per_thread) {
+                                       uint32_t                  src0_nrows_per_thread,
+                                       dma_queue *               dma_queue
+                                       ) {
     htp_act_preamble2;
 
     uint64_t t1, t2;
@@ -271,6 +273,8 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
 
     const size_t src0_row_size = nb01;
     const size_t dst_row_size  = nb1;
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);
 
     const uint32_t src0_nrows = ne01 * ne02 * ne03;
 
@@ -282,60 +286,152 @@ static void unary_gelu_fp32_per_thread(const struct htp_tensor * src0,
         return;
     }
 
-    int is_aligned = 1;
-    int opt_path   = 0;
-    if (!htp_is_aligned((void *) src0->data, VLEN) || !htp_is_aligned((void *) dst->data, VLEN)) {
-        is_aligned = 0;
-        FARF(HIGH, "silu-f32: unaligned addresses in elementwise op, possibly slower execution\n");
-    }
-    if ((1 == is_aligned) && !(nb01 & (VLEN - 1))) {
-        opt_path = 1;
-    }
-
-    const uint8_t * restrict data_src0 = (const uint8_t *) src0->data;
-    uint8_t * restrict       data_dst  = (uint8_t *) dst->data;
+    const uint8_t * data_src0 = (const uint8_t *) src0->data;
+    uint8_t *       data_dst  = (uint8_t *) dst->data;
 
-    uint8_t * restrict src0_spad_data = src0_spad->data + (ith * src0_row_size);
-    uint8_t * restrict dst_spad_data  = dst_spad->data + (ith * dst_row_size);
-
-    const int BLOCK = 8;
+    // Split the per-thread scratchpad (src0_spad->size_per_thread) into two ping-pong halves
+    size_t src0_size_per_pingpong = src0_spad->size_per_thread / 2;
+    size_t dst_size_per_pingpong  = dst_spad->size_per_thread / 2;
+
+    uint8_t * src0_spad_data_ping = src0_spad->data + (ith * src0_spad->size_per_thread);
+    uint8_t * src0_spad_data_pong = src0_spad_data_ping + src0_size_per_pingpong;
+    uint8_t * dst_spad_data_ping  = dst_spad->data + (ith * dst_spad->size_per_thread);
+    uint8_t * dst_spad_data_pong  = dst_spad_data_ping + dst_size_per_pingpong;
+
+    // Rows are staged in VLEN-aligned VTCM buffers, so the fast (aligned) path is always used below.
+    // gelu(x) = x * sigmoid(1.702 * x)
+    // BLOCK is the number of rows that fit into one ping-pong half
+    const int BLOCK = src0_size_per_pingpong / src0_row_size_aligned;
+
+    // Make sure at least one row fits per ping-pong half
+    if (BLOCK == 0) {
+        FARF(ERROR, "gelu-f32: per-thread VTCM scratchpad %zu is too small for even 1 row per ping-pong half (row size %zu)\n",
+             src0_spad->size_per_thread, src0_row_size_aligned);
+        return;
+    }
+
+    // Do the initial DMA fetch of src0
+    dma_queue_push_ddr_to_vtcm(dma_queue,
+                               src0_spad_data_ping,
+                               data_src0 + (src0_start_row * src0_row_size),
+                               src0_row_size_aligned,
+                               src0_row_size,
+                               MIN(BLOCK, src0_end_row - src0_start_row));
+
+    bool src0_ping_pong_flag = true;  // true: compute from the ping buffer, false: from the pong buffer
+    bool dst_ping_pong_flag  = true;  // true: write into the ping buffer, false: into the pong buffer
 
     for (uint32_t ir = src0_start_row; ir < src0_end_row; ir += BLOCK) {
-        const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);
+        const uint32_t block_end = MIN(ir + BLOCK, src0_end_row);  // first row index of the next block
 
-        // Prefetch next block
-        if (block_end < src0_end_row) {
-            const float * restrict prefetch_ptr = (float *) (data_src0 + (block_end * src0_row_size));
-            htp_l2fetch(prefetch_ptr, 1, block_end * src0_row_size, src0_row_size);
+        const uint32_t next_block_size = MIN(BLOCK, src0_end_row - block_end);
+
+        float *       cur_dst_spad_ptr;
+        const float * src0;
+
+        if (dst_ping_pong_flag) {
+            cur_dst_spad_ptr = (float *) dst_spad_data_ping;
+        } else {
+            cur_dst_spad_ptr = (float *) dst_spad_data_pong;
         }
 
-        // Process rows in current block
-        for (uint32_t ib = ir; ib < block_end; ib++) {
-            const float * restrict src0 = (float *) (data_src0 + (ib * src0_row_size));
-            float * restrict       dst  = (float *) (data_dst + (ib * dst_row_size));
+        if (src0_ping_pong_flag) {
+            src0 = (const float *) src0_spad_data_ping;
+        } else {
+            src0 = (const float *) src0_spad_data_pong;
+        }
 
-            // gelu = x * sigmoid(1.702 * x) // current implementation
-            if (1 == opt_path) {
-                hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
-                hvx_fast_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
-                hvx_mul_f32_opt((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
-            } else {
-                hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) src0_spad_data, ne0);
-                hvx_sigmoid_f32((const uint8_t *) src0_spad_data, (uint8_t *) src0_spad_data, ne0);
-                hvx_mul_f32((const uint8_t *) src0, src0_spad_data, (uint8_t *) dst, ne0);
+        dma_queue_pop(dma_queue);  // wait for the src0 fetch of this block
+
+        // Wait for the previous dst push to complete before reusing the dst buffer
+        if (ir != src0_start_row) {
+            dma_queue_pop(dma_queue);  // wait for the previous dst push
+        }
+
+        // Prefetch the next block, if any
+        if (next_block_size > 0) {
+            if (src0_ping_pong_flag == false) {
+                dma_queue_push_ddr_to_vtcm(dma_queue,
+                                           src0_spad_data_ping,
+                                           data_src0 + (block_end * src0_row_size),
+                                           src0_row_size_aligned,
+                                           src0_row_size,
+                                           next_block_size);
+            } else {
+                dma_queue_push_ddr_to_vtcm(dma_queue,
+                                           src0_spad_data_pong,
+                                           data_src0 + (block_end * src0_row_size),
+                                           src0_row_size_aligned,
+                                           src0_row_size,
+                                           next_block_size);
             }
+            src0_ping_pong_flag = !src0_ping_pong_flag;
+        }
+
+        for (uint32_t ib = ir; ib < block_end; ib++) {
+            // gelu = x * sigmoid(1.702 * x) // current implementation
+            hvx_mul_scalar_f32((const uint8_t *) src0, (float) 1.702, (uint8_t *) cur_dst_spad_ptr, ne0);
+            hvx_fast_sigmoid_f32((const uint8_t *) cur_dst_spad_ptr, (uint8_t *) cur_dst_spad_ptr, ne0);
+            hvx_mul_f32_opt((const uint8_t *) src0, (uint8_t *) cur_dst_spad_ptr, (uint8_t *) cur_dst_spad_ptr, ne0);
+
+            src0             += src0_row_size_aligned / sizeof(float);  // move to the next row
+            cur_dst_spad_ptr += dst_row_size_aligned / sizeof(float);
         }
+
+        float * restrict out_dst = (float *) (data_dst + (ir * dst_row_size));
+
+        if (dst_ping_pong_flag) {
+            dma_queue_push_vtcm_to_ddr(dma_queue,
+                                       out_dst,
+                                       dst_spad_data_ping,
+                                       dst_row_size,          // dst stride in DDR (actual row size)
+                                       dst_row_size_aligned,  // src stride in VTCM (aligned)
+                                       (block_end - ir));
+        } else {
+            dma_queue_push_vtcm_to_ddr(dma_queue,
+                                       out_dst,
+                                       dst_spad_data_pong,
+                                       dst_row_size,          // dst stride in DDR (actual row size)
+                                       dst_row_size_aligned,  // src stride in VTCM (aligned)
+                                       (block_end - ir));
+        }
+
+        if (ir != src0_start_row) {
+            dma_queue_pop(dma_queue);  // wait for the previous dst push
+        }
+        // else: first block, no previous dst push to wait for
+
+        dst_ping_pong_flag = !dst_ping_pong_flag;
     }
 
+    dma_queue_pop(dma_queue);  // wait for the last dst push
+
     t2 = HAP_perf_get_qtimer_count();
 
-    FARF(HIGH, "gelu-f32 %d/%d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, opt_path, ne00, ne01, ne02,
+    FARF(HIGH, "gelu-f32 %d/%d: %ux%ux%ux%u (%u:%u) -> %ux%ux%ux%u usec %u\n", ith, nth, ne00, ne01, ne02,
          ne03, src0_start_row, src0_end_row, ne0, ne1, ne2, ne3, (unsigned) HAP_perf_qtimer_count_to_us(t2 - t1));
 }
 
 static void unary_gelu_fp32(unsigned int n, unsigned int i, void * data) {
     struct htp_ops_context * octx = (struct htp_ops_context *) data;
 
     unary_gelu_fp32_per_thread(&octx->src0, &octx->dst, octx->op_params, &octx->src0_spad, &octx->dst_spad, n, i,
-                               octx->src0_nrows_per_thread);
+                               octx->src0_nrows_per_thread, octx->ctx->dma[i]);
 }
 
@@ -469,16 +565,44 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
     const uint32_t src0_nrows = src0->ne[1] * src0->ne[2] * src0->ne[3];
 
     const size_t src0_row_size = src0->nb[1];
-    const size_t src1_row_size = src1->ne[0] ? src1->nb[1] : src0->nb[1];
+    const size_t src1_row_size = src1->ne[0] > 0 ? src1->nb[1] : 0;  // zero bytes if src1 is not used
     const size_t dst_row_size  = dst->nb[1];
 
+    const size_t src0_row_size_aligned = htp_round_up(src0_row_size, VLEN);
+    const size_t src1_row_size_aligned = htp_round_up(src1_row_size, VLEN);
+    const size_t dst_row_size_aligned  = htp_round_up(dst_row_size, VLEN);
+
     // VTCM scratchpads for all tensors
     // N rows per thread, padded to HVX vector size
-    octx->dst_spad.size  = htp_round_up(dst_row_size, 128) * octx->n_threads;
-    octx->src0_spad.size = htp_round_up(src0_row_size, 128) * octx->n_threads;
-    octx->src1_spad.size = htp_round_up(src1_row_size, 128) * octx->n_threads;
-
-    size_t spad_size = octx->src0_spad.size + octx->src1_spad.size + octx->dst_spad.size;
+    size_t spad_size_per_row = src0_row_size_aligned + src1_row_size_aligned + dst_row_size_aligned;
+
+    size_t vtcm_row_per_thread = octx->ctx->vtcm_size / (n_threads * spad_size_per_row);
+
+    // Make sure the reserved VTCM size fits at least one row per thread
+    if (vtcm_row_per_thread == 0) {
+        FARF(ERROR, "act-%s: current VTCM reservation %zu is too small for even 1 row per thread, needed at least %zu\n",
+             op_type, octx->ctx->vtcm_size, spad_size_per_row * n_threads);
+        return HTP_STATUS_VTCM_TOO_SMALL;
+    }
+
+    octx->src0_spad.size_per_thread = src0_row_size_aligned * vtcm_row_per_thread;
+    octx->src1_spad.size_per_thread = src1_row_size_aligned * vtcm_row_per_thread;
+    octx->dst_spad.size_per_thread  = dst_row_size_aligned * vtcm_row_per_thread;
+
+    octx->dst_spad.size  = n_threads * octx->dst_spad.size_per_thread;
+    octx->src0_spad.size = n_threads * octx->src0_spad.size_per_thread;
+    octx->src1_spad.size = n_threads * octx->src1_spad.size_per_thread;
+
+    octx->src0_spad.data = octx->ctx->vtcm_base;
+    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
+    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
+
     if (src1->ne[0]) {
         FARF(HIGH,
@@ -492,16 +616,6 @@ static int execute_op_activations_fp32(struct htp_ops_context * octx) {
               octx->src0_spad.size, octx->src1_spad.size, octx->dst_spad.size);
     }
 
-    // Make sure the reserved vtcm size is sufficient
-    if (octx->ctx->vtcm_size < spad_size) {
-        FARF(ERROR, "act-%s : current VTCM reservation %zu is too small, needed %zu\n", op_type, octx->ctx->vtcm_size,
-             spad_size);
-        return HTP_STATUS_VTCM_TOO_SMALL;
-    }
-
-    octx->src0_spad.data = octx->ctx->vtcm_base;
-    octx->src1_spad.data = octx->src0_spad.data + octx->src0_spad.size;
-    octx->dst_spad.data  = octx->src1_spad.data + octx->src1_spad.size;
 
     if (!(octx->flags & HTP_OPFLAGS_SKIP_COMPUTE)) {
         uint32_t n_jobs = MIN(n_threads, src0_nrows);
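For intuition, the double-buffered (ping-pong) staging used by the new gelu path boils down to: process the rows already sitting in one half of the scratchpad while the next block of rows is fetched into the other half, then swap. The stand-alone sketch below mirrors only the src0 staging side of that structure (the real kernel double-buffers the output rows the same way and DMAs them back to DDR); plain `memcpy` and a scalar `gelu_row()` stand in for the asynchronous DMA queue and the HVX kernels, and the buffer names and sizes are illustrative only, not the HTP API.

```c
// Minimal, synchronous illustration of ping-pong row staging (not the HTP API).
#include <math.h>
#include <stdio.h>
#include <string.h>

#define NROWS 10
#define NCOLS 4
#define BLOCK 2  // rows per ping-pong half

static void gelu_row(const float * src, float * dst, int n) {
    for (int i = 0; i < n; i++) {
        dst[i] = src[i] / (1.0f + expf(-1.702f * src[i]));  // x * sigmoid(1.702 * x)
    }
}

int main(void) {
    float src[NROWS][NCOLS], dst[NROWS][NCOLS];
    float spad[2][BLOCK][NCOLS];  // ping (0) and pong (1) staging buffers

    for (int r = 0; r < NROWS; r++)
        for (int c = 0; c < NCOLS; c++)
            src[r][c] = 0.1f * (float) (r * NCOLS + c);

    int buf = 0;
    memcpy(spad[buf], src[0], sizeof(spad[buf]));  // initial "fetch" of block 0

    for (int ir = 0; ir < NROWS; ir += BLOCK) {
        int block_end = ir + BLOCK < NROWS ? ir + BLOCK : NROWS;

        // "prefetch" the next block into the other buffer while this one is processed
        if (block_end < NROWS) {
            int next = block_end + BLOCK < NROWS ? BLOCK : NROWS - block_end;
            memcpy(spad[buf ^ 1], src[block_end], (size_t) next * NCOLS * sizeof(float));
        }

        for (int ib = ir; ib < block_end; ib++) {
            gelu_row(spad[buf][ib - ir], dst[ib], NCOLS);
        }
        buf ^= 1;  // swap ping and pong
    }

    printf("dst[9][3] = %f\n", dst[9][3]);
    return 0;
}
```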
diff --git a/ggml/src/ggml-hexagon/htp/htp-dma.h b/ggml/src/ggml-hexagon/htp/htp-dma.h
index 7d3fc4078c..ad871e6518 100644
--- a/ggml/src/ggml-hexagon/htp/htp-dma.h
+++ b/ggml/src/ggml-hexagon/htp/htp-dma.h
@@ -54,8 +54,10 @@ static inline bool dma_queue_push(dma_queue * q,
                                   const void * src,
                                   size_t       dst_row_size,
                                   size_t       src_row_size,
+                                  size_t       width,  // width in bytes: number of bytes to transfer per row
                                   size_t       nrows) {
     if (((q->push_idx + 1) & q->idx_mask) == q->pop_idx) {
+        FARF(ERROR, "dma-push: queue full\n");
         return false;
     }
 
@@ -79,7 +81,7 @@ static inline bool dma_queue_push(dma_queue * q,
     desc->dst        = (void *) dst;
     desc->allocation = 0;
     desc->padding    = 0;
-    desc->roiwidth   = src_row_size;
+    desc->roiwidth   = width;
     desc->roiheight  = nrows;
     desc->srcstride  = src_row_size;
     desc->dststride  = dst_row_size;
@@ -96,6 +98,25 @@ static inline bool dma_queue_push(dma_queue * q,
     return true;
 }
 
+static inline bool dma_queue_push_ddr_to_vtcm(dma_queue *  q,
+                                              void *       dst,
+                                              const void * src,
+                                              size_t       dst_row_size,
+                                              size_t       src_row_size,
+                                              size_t       nrows) {
+    // rows are packed in DDR, so the transfer width is the source (DDR) row size
+    return dma_queue_push(q, dst, src, dst_row_size, src_row_size, src_row_size, nrows);
+}
+
+static inline bool dma_queue_push_vtcm_to_ddr(dma_queue *  q,
+                                              void *       dst,
+                                              const void * src,
+                                              size_t       dst_row_size,
+                                              size_t       src_row_size,
+                                              size_t       nrows) {
+    // rows are packed in DDR, so the transfer width is the destination (DDR) row size
+    return dma_queue_push(q, dst, src, dst_row_size, src_row_size, dst_row_size, nrows);
+}
+
 static inline uint8_t * dma_queue_pop(dma_queue * q) {
     if (q->push_idx == q->pop_idx) {
         return NULL;
diff --git a/ggml/src/ggml-hexagon/htp/hvx-utils.h b/ggml/src/ggml-hexagon/htp/hvx-utils.h
index 566048297d..d8f2130e31 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-utils.h
@@ -980,7 +980,7 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     int step_of_1 = num_elems >> 5;
     int remaining = num_elems - step_of_1 * VLEN_FP32;
 
-    assert(remaining == 0);
+
 
     const HVX_Vector * restrict v_src = (HVX_Vector *) src;
     HVX_Vector * restrict       v_dst = (HVX_Vector *) dst;
@@ -996,6 +996,15 @@ static inline void hvx_fast_sigmoid_f32(const uint8_t * restrict src, uint8_t *
     for (int i = 0; i < step_of_1; i++) {
         v_dst[i] = hvx_vec_fast_sigmoid_fp32_guard(v_src[i], one, max_exp, min_exp);
     }
+
+    if (remaining > 0) {
+        const float * srcf = ((const float *) src) + step_of_1 * VLEN_FP32;
+        float *       dstf = ((float *) dst) + step_of_1 * VLEN_FP32;
+
+        HVX_Vector in  = *(HVX_UVector *) srcf;
+        HVX_Vector out = hvx_vec_fast_sigmoid_fp32_guard(in, one, max_exp, min_exp);
+        hvx_vec_store_u((void *) dstf, remaining * SIZEOF_FP32, out);
+    }
 }
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index e30ae69502..c355adc8be 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -299,7 +299,7 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
     ctx->n_threads = n_hvx;
 
     for (int i = 0; i < ctx->n_threads; i++) {
-        ctx->dma[i] = dma_queue_create(HTP_SPAD_SRC0_NROWS * 2);
+        ctx->dma[i] = dma_queue_create(64);  // see discussion https://github.com/ggml-org/llama.cpp/pull/18151#discussion_r2632388541
     }
 
     // init worker pool
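The point of the two wrappers above is which row size ends up in the DMA descriptor's `roiwidth`: rows are packed in DDR, so the number of bytes actually copied per row is always the DDR-side row size, while the VTCM-side stride is the VLEN-aligned (padded) one. A minimal, self-contained model of that mapping is sketched below; `dma_desc_t`, `fill_desc()`, and the helper names are illustrative stand-ins, not the real descriptor or HTP API.

```c
// Model of how the two wrappers fill the descriptor's width and strides (illustrative only).
#include <stdio.h>
#include <stddef.h>

typedef struct {
    size_t roiwidth;   // bytes copied per row
    size_t roiheight;  // number of rows
    size_t srcstride;  // byte stride between source rows
    size_t dststride;  // byte stride between destination rows
} dma_desc_t;

static dma_desc_t fill_desc(size_t dst_row_size, size_t src_row_size, size_t width, size_t nrows) {
    dma_desc_t d = { width, nrows, src_row_size, dst_row_size };
    return d;
}

// DDR -> VTCM: rows are packed in DDR, so width == src_row_size
static dma_desc_t push_ddr_to_vtcm(size_t dst_row_size, size_t src_row_size, size_t nrows) {
    return fill_desc(dst_row_size, src_row_size, src_row_size, nrows);
}

// VTCM -> DDR: rows are packed in DDR, so width == dst_row_size
static dma_desc_t push_vtcm_to_ddr(size_t dst_row_size, size_t src_row_size, size_t nrows) {
    return fill_desc(dst_row_size, src_row_size, dst_row_size, nrows);
}

int main(void) {
    size_t row = 3000, row_aligned = 3072;  // e.g. a row padded up to a 128-byte VLEN multiple

    dma_desc_t in  = push_ddr_to_vtcm(row_aligned, row, 4);
    dma_desc_t out = push_vtcm_to_ddr(row, row_aligned, 4);

    printf("ddr->vtcm: width=%zu src_stride=%zu dst_stride=%zu\n", in.roiwidth, in.srcstride, in.dststride);
    printf("vtcm->ddr: width=%zu src_stride=%zu dst_stride=%zu\n", out.roiwidth, out.srcstride, out.dststride);
    return 0;
}
```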
diff --git a/ggml/src/ggml-hexagon/htp/matmul-ops.c b/ggml/src/ggml-hexagon/htp/matmul-ops.c
index 346f0bd339..d617da2cc4 100644
--- a/ggml/src/ggml-hexagon/htp/matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/matmul-ops.c
@@ -1115,7 +1115,7 @@ static void matmul(struct htp_matmul_type * mt,
         if (is0 >= HTP_SPAD_SRC0_NROWS) {
             break;
         }
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
                        src0_row_size_padded, src0_row_size, 2);
     }
 
@@ -1134,7 +1134,7 @@ static void matmul(struct htp_matmul_type * mt,
             const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
             const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
             if (pr0 < src0_end_row_x2) {
-                dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+                dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
                                src0_row_size_padded, src0_row_size, 2);
             }
         }
@@ -1143,7 +1143,7 @@ static void matmul(struct htp_matmul_type * mt,
     if (src0_end_row != src0_end_row_x2) {
         uint32_t  ir0 = src0_end_row_x2;
         const int is0 = (ir0 - src0_start_row);
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
                        src0_row_size_padded, src0_row_size, 1);
 
         const uint8_t * ss0 = dma_queue_pop(dma_queue);
@@ -1217,7 +1217,7 @@ static void matvec(struct htp_matmul_type * mt,
         if (is0 >= HTP_SPAD_SRC0_NROWS) {
             break;
         }
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
                        src0_row_size_padded, src0_row_size, 2);
     }
 
@@ -1230,7 +1230,7 @@ static void matvec(struct htp_matmul_type * mt,
             const uint32_t pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
             const uint32_t is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
             if (pr0 < src0_end_row_x2) {
-                dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+                dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
                                src0_row_size_padded, src0_row_size, 2);
             }
         }
@@ -1239,7 +1239,7 @@ static void matvec(struct htp_matmul_type * mt,
     if (src0_end_row != src0_end_row_x2) {
         const uint32_t ir0 = src0_end_row_x2;
         const uint32_t is0 = (ir0 - src0_start_row);
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
                        src0_row_size_padded, src0_row_size, 1);
         const uint8_t * ss0 = dma_queue_pop(dma_queue);
         mt->vec_dot(ne00, &tmp[ir0 - src0_start_row], ss0, src1_col);
@@ -1331,7 +1331,7 @@ static void matmul_id(struct htp_matmul_type * mt,
         if (is0 >= HTP_SPAD_SRC0_NROWS) {
             break;
         }
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
                        src0_row_size_padded, src0_row_size, 2);
     }
 
@@ -1356,7 +1356,7 @@ static void matmul_id(struct htp_matmul_type * mt,
             const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
             const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
             if (pr0 < src0_end_row_x2) {
-                dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+                dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
                                src0_row_size_padded, src0_row_size, 2);
             }
         }
@@ -1365,7 +1365,7 @@ static void matmul_id(struct htp_matmul_type * mt,
     if (src0_end_row != src0_end_row_x2) {
         uint32_t       ir0 = src0_end_row_x2;
         const uint32_t is0 = (ir0 - src0_start_row);
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
                        src0_row_size_padded, src0_row_size, 1);
 
         const uint8_t * ss0 = dma_queue_pop(dma_queue);
@@ -1455,7 +1455,7 @@ static void matvec_id(struct htp_matmul_type * mt,
         if (is0 >= HTP_SPAD_SRC0_NROWS) {
             break;
         }
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
                        src0_row_size_padded, src0_row_size, 2);
     }
 
@@ -1468,7 +1468,7 @@ static void matvec_id(struct htp_matmul_type * mt,
             const int pr0 = (ir0 + HTP_SPAD_SRC0_NROWS);
             const int is0 = (pr0 - src0_start_row) % HTP_SPAD_SRC0_NROWS;
             if (pr0 < src0_end_row_x2) {
-                dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
+                dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + pr0 * src0_row_size,
                                src0_row_size_padded, src0_row_size, 2);
             }
         }
@@ -1477,7 +1477,7 @@ static void matvec_id(struct htp_matmul_type * mt,
     if (src0_end_row != src0_end_row_x2) {
         uint32_t       ir0 = src0_end_row_x2;
         const uint32_t is0 = (ir0 - src0_start_row);
-        dma_queue_push(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
+        dma_queue_push_ddr_to_vtcm(dma_queue, spad_src0 + is0 * src0_row_size_padded, src0_row + ir0 * src0_row_size,
                        src0_row_size_padded, src0_row_size, 1);
         const uint8_t * ss0 = dma_queue_pop(dma_queue);
         mt->vec_dot(ne00, &dst_row[ir0], ss0, src1_col);
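To make the new per-thread VTCM sizing in `execute_op_activations_fp32` concrete, here is the arithmetic with purely illustrative numbers (8 MiB of VTCM, 4 threads, f32 rows of 4096 elements, no src1); these values are not taken from a real device configuration. The gelu path then splits each per-thread scratchpad into two ping-pong halves, so half of the resulting rows form one block.

```c
// Worked example of the per-thread VTCM split (illustrative numbers only).
#include <stdio.h>

int main(void) {
    size_t vtcm_size = 8u * 1024 * 1024;      // total VTCM reserved for the op
    size_t n_threads = 4;
    size_t row       = 4096 * sizeof(float);  // 16384 bytes, already a VLEN multiple
    size_t per_row   = row /*src0*/ + 0 /*src1 unused*/ + row /*dst*/;

    size_t rows_per_thread = vtcm_size / (n_threads * per_row);  // 8 MiB / (4 * 32 KiB) = 64
    printf("rows per thread: %zu (%zu per ping-pong half)\n", rows_per_thread, rows_per_thread / 2);
    return 0;
}
```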