From 160c71e22c7bbfcfff09a3528198839d626dd1f6 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Sun, 30 Nov 2025 22:16:00 -0800 Subject: [PATCH 01/31] Initial commit --- onnxruntime/core/mlas/lib/convolve.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 9518134631f2d..fab0f742a125b 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -985,7 +985,7 @@ Return Value: return; } -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) if (Algorithm == MlasConvAlgorithmDepthwise) { // Fill the Working Buffer with Zero for use by the depthwise kernel. @@ -1082,7 +1082,7 @@ Return Value: break; } -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) case MlasConvAlgorithmDepthwise: { @@ -1337,9 +1337,9 @@ Return Value: } else { -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) - // Scalar direct conv for depthwise convolution. + // Scalar (WASM_SCALAR) / vectorized (ARM64) direct conv for depthwise convolution. // Currently only support 3x3 kernel with padding <=1 and dilations = 1. // TODO: support more general depthwise convolution. @@ -1411,8 +1411,8 @@ Return Value: if (Parameters->BatchCount > 1 || Parameters->GroupCount > 1) { - size_t WorkingBufferSizePerThread = std::max({Parameters->OutputSize * Parameters->K, - Parameters->FilterCount * Parameters->OutputSize, + size_t WorkingBufferSizePerThread = std::max({Parameters->OutputSize * Parameters->K, + Parameters->FilterCount * Parameters->OutputSize, static_cast(MLAS_CONV_WORKING_BUFFER_SIZE_PER_THREAD)}); TargetThreadCount = MaximumThreadCount; if (static_cast(TargetThreadCount) >= Parameters->BatchCount * Parameters->GroupCount) { From a134ea00ff98ff2a3c425f1f880cec201beeeb7a Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Sun, 30 Nov 2025 22:24:59 -0800 Subject: [PATCH 02/31] More changes --- onnxruntime/core/mlas/inc/mlas.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 9d98a15d8457a..edf2bb7582a90 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -830,7 +830,7 @@ enum MLAS_CONV_ALGORITHM { MlasConvAlgorithmGemmDirect, MlasConvAlgorithmExpandThenGemm, MlasConvAlgorithmExpandThenGemmSegmented, -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) MlasConvAlgorithmDepthwise, #endif }; From a44f7085ea60d3b79d4de4f762387722f060ad92 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 02:06:52 -0800 Subject: [PATCH 03/31] More changes --- cmake/onnxruntime_mlas.cmake | 2 +- .../core/mlas/lib/sconv_kernel_neon.cpp | 180 +++++++++++++++++- 2 files changed, 180 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index c0ab948b41fff..e113a9f0cc81c 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -307,7 +307,6 @@ endfunction() function (setup_arm_neon_nchwc) target_sources(onnxruntime_mlas PRIVATE ${MLAS_SRC_DIR}/sconv.h - ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp ${MLAS_SRC_DIR}/spool_kernel_neon.cpp ) list(APPEND mlas_private_compile_definitions MLAS_USE_ARM_NEON_NCHWC) @@ -460,6 +459,7 @@ else() ${MLAS_SRC_DIR}/eltwise_kernel_neon.h 
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp + ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp ) # Conditionally add the SVE implementation if compiler supports it diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp index 4c5f50adb929c..3909495f24743 100644 --- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp @@ -14,9 +14,187 @@ Module Name: --*/ -#if defined(MLAS_USE_ARM_NEON_NCHWC) #include "mlasi.h" + +static +void +MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute convolution on one channel input with one filter channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. + + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +--*/ +{ + const size_t W = Parameters->InputShape[1]; + const float beta = Parameters->Beta; + + if (W > 1) { + + const float w00 = Filter[0]; + const float w01 = Filter[1]; + const float w02 = Filter[2]; + const float w10 = Filter[3]; + const float w11 = Filter[4]; + const float w12 = Filter[5]; + const float w20 = Filter[6]; + const float w21 = Filter[7]; + const float w22 = Filter[8]; + + const size_t H = Parameters->InputShape[0]; + const size_t pad_top = Parameters->Padding[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t stride_h = Parameters->StrideShape[0]; + const size_t stride_w = Parameters->StrideShape[1]; + + // We treat pad_left, pad_top are hard require. + // While pad_right and pad_bottom could be adjusted if they do not 100% match other parameters. + const size_t pad_right = (((Parameters->OutputShape[1] - 1) * stride_w + 3) > (pad_left + W)) ? 1 : 0; + + const float* row0 = (pad_top > 0) ? Zeros : (Input - pad_left); + // Need to handle effective pad_bottom is 2 when H == 1 + const float* row1 = (H + pad_top <= 1) ? Zeros : (Input + (1 - pad_top) * W) - pad_left; + const float* row2 = (H + pad_top <= 2) ? Zeros : (row1 + W); + + for (size_t h = 0, out_row = Parameters->OutputShape[0]; out_row > 0; --out_row) { + auto out_col = Parameters->OutputShape[1]; + + if (pad_left == 1) { + float dotsum = w01 * row0[1] + w02 * row0[2] + w11 * row1[1] + w12 * row1[2] + + w21 * row2[1] + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + out_col--; + row0 += stride_w; + row1 += stride_w; + row2 += stride_w; + } + + for (; out_col > pad_right; out_col--) { + float dotsum = w00 * row0[0] + w01 * row0[1] + w02 * row0[2] + w10 * row1[0] + + w11 * row1[1] + w12 * row1[2] + w20 * row2[0] + w21 * row2[1] + + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + row0 += stride_w; + row1 += stride_w; + row2 += stride_w; + } + + if (out_col == 1) { // pad_right == 1 + float dotsum = w00 * row0[0] + w01 * row0[1] + w10 * row1[0] + w11 * row1[1] + + w20 * row2[0] + w21 * row2[1] + (beta == 0.f ? 
0.f : *Output * beta); + *Output++ = dotsum; + } + + h += stride_h; + row0 = (Input + (h - pad_top) * W) - pad_left; + row1 = row0 + W; + row2 = (h + 2 >= H + pad_top) ? Zeros : (row1 + W); + } + + } else { // W == 1 + + const size_t H = Parameters->InputShape[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t pad_top = Parameters->Padding[0]; + const size_t stride_h = Parameters->StrideShape[0]; + size_t out_row = Parameters->OutputShape[0]; + + // Make sure pad_bottom is consistent with other parameters. + size_t pad_bottom = ((out_row - 1) * stride_h + 3) > (pad_top + H) ? + ((out_row - 1) * stride_h + 3) - (pad_top + H) : 0; + + const float w0 = Filter[pad_left ? 1 : 0]; + const float w1 = Filter[pad_left ? 4 : 3]; + const float w2 = Filter[pad_left ? 7 : 6]; + auto init_v = (beta == 0.f ? 0.f : *Output * beta); + + if (pad_top == 1) { + *Output++ = w1 * Input[0] + w2 * ((H + pad_top <= 2) ? 0.0f : Input[1]) + init_v; + out_row--; + } + + for (const float* row = Input + pad_top * stride_h - pad_top; out_row > pad_bottom; --out_row) { + // All pixels are in the input col + auto init = (beta == 0.f ? 0.f : *Output * beta); + *Output++ = w0 * row[0] + w1 * row[1] + w2 * row[2] + init; + row += stride_h; + } + + if (out_row > 0) { + // last 1 or 2 rows are from the padding zero row. + // out_row == 1 when arrive here + if (pad_bottom == 1) { + const float* row = Input + H - 2; + *Output++ = w0 * row[0] + w1 * row[1] + init_v; + } else { // pad_bottom == 2 and H == 1 and padding_top == 0 + *Output++ = w0 * Input[0] + init_v; + } + } + } + +} + + +void +MlasConvDepthwiseFloat_CHW( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute depthwise convolution for one filter channel on one input channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. + + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +Note: + No checking here as it is inner loop. Logic in generating Parameters controls the check. + + Currently only support 2d kernel 3x3. + Will add general case and more special case if needed later. 
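+
+    As a worked sketch of the intended contract (hypothetical shapes, not
+    taken from this change): a 1 x 1 x 5 x 5 NCHW input with a 3 x 3 filter,
+    pads = {1, 1, 1, 1}, strides = {1, 1} and dilations = {1, 1} yields a
+    5 x 5 output. The prepare step sizes the working buffer as
+    InputShape[1] + 2 zero floats, and this kernel substitutes that buffer
+    for the input rows that the padding places out of range.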
+ +--*/ +{ + MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros); +} + +#if defined(MLAS_USE_ARM_NEON_NCHWC) +// Everything below these are NCHWC related kernels + #include "sconv.h" constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE; From 3a3ccf76783e1b301ead69db91d6a61af80fcc8e Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 07:38:57 -0800 Subject: [PATCH 04/31] Fix builds --- cmake/onnxruntime_mlas.cmake | 1 + onnxruntime/core/mlas/inc/mlas.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index e113a9f0cc81c..0ba48c2b094c5 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -113,6 +113,7 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp + ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp ) set(mlas_platform_preprocess_srcs diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index edf2bb7582a90..f59ac92e1a9c2 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -830,7 +830,7 @@ enum MLAS_CONV_ALGORITHM { MlasConvAlgorithmGemmDirect, MlasConvAlgorithmExpandThenGemm, MlasConvAlgorithmExpandThenGemmSegmented, -#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined (MLAS_TARGET_ARM64) MlasConvAlgorithmDepthwise, #endif }; From 212dbf1e9f89b533654f06660cefd5b3db312b1e Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 07:42:49 -0800 Subject: [PATCH 05/31] Fix builds 2 --- onnxruntime/core/mlas/lib/mlasi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index ad62cccbfb9c7..386570454b2fd 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -1601,7 +1601,8 @@ MlasFp32FromBits( #pragma warning(pop) #endif -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) + void MLASCALL From fceae09b1ac6e820c567b89d35478974b137cbcb Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 08:27:52 -0800 Subject: [PATCH 06/31] Threaded --- onnxruntime/core/mlas/lib/convolve.cpp | 105 +++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index fab0f742a125b..f18632829e662 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -805,6 +805,86 @@ Return Value: } } +void +MlasDepthwiseThreaded( + void* Context, + ptrdiff_t Index +) + +/*++ + +Routine Description: + + This routine is invoked from a worker thread to execute a segment of a + convolution operation. + + If using this, the entire convolution operation is parallelized on the + (batch size * group count) parameter and this routine has logic to + perform a specific thread's shard of the entire Convolution operation. + +Arguments: + + Context - Supplies the pointer to the context for the threaded operation. + + Index - Supplies the current index of the threaded operation. + +Return Value: + + None. 
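+
+    As a sketch of the partitioning arithmetic (hypothetical values): with
+    BatchCount * GroupCount = 10 work items and TargetThreadCount = 4, the
+    base share is 10 / 4 = 2 with remainder 10 % 4 = 2, so threads 0..3
+    cover the ranges [0, 3), [3, 6), [6, 8) and [8, 10) respectively.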
+ +--*/ + +{ + + MLAS_CONV_WORK_BLOCK* WorkBlock = (MLAS_CONV_WORK_BLOCK*)Context; + + const MLAS_CONV_PARAMETERS* Parameters = WorkBlock->Parameters; + + const size_t GroupCount = Parameters->GroupCount; + const size_t BatchGroupCount = Parameters->BatchCount * GroupCount; + + const size_t TargetThreadCount = WorkBlock->TargetThreadCount; + + const size_t BatchGroupCountPerThread = BatchGroupCount / TargetThreadCount; + const size_t BatchGroupCountExtra = BatchGroupCount % TargetThreadCount; + + size_t BatchGroupStart; + size_t BatchGroupEnd; + + if (static_cast(Index) < BatchGroupCountExtra) { + BatchGroupStart = (BatchGroupCountPerThread + 1) * Index; + BatchGroupEnd = BatchGroupStart + BatchGroupCountPerThread + 1; + } else { + BatchGroupStart = BatchGroupCountPerThread * Index + BatchGroupCountExtra; + BatchGroupEnd = BatchGroupStart + BatchGroupCountPerThread; + } + + const size_t FilterCount = Parameters->FilterCount; + const size_t OutputSize = Parameters->OutputSize; + const size_t K = Parameters->K; + + const size_t InputGroupSize = Parameters->InputChannels * Parameters->InputSize; + const size_t OutputGroupSize = FilterCount * OutputSize; + const size_t FilterGroupSize = FilterCount * K; + + for (size_t bg = BatchGroupStart; bg < BatchGroupEnd; bg++) { + size_t group = bg % GroupCount; + + const float* input = WorkBlock->Input + bg * InputGroupSize; + const float* filter = WorkBlock->Filter + group * FilterGroupSize; + float* output = WorkBlock->Output + bg * OutputGroupSize; + const float* bias = WorkBlock->Bias; + if (bias != nullptr) { + bias += group * FilterCount; + } + + float* WorkingBuffer = WorkBlock->WorkingBuffer; + + MlasConvDepthwiseFloat_CHW(Parameters, input, filter, output, WorkingBuffer); + MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); + } +} + inline bool MlasConvTryMultithread( @@ -1019,6 +1099,31 @@ Return Value: return; } + if (Algorithm == MlasConvAlgorithmDepthwise && ((BatchCount > 1) || (GroupCount > 1))) { + const size_t BatchGroupCount = BatchCount * GroupCount; + + ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool); + + + if (static_cast(TargetThreadCount) >= BatchGroupCount) { + TargetThreadCount = static_cast(BatchGroupCount); + } + + MLAS_CONV_WORK_BLOCK WorkBlock; + + WorkBlock.Parameters = Parameters; + WorkBlock.Input = Input; + WorkBlock.Filter = Filter; + WorkBlock.Bias = Bias; + WorkBlock.WorkingBuffer = WorkingBuffer; + WorkBlock.Output = Output; + WorkBlock.TargetThreadCount = TargetThreadCount; + + MlasExecuteThreaded(MlasDepthwiseThreaded, &WorkBlock, TargetThreadCount, ThreadPool); + + return; + } + // // Iterate over each batch and group. 
// From 3793d70f9afd953c7b8814d782f7a4968de2f938 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 08:42:38 -0800 Subject: [PATCH 07/31] Fix x64 builds --- onnxruntime/core/mlas/lib/convolve.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index f18632829e662..24c4f9ef8be5d 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -805,6 +805,8 @@ Return Value: } } +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) + void MlasDepthwiseThreaded( void* Context, @@ -885,6 +887,8 @@ Return Value: } } +#endif + inline bool MlasConvTryMultithread( @@ -1099,6 +1103,9 @@ Return Value: return; } + +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) + if (Algorithm == MlasConvAlgorithmDepthwise && ((BatchCount > 1) || (GroupCount > 1))) { const size_t BatchGroupCount = BatchCount * GroupCount; @@ -1124,6 +1131,8 @@ Return Value: return; } +#endif + // // Iterate over each batch and group. // From 481a7f668ca11853a126d22aa495e03a8601b5ab Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 18:07:41 -0800 Subject: [PATCH 08/31] Experiment --- onnxruntime/core/mlas/lib/convolve.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 24c4f9ef8be5d..c9916f9ef580b 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -883,7 +883,7 @@ Return Value: float* WorkingBuffer = WorkBlock->WorkingBuffer; MlasConvDepthwiseFloat_CHW(Parameters, input, filter, output, WorkingBuffer); - MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); + //MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); } } From 8993a0a0c0bcfccc4b24dd77322d699828e92549 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 19:11:29 -0800 Subject: [PATCH 09/31] Experiment revert --- onnxruntime/core/mlas/lib/convolve.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index c9916f9ef580b..24c4f9ef8be5d 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -883,7 +883,7 @@ Return Value: float* WorkingBuffer = WorkBlock->WorkingBuffer; MlasConvDepthwiseFloat_CHW(Parameters, input, filter, output, WorkingBuffer); - //MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); + MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); } } From d765c1a145b1a9b4922c468179dbb79e5ba4c0e0 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Tue, 2 Dec 2025 07:28:42 -0800 Subject: [PATCH 10/31] Refactor --- cmake/onnxruntime_mlas.cmake | 9 +- .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 194 ++++++++++++++++++ ...l_neon.cpp => sconv_nchwc_kernel_neon.cpp} | 185 +---------------- .../{sconv.h => sconv_nchwc_kernel_neon.h} | 0 ...l_neon.cpp => spool_nchwc_kernel_neon.cpp} | 0 5 files changed, 202 insertions(+), 186 deletions(-) create mode 100644 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp rename onnxruntime/core/mlas/lib/{sconv_kernel_neon.cpp => sconv_nchwc_kernel_neon.cpp} (77%) rename onnxruntime/core/mlas/lib/{sconv.h => sconv_nchwc_kernel_neon.h} (100%) rename onnxruntime/core/mlas/lib/{spool_kernel_neon.cpp => 
spool_nchwc_kernel_neon.cpp} (100%) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 0ba48c2b094c5..11c32dac1df04 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -113,7 +113,7 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp - ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp + ${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp ) set(mlas_platform_preprocess_srcs @@ -307,8 +307,9 @@ endfunction() function (setup_arm_neon_nchwc) target_sources(onnxruntime_mlas PRIVATE - ${MLAS_SRC_DIR}/sconv.h - ${MLAS_SRC_DIR}/spool_kernel_neon.cpp + ${MLAS_SRC_DIR}/sconv_nchwc_kernel_neon.h + ${MLAS_SRC_DIR}/sconv_nchwc_kernel_neon.cpp + ${MLAS_SRC_DIR}/spool_nchwc_kernel_neon.cpp ) list(APPEND mlas_private_compile_definitions MLAS_USE_ARM_NEON_NCHWC) set(mlas_private_compile_definitions ${mlas_private_compile_definitions} PARENT_SCOPE) @@ -460,7 +461,7 @@ else() ${MLAS_SRC_DIR}/eltwise_kernel_neon.h ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp - ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp + ${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp ) # Conditionally add the SVE implementation if compiler supports it diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp new file mode 100644 index 0000000000000..906d54fc2d4c5 --- /dev/null +++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp @@ -0,0 +1,194 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + sconv_nchw_kernel_neon.cpp + +Abstract: + + This module implements the single precision NCHW convolution kernels for ARM NEON. + +--*/ + + +#include "mlasi.h" + +static +void +MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute convolution on one channel input with one filter channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. + + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +--*/ +{ + const size_t W = Parameters->InputShape[1]; + const float beta = Parameters->Beta; + + if (W > 1) { + + const float w00 = Filter[0]; + const float w01 = Filter[1]; + const float w02 = Filter[2]; + const float w10 = Filter[3]; + const float w11 = Filter[4]; + const float w12 = Filter[5]; + const float w20 = Filter[6]; + const float w21 = Filter[7]; + const float w22 = Filter[8]; + + const size_t H = Parameters->InputShape[0]; + const size_t pad_top = Parameters->Padding[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t stride_h = Parameters->StrideShape[0]; + const size_t stride_w = Parameters->StrideShape[1]; + + // We treat pad_left, pad_top are hard require. + // While pad_right and pad_bottom could be adjusted if they do not 100% match other parameters. 
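+        // Derivation (sketch): with kernel width 3 and dilation 1 the output
+        // width satisfies
+        //     OutputShape[1] = (pad_left + W + pad_right - 3) / stride_w + 1,
+        // so the rightmost tap falls at (OutputShape[1] - 1) * stride_w + 3
+        // measured from the padded row start; whenever that exceeds
+        // pad_left + W, exactly one implicit zero column is needed on the
+        // right (pad_right is at most 1 under the dispatch checks).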
+ const size_t pad_right = (((Parameters->OutputShape[1] - 1) * stride_w + 3) > (pad_left + W)) ? 1 : 0; + + const float* row0 = (pad_top > 0) ? Zeros : (Input - pad_left); + // Need to handle effective pad_bottom is 2 when H == 1 + const float* row1 = (H + pad_top <= 1) ? Zeros : (Input + (1 - pad_top) * W) - pad_left; + const float* row2 = (H + pad_top <= 2) ? Zeros : (row1 + W); + + for (size_t h = 0, out_row = Parameters->OutputShape[0]; out_row > 0; --out_row) { + auto out_col = Parameters->OutputShape[1]; + + if (pad_left == 1) { + float dotsum = w01 * row0[1] + w02 * row0[2] + w11 * row1[1] + w12 * row1[2] + + w21 * row2[1] + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + out_col--; + row0 += stride_w; + row1 += stride_w; + row2 += stride_w; + } + + for (; out_col > pad_right; out_col--) { + float dotsum = w00 * row0[0] + w01 * row0[1] + w02 * row0[2] + w10 * row1[0] + + w11 * row1[1] + w12 * row1[2] + w20 * row2[0] + w21 * row2[1] + + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + row0 += stride_w; + row1 += stride_w; + row2 += stride_w; + } + + if (out_col == 1) { // pad_right == 1 + float dotsum = w00 * row0[0] + w01 * row0[1] + w10 * row1[0] + w11 * row1[1] + + w20 * row2[0] + w21 * row2[1] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + } + + h += stride_h; + row0 = (Input + (h - pad_top) * W) - pad_left; + row1 = row0 + W; + row2 = (h + 2 >= H + pad_top) ? Zeros : (row1 + W); + } + + } else { // W == 1 + + const size_t H = Parameters->InputShape[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t pad_top = Parameters->Padding[0]; + const size_t stride_h = Parameters->StrideShape[0]; + size_t out_row = Parameters->OutputShape[0]; + + // Make sure pad_bottom is consistent with other parameters. + size_t pad_bottom = ((out_row - 1) * stride_h + 3) > (pad_top + H) ? + ((out_row - 1) * stride_h + 3) - (pad_top + H) : 0; + + const float w0 = Filter[pad_left ? 1 : 0]; + const float w1 = Filter[pad_left ? 4 : 3]; + const float w2 = Filter[pad_left ? 7 : 6]; + auto init_v = (beta == 0.f ? 0.f : *Output * beta); + + if (pad_top == 1) { + *Output++ = w1 * Input[0] + w2 * ((H + pad_top <= 2) ? 0.0f : Input[1]) + init_v; + out_row--; + } + + for (const float* row = Input + pad_top * stride_h - pad_top; out_row > pad_bottom; --out_row) { + // All pixels are in the input col + auto init = (beta == 0.f ? 0.f : *Output * beta); + *Output++ = w0 * row[0] + w1 * row[1] + w2 * row[2] + init; + row += stride_h; + } + + if (out_row > 0) { + // last 1 or 2 rows are from the padding zero row. + // out_row == 1 when arrive here + if (pad_bottom == 1) { + const float* row = Input + H - 2; + *Output++ = w0 * row[0] + w1 * row[1] + init_v; + } else { // pad_bottom == 2 and H == 1 and padding_top == 0 + *Output++ = w0 * Input[0] + init_v; + } + } + } + +} + + +void +MlasConvDepthwiseFloat_CHW( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute depthwise convolution for one filter channel on one input channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. 
+ + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +Note: + No checking here as it is inner loop. Logic in generating Parameters controls the check. + + Currently only support 2d kernel 3x3. + Will add general case and more special case if needed later. + +--*/ +{ + MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros); +} + diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp similarity index 77% rename from onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp rename to onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp index 3909495f24743..6f0e6c07eb8c0 100644 --- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp @@ -6,196 +6,17 @@ Licensed under the MIT License. Module Name: - sconv_kernel_neon.cpp + sconv_nchwc_kernel_neon.cpp Abstract: - This module implements the single precision convolution kernels for ARM NEON. + This module implements the single precision NCHWC convolution kernels for ARM NEON. --*/ - -#include "mlasi.h" - -static -void -MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( - const MLAS_CONV_PARAMETERS* Parameters, - const float* Input, - const float* Filter, - float* Output, - const float* Zeros - ) -/*++ - -Routine Description: - - This routine is an inner kernel to compute convolution on one channel input with one filter channel. - -Arguments: - - Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. - - Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. - - Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. - - Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. - - Zeroes - Point to working buffer where all 0.0f are filled. - ---*/ -{ - const size_t W = Parameters->InputShape[1]; - const float beta = Parameters->Beta; - - if (W > 1) { - - const float w00 = Filter[0]; - const float w01 = Filter[1]; - const float w02 = Filter[2]; - const float w10 = Filter[3]; - const float w11 = Filter[4]; - const float w12 = Filter[5]; - const float w20 = Filter[6]; - const float w21 = Filter[7]; - const float w22 = Filter[8]; - - const size_t H = Parameters->InputShape[0]; - const size_t pad_top = Parameters->Padding[0]; - const size_t pad_left = Parameters->Padding[1]; - const size_t stride_h = Parameters->StrideShape[0]; - const size_t stride_w = Parameters->StrideShape[1]; - - // We treat pad_left, pad_top are hard require. - // While pad_right and pad_bottom could be adjusted if they do not 100% match other parameters. - const size_t pad_right = (((Parameters->OutputShape[1] - 1) * stride_w + 3) > (pad_left + W)) ? 1 : 0; - - const float* row0 = (pad_top > 0) ? Zeros : (Input - pad_left); - // Need to handle effective pad_bottom is 2 when H == 1 - const float* row1 = (H + pad_top <= 1) ? Zeros : (Input + (1 - pad_top) * W) - pad_left; - const float* row2 = (H + pad_top <= 2) ? Zeros : (row1 + W); - - for (size_t h = 0, out_row = Parameters->OutputShape[0]; out_row > 0; --out_row) { - auto out_col = Parameters->OutputShape[1]; - - if (pad_left == 1) { - float dotsum = w01 * row0[1] + w02 * row0[2] + w11 * row1[1] + w12 * row1[2] + - w21 * row2[1] + w22 * row2[2] + (beta == 0.f ? 
0.f : *Output * beta); - *Output++ = dotsum; - out_col--; - row0 += stride_w; - row1 += stride_w; - row2 += stride_w; - } - - for (; out_col > pad_right; out_col--) { - float dotsum = w00 * row0[0] + w01 * row0[1] + w02 * row0[2] + w10 * row1[0] + - w11 * row1[1] + w12 * row1[2] + w20 * row2[0] + w21 * row2[1] + - w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - row0 += stride_w; - row1 += stride_w; - row2 += stride_w; - } - - if (out_col == 1) { // pad_right == 1 - float dotsum = w00 * row0[0] + w01 * row0[1] + w10 * row1[0] + w11 * row1[1] + - w20 * row2[0] + w21 * row2[1] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - } - - h += stride_h; - row0 = (Input + (h - pad_top) * W) - pad_left; - row1 = row0 + W; - row2 = (h + 2 >= H + pad_top) ? Zeros : (row1 + W); - } - - } else { // W == 1 - - const size_t H = Parameters->InputShape[0]; - const size_t pad_left = Parameters->Padding[1]; - const size_t pad_top = Parameters->Padding[0]; - const size_t stride_h = Parameters->StrideShape[0]; - size_t out_row = Parameters->OutputShape[0]; - - // Make sure pad_bottom is consistent with other parameters. - size_t pad_bottom = ((out_row - 1) * stride_h + 3) > (pad_top + H) ? - ((out_row - 1) * stride_h + 3) - (pad_top + H) : 0; - - const float w0 = Filter[pad_left ? 1 : 0]; - const float w1 = Filter[pad_left ? 4 : 3]; - const float w2 = Filter[pad_left ? 7 : 6]; - auto init_v = (beta == 0.f ? 0.f : *Output * beta); - - if (pad_top == 1) { - *Output++ = w1 * Input[0] + w2 * ((H + pad_top <= 2) ? 0.0f : Input[1]) + init_v; - out_row--; - } - - for (const float* row = Input + pad_top * stride_h - pad_top; out_row > pad_bottom; --out_row) { - // All pixels are in the input col - auto init = (beta == 0.f ? 0.f : *Output * beta); - *Output++ = w0 * row[0] + w1 * row[1] + w2 * row[2] + init; - row += stride_h; - } - - if (out_row > 0) { - // last 1 or 2 rows are from the padding zero row. - // out_row == 1 when arrive here - if (pad_bottom == 1) { - const float* row = Input + H - 2; - *Output++ = w0 * row[0] + w1 * row[1] + init_v; - } else { // pad_bottom == 2 and H == 1 and padding_top == 0 - *Output++ = w0 * Input[0] + init_v; - } - } - } - -} - - -void -MlasConvDepthwiseFloat_CHW( - const MLAS_CONV_PARAMETERS* Parameters, - const float* Input, - const float* Filter, - float* Output, - const float* Zeros - ) -/*++ - -Routine Description: - - This routine is an inner kernel to compute depthwise convolution for one filter channel on one input channel. - -Arguments: - - Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. - - Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. - - Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. - - Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. - - Zeroes - Point to working buffer where all 0.0f are filled. - -Note: - No checking here as it is inner loop. Logic in generating Parameters controls the check. - - Currently only support 2d kernel 3x3. - Will add general case and more special case if needed later. 
- ---*/ -{ - MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros); -} - #if defined(MLAS_USE_ARM_NEON_NCHWC) -// Everything below these are NCHWC related kernels -#include "sconv.h" +#include "sconv_nchwc_kernel_neon.h" constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE; diff --git a/onnxruntime/core/mlas/lib/sconv.h b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.h similarity index 100% rename from onnxruntime/core/mlas/lib/sconv.h rename to onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.h diff --git a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp b/onnxruntime/core/mlas/lib/spool_nchwc_kernel_neon.cpp similarity index 100% rename from onnxruntime/core/mlas/lib/spool_kernel_neon.cpp rename to onnxruntime/core/mlas/lib/spool_nchwc_kernel_neon.cpp From a428d509675c34f7aa0156a5d4dd281f35c35723 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Tue, 2 Dec 2025 07:30:51 -0800 Subject: [PATCH 11/31] More changes --- onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp index 6f0e6c07eb8c0..a387c6a07992a 100644 --- a/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp @@ -16,6 +16,7 @@ Module Name: #if defined(MLAS_USE_ARM_NEON_NCHWC) +#include "mlasi.h" #include "sconv_nchwc_kernel_neon.h" constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE; From d53dd15b58ca18afdb5f14f7ba04c18a257a1d12 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Tue, 2 Dec 2025 21:33:21 -0800 Subject: [PATCH 12/31] a --- onnxruntime/core/mlas/lib/convolve.cpp | 4 +- .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 363 ++++++++++++------ 2 files changed, 257 insertions(+), 110 deletions(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 24c4f9ef8be5d..098b800b81353 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1460,8 +1460,8 @@ Return Value: if (Dimensions == 2 && Parameters->FilterCount == 1 && Parameters->InputChannels == 1 && Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3 - && Parameters->Padding[0] <= 1 && Parameters->Padding[1] <= 1 - && Parameters->Padding[2] <= 1 && Parameters->Padding[3] <= 1 + && Parameters->Padding[0] <= 0 && Parameters->Padding[1] <= 0 + && Parameters->Padding[2] <= 0 && Parameters->Padding[3] <= 0 && Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) { *WorkingBufferSize = Parameters->InputShape[1] + 2; diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp index 906d54fc2d4c5..c559399cdcfd3 100644 --- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp @@ -16,10 +16,251 @@ Module Name: #include "mlasi.h" +#include -static -void -MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( +#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) +#define C10_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) +#else +#define C10_LIKELY(expr) (expr) +#endif + + +#if defined(_MSC_VER) && defined(MLAS_TARGET_ARM64) +MLAS_FORCEINLINE float32x4_t operator+(float32x4_t lhs, float32x4_t rhs) +{ + return vaddq_f32(lhs, rhs); +} + +MLAS_FORCEINLINE float32x4_t operator-(float32x4_t lhs, float32x4_t rhs) +{ + return 
vsubq_f32(lhs, rhs); +} + +MLAS_FORCEINLINE float32x4_t operator*(float32x4_t lhs, float32x4_t rhs) +{ + return vmulq_f32(lhs, rhs); +} + +MLAS_FORCEINLINE float32x4_t operator-(float32x4_t value) +{ + return vnegq_f32(value); +} +#endif + +struct Arguments final { + // Input layer dimensions + int64_t in_rows; + int64_t in_cols; + + // Output layer dimensions + int64_t out_rows; + int64_t out_cols; + + // Padding info + int64_t pad_rows; + int64_t pad_cols; +}; + +inline void winograd_f2k3_input_transform_inplace__neon( + float32x4_t* const d0, + float32x4_t* const d1, + float32x4_t* const d2, + float32x4_t* const d3 +) +{ + const float32x4_t wd0 = *d0 - *d2; + const float32x4_t wd1 = *d1 + *d2; + const float32x4_t wd2 = -*d1 + *d2; + const float32x4_t wd3 = *d1 - *d3; + *d0 = wd0; + *d1 = wd1; + *d2 = wd2; + *d3 = wd3; +} + +inline void winograd_f2k3_output_transform_inplace__neon( + float32x4_t* const m0, + float32x4_t* const m1, + const float32x4_t* const m2, + const float32x4_t* const m3 +) +{ + *m0 = *m0 + *m1 + *m2; + *m1 = *m1 - *m2 - *m3; +} + +inline float32x4_t vmuladdq_f32(const float32x4_t c, const float32x4_t a, const float32x4_t b) +{ + return vfmaq_f32(c, a, b); + // TODO: Support ARMv7 +} + +inline float32x4_t vmulsubq_f32(const float32x4_t c, const float32x4_t a, const float32x4_t b) +{ + return vfmsq_f32(c, a, b); + // TODO: Support ARMv7 +} + +inline void winograd_f2k3_kernel_transform__neon( + const float32x4_t g0, + const float32x4_t g1, + const float32x4_t g2, + float32x4_t* const transform0, + float32x4_t* const transform1, + float32x4_t* const transform2, + float32x4_t* const transform3 +) +{ + const float32x4_t const_half = vdupq_n_f32(0.5f); + float32x4_t half_g0_plus_g2 = const_half * (g0 + g2); + *transform0 = g0; + *transform1 = vmuladdq_f32(half_g0_plus_g2, const_half, g1); + *transform2 = vmulsubq_f32(half_g0_plus_g2, const_half, g1); + *transform3 = g2; +} + +inline float32x4x4_t v4f_transpose4x4__neon(const float32x4x4_t m) +{ + float32x4x4_t ret; + vst4q_f32((float*)(&ret), m); + return ret; +} + +void convolution_depthwise3x3_winograd_impl( + const Arguments& args, + const float* const input, + const float* const kernel, + float* const output +) +{ + //const float32x4_t vbias = vsetq_lane_f32(*bias, vdupq_n_f32(0.0), 1); + float32x4x4_t kernel_tile; + + { + const float32x4_t g0 = vld1q_f32(kernel); + const float32x4_t g1 = vld1q_f32(kernel + 3); + // g2[3] is junk + const float32x4_t g2 = + vextq_f32(vld1q_f32(kernel + 5), vld1q_f32(kernel + 5), 1); + float32x4x4_t w; + winograd_f2k3_kernel_transform__neon( + g0, g1, g2, &w.val[0], &w.val[1], &w.val[2], &w.val[3] + ); + w = v4f_transpose4x4__neon(w); + + winograd_f2k3_kernel_transform__neon( + w.val[0], + w.val[1], + w.val[2], + &kernel_tile.val[0], + &kernel_tile.val[1], + &kernel_tile.val[2], + &kernel_tile.val[3] + ); + } + + #define TILE \ + winograd_f2k3_input_transform_inplace__neon( \ + &input_tile.val[0], \ + &input_tile.val[1], \ + &input_tile.val[2], \ + &input_tile.val[3] \ + ); \ + input_tile = v4f_transpose4x4__neon(input_tile); \ + winograd_f2k3_input_transform_inplace__neon( \ + &input_tile.val[0], \ + &input_tile.val[1], \ + &input_tile.val[2], \ + &input_tile.val[3] \ + ); \ + \ + for (size_t row = 0; row < 4; ++row) { \ + input_tile.val[row] = \ + vmulq_f32(input_tile.val[row], kernel_tile.val[row]); \ + } \ + \ + winograd_f2k3_output_transform_inplace__neon( \ + &input_tile.val[0], \ + &input_tile.val[1], \ + &input_tile.val[2], \ + &input_tile.val[3] \ + ); \ + input_tile = 
v4f_transpose4x4__neon(input_tile); \ + winograd_f2k3_output_transform_inplace__neon( \ + &input_tile.val[0], \ + &input_tile.val[1], \ + &input_tile.val[2], \ + &input_tile.val[3] \ + ) + + // Non-padded regime. + + // Iterate over non-padded output tiles. + // TODO: avoid spilling W by breaking out the non-padded vs padded case. + for (int64_t oth = 0; oth < (args.out_rows + 1) / 2; ++oth) { + for (int64_t otw = 0; otw < (args.out_cols + 1) / 2; ++otw) { + // load input tile for [oth, otw]; + int64_t ih = oth * 2 - args.pad_rows; + int64_t iw = otw * 2 - args.pad_cols; + // fast-path, all accesses in-bounds + if (C10_LIKELY( + ih >= 0 && iw >= 0 && ih + 3 < args.in_rows && + iw + 3 < args.in_cols && 2 * oth + 1 < args.out_rows && + 2 * otw + 1 < args.out_cols + )) { + float32x4x4_t input_tile; + for (int64_t row = 0; row < 4; ++row) { + input_tile.val[row] = + vld1q_f32(input + (ih + row) * args.in_cols + iw); + } + + TILE; + + for (int64_t row = 0; row < 2; ++row) { + vst1_f32( + output + (oth * 2 + row) * args.out_cols + otw * 2, + vget_low_f32(input_tile.val[row]) + ); + } + } else { + float block[4][4]; + for (int64_t row = 0; row < 4; ++row) { + for (int64_t col = 0; col < 4; ++col) { + if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows && + iw + col < args.in_cols) { + block[row][col] = input[(ih + row) * args.in_cols + iw + col]; + } else { + block[row][col] = 0.0; + } + } + } + + float32x4x4_t input_tile; + for (int64_t row = 0; row < 4; ++row) { + input_tile.val[row] = vld1q_f32(&block[row][0]); + } + + TILE; + + float oblock[2][2]; + for (int64_t row = 0; row < 2; ++row) { + vst1_f32(&oblock[row][0], vget_low_f32(input_tile.val[row])); + } + for (int64_t row = 0; row < 2; ++row) { + for (int64_t col = 0; col < 2; ++col) { + if (2 * oth + row < args.out_rows && + 2 * otw + col < args.out_cols) { + output[(2 * oth + row) * args.out_cols + 2 * otw + col] = + oblock[row][col]; + } + } + } + } + } + } +} + + static void MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( const MLAS_CONV_PARAMETERS* Parameters, const float* Input, const float* Filter, @@ -46,117 +287,23 @@ Routine Description: --*/ { - const size_t W = Parameters->InputShape[1]; - const float beta = Parameters->Beta; - - if (W > 1) { - - const float w00 = Filter[0]; - const float w01 = Filter[1]; - const float w02 = Filter[2]; - const float w10 = Filter[3]; - const float w11 = Filter[4]; - const float w12 = Filter[5]; - const float w20 = Filter[6]; - const float w21 = Filter[7]; - const float w22 = Filter[8]; - - const size_t H = Parameters->InputShape[0]; - const size_t pad_top = Parameters->Padding[0]; - const size_t pad_left = Parameters->Padding[1]; - const size_t stride_h = Parameters->StrideShape[0]; - const size_t stride_w = Parameters->StrideShape[1]; - - // We treat pad_left, pad_top are hard require. - // While pad_right and pad_bottom could be adjusted if they do not 100% match other parameters. - const size_t pad_right = (((Parameters->OutputShape[1] - 1) * stride_w + 3) > (pad_left + W)) ? 1 : 0; - - const float* row0 = (pad_top > 0) ? Zeros : (Input - pad_left); - // Need to handle effective pad_bottom is 2 when H == 1 - const float* row1 = (H + pad_top <= 1) ? Zeros : (Input + (1 - pad_top) * W) - pad_left; - const float* row2 = (H + pad_top <= 2) ? 
Zeros : (row1 + W); - - for (size_t h = 0, out_row = Parameters->OutputShape[0]; out_row > 0; --out_row) { - auto out_col = Parameters->OutputShape[1]; - - if (pad_left == 1) { - float dotsum = w01 * row0[1] + w02 * row0[2] + w11 * row1[1] + w12 * row1[2] + - w21 * row2[1] + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - out_col--; - row0 += stride_w; - row1 += stride_w; - row2 += stride_w; - } - - for (; out_col > pad_right; out_col--) { - float dotsum = w00 * row0[0] + w01 * row0[1] + w02 * row0[2] + w10 * row1[0] + - w11 * row1[1] + w12 * row1[2] + w20 * row2[0] + w21 * row2[1] + - w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - row0 += stride_w; - row1 += stride_w; - row2 += stride_w; - } + MLAS_UNREFERENCED_PARAMETER(Zeros); - if (out_col == 1) { // pad_right == 1 - float dotsum = w00 * row0[0] + w01 * row0[1] + w10 * row1[0] + w11 * row1[1] + - w20 * row2[0] + w21 * row2[1] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - } - - h += stride_h; - row0 = (Input + (h - pad_top) * W) - pad_left; - row1 = row0 + W; - row2 = (h + 2 >= H + pad_top) ? Zeros : (row1 + W); - } - - } else { // W == 1 - - const size_t H = Parameters->InputShape[0]; - const size_t pad_left = Parameters->Padding[1]; - const size_t pad_top = Parameters->Padding[0]; - const size_t stride_h = Parameters->StrideShape[0]; - size_t out_row = Parameters->OutputShape[0]; - - // Make sure pad_bottom is consistent with other parameters. - size_t pad_bottom = ((out_row - 1) * stride_h + 3) > (pad_top + H) ? - ((out_row - 1) * stride_h + 3) - (pad_top + H) : 0; - - const float w0 = Filter[pad_left ? 1 : 0]; - const float w1 = Filter[pad_left ? 4 : 3]; - const float w2 = Filter[pad_left ? 7 : 6]; - auto init_v = (beta == 0.f ? 0.f : *Output * beta); + Arguments args; + args.in_rows = Parameters->InputShape[0]; + args.in_cols = Parameters->InputShape[1]; + + args.out_rows = Parameters->OutputShape[0]; + args.out_cols = Parameters->OutputShape[1]; - if (pad_top == 1) { - *Output++ = w1 * Input[0] + w2 * ((H + pad_top <= 2) ? 0.0f : Input[1]) + init_v; - out_row--; - } - - for (const float* row = Input + pad_top * stride_h - pad_top; out_row > pad_bottom; --out_row) { - // All pixels are in the input col - auto init = (beta == 0.f ? 0.f : *Output * beta); - *Output++ = w0 * row[0] + w1 * row[1] + w2 * row[2] + init; - row += stride_h; - } - - if (out_row > 0) { - // last 1 or 2 rows are from the padding zero row. 
- // out_row == 1 when arrive here - if (pad_bottom == 1) { - const float* row = Input + H - 2; - *Output++ = w0 * row[0] + w1 * row[1] + init_v; - } else { // pad_bottom == 2 and H == 1 and padding_top == 0 - *Output++ = w0 * Input[0] + init_v; - } - } - } + args.pad_rows = Parameters->Padding[0]; + args.pad_cols = Parameters->Padding[1]; + convolution_depthwise3x3_winograd_impl(args, Input, Filter, Output); } -void -MlasConvDepthwiseFloat_CHW( +void MlasConvDepthwiseFloat_CHW( const MLAS_CONV_PARAMETERS* Parameters, const float* Input, const float* Filter, From 8f12c51ce910e95fac66e6e1c0dda3ee0bcf2bc1 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Tue, 2 Dec 2025 23:56:58 -0800 Subject: [PATCH 13/31] Try --- onnxruntime/test/optimizer/conv_add_act_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc index f61f9b29d9cce..a557f7a15be2b 100644 --- a/onnxruntime/test/optimizer/conv_add_act_test.cc +++ b/onnxruntime/test/optimizer/conv_add_act_test.cc @@ -18,8 +18,8 @@ void TestConvPath(const std::vector& input_shape, const std::vector(input_shape, -31, 31); auto* output_arg = builder.MakeOutput(); - auto* bias_arg = builder.MakeInitializer({weights_shape[0]}, -20.f, 20.f); - auto* add_arg = builder.MakeInput(output_shape, -20.f, 20.f); + auto* bias_arg = builder.MakeInitializer({weights_shape[0]}, 0.f, 0.f); + auto* add_arg = builder.MakeInput(output_shape, 0.f, 0.f); auto* weight_arg = builder.MakeInitializer(weights_shape, -2.f, 2.f); auto* conv_out_arg = builder.MakeIntermediate(); From 67b680130bb1ef43fda7026b6a4ed624a8292edc Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Wed, 3 Dec 2025 05:42:05 -0800 Subject: [PATCH 14/31] More changes --- onnxruntime/core/mlas/lib/convolve.cpp | 3 +++ onnxruntime/test/optimizer/conv_add_act_test.cc | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 098b800b81353..628a99a424384 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1462,6 +1462,9 @@ Return Value: && Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3 && Parameters->Padding[0] <= 0 && Parameters->Padding[1] <= 0 && Parameters->Padding[2] <= 0 && Parameters->Padding[3] <= 0 +#if defined(MLAS_TARGET_ARM64) + && Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1 +#endif && Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) { *WorkingBufferSize = Parameters->InputShape[1] + 2; diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc index a557f7a15be2b..1797ede3d24c5 100644 --- a/onnxruntime/test/optimizer/conv_add_act_test.cc +++ b/onnxruntime/test/optimizer/conv_add_act_test.cc @@ -18,7 +18,7 @@ void TestConvPath(const std::vector& input_shape, const std::vector(input_shape, -31, 31); auto* output_arg = builder.MakeOutput(); - auto* bias_arg = builder.MakeInitializer({weights_shape[0]}, 0.f, 0.f); + auto* bias_arg = builder.MakeInitializer({weights_shape[0]}, -20.f, 20.f); auto* add_arg = builder.MakeInput(output_shape, 0.f, 0.f); auto* weight_arg = builder.MakeInitializer(weights_shape, -2.f, 2.f); auto* conv_out_arg = builder.MakeIntermediate(); From 01b43fb68a0c1740577a8693c55d606bb4f71e23 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Wed, 3 Dec 2025 19:55:15 -0800 Subject: [PATCH 15/31] Relax padding 
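
Restores the pads <= 1 acceptance that PATCH 12 tightened to pads == 0: the
padded output tiles are handled by the bounds-checked block fallback in
convolution_depthwise3x3_winograd_impl, which zero-fills out-of-range taps,
so only the ARM64 stride == 1 restriction from the previous patch needs to
stay.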
--- onnxruntime/core/mlas/lib/convolve.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 628a99a424384..7cbc7de86238a 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1460,8 +1460,8 @@ Return Value: if (Dimensions == 2 && Parameters->FilterCount == 1 && Parameters->InputChannels == 1 && Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3 - && Parameters->Padding[0] <= 0 && Parameters->Padding[1] <= 0 - && Parameters->Padding[2] <= 0 && Parameters->Padding[3] <= 0 + && Parameters->Padding[0] <= 1 && Parameters->Padding[1] <= 1 + && Parameters->Padding[2] <= 1 && Parameters->Padding[3] <= 1 #if defined(MLAS_TARGET_ARM64) && Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1 #endif From ea8339475baad05ef3a51059a501a94df6b628e3 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Thu, 4 Dec 2025 00:48:10 -0800 Subject: [PATCH 16/31] Vanilla NEON Depthwise --- .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 255 +++++++++++++++++- 1 file changed, 251 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp index c559399cdcfd3..d147ea6276566 100644 --- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp @@ -260,6 +260,222 @@ void convolution_depthwise3x3_winograd_impl( } } +MLAS_FORCEINLINE float DepthwiseSampleValue( + const float* row, + ptrdiff_t col, + size_t width +) +{ + if (row == nullptr || col < 0 || col >= static_cast(width)) { + return 0.0f; + } + return row[col]; +} + +MLAS_FORCEINLINE float DepthwiseAccumulateRowScalar( + float acc, + const float* row, + size_t base, + float w0, + float w1, + float w2 +) +{ + if (row == nullptr) { + return acc; + } + + acc += row[base] * w0; + acc += row[base + 1] * w1; + acc += row[base + 2] * w2; + return acc; +} + +MLAS_FORCEINLINE void DepthwiseAccumulateRowVector( + float32x4_t& acc, + const float* row, + size_t base, + float w0, + float w1, + float w2 +) +{ + if (row == nullptr) { + return; + } + + const float* r = row + base; + const float32x4_t c0 = vld1q_f32(r); + const float32x4_t c1 = vld1q_f32(r + 1); + const float32x4_t c2 = vld1q_f32(r + 2); + + acc = vmlaq_n_f32(acc, c0, w0); + acc = vmlaq_n_f32(acc, c1, w1); + acc = vmlaq_n_f32(acc, c2, w2); +} + +MLAS_FORCEINLINE float DepthwiseComputeEdge( + const float* row0, + const float* row1, + const float* row2, + ptrdiff_t iw, + size_t width, + const float w00, + const float w01, + const float w02, + const float w10, + const float w11, + const float w12, + const float w20, + const float w21, + const float w22 +) +{ + float acc = 0.0f; + const ptrdiff_t c0 = iw; + const ptrdiff_t c1 = iw + 1; + const ptrdiff_t c2 = iw + 2; + + acc += DepthwiseSampleValue(row0, c0, width) * w00; + acc += DepthwiseSampleValue(row0, c1, width) * w01; + acc += DepthwiseSampleValue(row0, c2, width) * w02; + acc += DepthwiseSampleValue(row1, c0, width) * w10; + acc += DepthwiseSampleValue(row1, c1, width) * w11; + acc += DepthwiseSampleValue(row1, c2, width) * w12; + acc += DepthwiseSampleValue(row2, c0, width) * w20; + acc += DepthwiseSampleValue(row2, c1, width) * w21; + acc += DepthwiseSampleValue(row2, c2, width) * w22; + + return acc; +} + +static void DepthwiseConv3x3Stride1PadLe1Neon( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, 
+ float* Output, + const float* Zeros +) +{ + MLAS_UNREFERENCED_PARAMETER(Zeros); + + const size_t H = Parameters->InputShape[0]; + const size_t W = Parameters->InputShape[1]; + const size_t out_rows = Parameters->OutputShape[0]; + const size_t out_cols = Parameters->OutputShape[1]; + + const size_t pad_top = Parameters->Padding[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t pad_right = Parameters->Padding[3]; + + const float beta = Parameters->Beta; + const bool accumulate_output = beta != 0.0f; + + const float w00 = Filter[0]; + const float w01 = Filter[1]; + const float w02 = Filter[2]; + const float w10 = Filter[3]; + const float w11 = Filter[4]; + const float w12 = Filter[5]; + const float w20 = Filter[6]; + const float w21 = Filter[7]; + const float w22 = Filter[8]; + + for (size_t oh = 0; oh < out_rows; ++oh) { + const ptrdiff_t ih = static_cast(oh) - static_cast(pad_top); + + const ptrdiff_t row0_index = ih - 1; + const ptrdiff_t row1_index = ih; + const ptrdiff_t row2_index = ih + 1; + + const float* row0 = nullptr; + const float* row1 = nullptr; + const float* row2 = nullptr; + + if (row0_index >= 0 && row0_index < static_cast(H)) { + row0 = Input + static_cast(row0_index) * W; + } + if (row1_index >= 0 && row1_index < static_cast(H)) { + row1 = Input + static_cast(row1_index) * W; + } + if (row2_index >= 0 && row2_index < static_cast(H)) { + row2 = Input + static_cast(row2_index) * W; + } + + float* out_row = Output + oh * out_cols; + size_t ow = 0; + + if (pad_left && ow < out_cols) { + const ptrdiff_t iw = static_cast(ow) - static_cast(pad_left); + float acc = DepthwiseComputeEdge( + row0, row1, row2, iw, W, + w00, w01, w02, w10, w11, w12, w20, w21, w22 + ); + if (accumulate_output) { + acc += beta * out_row[ow]; + } + out_row[ow++] = acc; + } + + size_t interior_cols = 0; + if (out_cols > pad_left + pad_right) { + interior_cols = out_cols - pad_left - pad_right; + } + + size_t processed = 0; + while (processed + 4 <= interior_cols) { + const ptrdiff_t iw = static_cast(ow) - static_cast(pad_left); + if ((iw + 5) >= static_cast(W)) { + break; + } + + const size_t base = static_cast(iw); + float32x4_t acc = vdupq_n_f32(0.0f); + + DepthwiseAccumulateRowVector(acc, row0, base, w00, w01, w02); + DepthwiseAccumulateRowVector(acc, row1, base, w10, w11, w12); + DepthwiseAccumulateRowVector(acc, row2, base, w20, w21, w22); + + if (accumulate_output) { + const float32x4_t prev = vld1q_f32(out_row + ow); + acc = vmlaq_n_f32(acc, prev, beta); + } + + vst1q_f32(out_row + ow, acc); + ow += 4; + processed += 4; + } + + for (; processed < interior_cols; ++processed) { + const ptrdiff_t iw = static_cast(ow) - static_cast(pad_left); + const size_t base = static_cast(iw); + + float acc = 0.0f; + acc = DepthwiseAccumulateRowScalar(acc, row0, base, w00, w01, w02); + acc = DepthwiseAccumulateRowScalar(acc, row1, base, w10, w11, w12); + acc = DepthwiseAccumulateRowScalar(acc, row2, base, w20, w21, w22); + + if (accumulate_output) { + acc += beta * out_row[ow]; + } + out_row[ow++] = acc; + } + + if (pad_right && ow < out_cols) { + const ptrdiff_t iw = static_cast(ow) - static_cast(pad_left); + float acc = DepthwiseComputeEdge( + row0, row1, row2, iw, W, + w00, w01, w02, w10, w11, w12, w20, w21, w22 + ); + if (accumulate_output) { + acc += beta * out_row[ow]; + } + out_row[ow++] = acc; + } + } +} + +/* static void MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( const MLAS_CONV_PARAMETERS* Parameters, const float* Input, @@ -267,7 +483,7 @@ void 
convolution_depthwise3x3_winograd_impl( float* Output, const float* Zeros ) -/*++ +++ Routine Description: @@ -285,14 +501,14 @@ Routine Description: Zeroes - Point to working buffer where all 0.0f are filled. ---*/ +-- { MLAS_UNREFERENCED_PARAMETER(Zeros); Arguments args; args.in_rows = Parameters->InputShape[0]; args.in_cols = Parameters->InputShape[1]; - + args.out_rows = Parameters->OutputShape[0]; args.out_cols = Parameters->OutputShape[1]; @@ -301,7 +517,39 @@ Routine Description: convolution_depthwise3x3_winograd_impl(args, Input, Filter, Output); } +*/ +static +void +MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute convolution on one channel input with one filter channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. + + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +--*/ +{ + DepthwiseConv3x3Stride1PadLe1Neon(Parameters, Input, Filter, Output, Zeros); +} void MlasConvDepthwiseFloat_CHW( const MLAS_CONV_PARAMETERS* Parameters, @@ -338,4 +586,3 @@ Routine Description: { MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros); } - From dd94a3b0883f584d9b6b02ea3eeda46b9ddfa377 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Thu, 4 Dec 2025 02:23:58 -0800 Subject: [PATCH 17/31] Fix indexing --- onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp index d147ea6276566..02c34bf83990c 100644 --- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp @@ -384,9 +384,9 @@ static void DepthwiseConv3x3Stride1PadLe1Neon( for (size_t oh = 0; oh < out_rows; ++oh) { const ptrdiff_t ih = static_cast(oh) - static_cast(pad_top); - const ptrdiff_t row0_index = ih - 1; - const ptrdiff_t row1_index = ih; - const ptrdiff_t row2_index = ih + 1; + const ptrdiff_t row0_index = ih; + const ptrdiff_t row1_index = ih + 1; + const ptrdiff_t row2_index = ih + 2; const float* row0 = nullptr; const float* row1 = nullptr; From ffd291adfe1b25464416ebc24cae97aaf33e50c9 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Thu, 4 Dec 2025 06:54:23 -0800 Subject: [PATCH 18/31] Add benchmark --- onnxruntime/test/mlas/bench/bench_sconv.cpp | 5 +++++ onnxruntime/test/optimizer/conv_add_act_test.cc | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp index dc37980002978..2a41a6a6667b2 100644 --- a/onnxruntime/test/mlas/bench/bench_sconv.cpp +++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp @@ -326,6 +326,11 @@ static void TeamsModel(benchmark::internal::Benchmark* b) { b->Args({2, 1, 1, 12, 12, 48, 80, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}); // fused Conv_376 => 48x80 b->Args({2, 1, 1, 12, 72, 48, 80, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}); // Conv_59 => 24x40 
+
+  b->Args({2, 1, 256, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+  b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+  b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+
 }
 
 BENCHMARK_CAPTURE(SCONV_NCHW, TeamsModel, "")->Apply(TeamsModel)->UseRealTime();
 
diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc
index 1797ede3d24c5..f61f9b29d9cce 100644
--- a/onnxruntime/test/optimizer/conv_add_act_test.cc
+++ b/onnxruntime/test/optimizer/conv_add_act_test.cc
@@ -19,7 +19,7 @@ void TestConvPath(const std::vector<int64_t>& input_shape, const std::vector<int64_t>& weights_shape,
   auto* input_arg = builder.MakeInput<float>(input_shape, -31, 31);
   auto* output_arg = builder.MakeOutput();
   auto* bias_arg = builder.MakeInitializer<float>({weights_shape[0]}, -20.f, 20.f);
-  auto* add_arg = builder.MakeInput<float>(output_shape, 0.f, 0.f);
+  auto* add_arg = builder.MakeInput<float>(output_shape, -20.f, 20.f);
   auto* weight_arg = builder.MakeInitializer<float>(weights_shape, -2.f, 2.f);
   auto* conv_out_arg = builder.MakeIntermediate();

From 92fb60450745beb41690a738460f1303775e654b Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 07:03:53 -0800
Subject: [PATCH 19/31] Add lambda

---
 onnxruntime/core/mlas/lib/convolve.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp
index 7cbc7de86238a..288109bd8e79f 100644
--- a/onnxruntime/core/mlas/lib/convolve.cpp
+++ b/onnxruntime/core/mlas/lib/convolve.cpp
@@ -1454,17 +1454,25 @@ Return Value:
 #if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
 
         // Scalar (WASM_SCALAR) / vectorized (ARM64) direct conv for depthwise convolution.
-        // Currently only support 3x3 kernel with padding <=1 and dilations = 1.
+        // Currently only support 3x3 kernel with padding <=1 and dilations = 1
+        // and on ARM64, it is further restricted to strides = 1.
         // TODO: support more general depthwise convolution.
 
+        bool depthwise_conv_stride_support_check = [&]() {
+            // On ARM64, only support stride = 1 for depthwise conv.
+#if defined(MLAS_TARGET_ARM64)
+            return Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1;
+#else
+            return true;
+#endif
+        }();
+
         if (Dimensions == 2
             && Parameters->FilterCount == 1 && Parameters->InputChannels == 1
             && Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3
             && Parameters->Padding[0] <= 1 && Parameters->Padding[1] <= 1
             && Parameters->Padding[2] <= 1 && Parameters->Padding[3] <= 1
-#if defined(MLAS_TARGET_ARM64)
-            && Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1
-#endif
+            && depthwise_conv_stride_support_check
             && Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) {
 
             *WorkingBufferSize = Parameters->InputShape[1] + 2;

From d15bb9303eef43b06276acac334c8204919c8a58 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 07:06:22 -0800
Subject: [PATCH 20/31] Rework

---
 .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index 02c34bf83990c..c70ae134d6e42 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -357,8 +357,6 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
     const float* Zeros
     )
 {
-    MLAS_UNREFERENCED_PARAMETER(Zeros);
-
     const size_t H = Parameters->InputShape[0];
     const size_t W = Parameters->InputShape[1];
     const size_t out_rows = Parameters->OutputShape[0];
@@ -525,8 +523,7 @@ MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(
     const MLAS_CONV_PARAMETERS* Parameters,
     const float* Input,
     const float* Filter,
-    float* Output,
-    const float* Zeros
+    float* Output
     )
 /*++
 
@@ -544,11 +541,9 @@ Routine Description:
 
     Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
 
-    Zeroes - Point to working buffer where all 0.0f are filled.
-
 --*/
 {
-    DepthwiseConv3x3Stride1PadLe1Neon(Parameters, Input, Filter, Output, Zeros);
+    DepthwiseConv3x3Stride1PadLe1Neon(Parameters, Input, Filter, Output);
 }
 
 void
 MlasConvDepthwiseFloat_CHW(
 
     Note: No checking here as it is inner loop. Logic in generating Parameters controls the check.
 
-    Currently only support 2d kernel 3x3.
+    Currently only support 2d kernel 3x3 with strides=1, dilations=1, pads<=1.
     Will add general case and more special case if needed later.
 --*/
 {
-    MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros);
+    MLAS_UNREFERENCED_PARAMETER(Zeros);
+    MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output);
 }

From 119ec9a760bf61a58b03c32310cdce7c08c8def3 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 07:14:29 -0800
Subject: [PATCH 21/31] Update onnxruntime/test/mlas/bench/bench_sconv.cpp

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 onnxruntime/test/mlas/bench/bench_sconv.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp
index 2a41a6a6667b2..4330b6e47a63d 100644
--- a/onnxruntime/test/mlas/bench/bench_sconv.cpp
+++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp
@@ -330,7 +330,6 @@ static void TeamsModel(benchmark::internal::Benchmark* b) {
   b->Args({2, 1, 256, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
   b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
   b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
-
 }
 
 BENCHMARK_CAPTURE(SCONV_NCHW, TeamsModel, "")->Apply(TeamsModel)->UseRealTime();

From d0fc1431b26aac4e0639d67ef94adb0c299b0087 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 07:30:29 -0800
Subject: [PATCH 22/31] Fix

---
 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index c70ae134d6e42..9a5d5f1ebae8a 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -353,8 +353,7 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
     const MLAS_CONV_PARAMETERS* Parameters,
     const float* Input,
     const float* Filter,
-    float* Output,
-    const float* Zeros
+    float* Output
     )
 {
     const size_t H = Parameters->InputShape[0];

From 2820a84261123499e6ddb03e734810d8f6ad98ed Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 20:23:31 -0800
Subject: [PATCH 23/31] Remove Winograd implementation

---
 .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 286 ------------------
 1 file changed, 286 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index 9a5d5f1ebae8a..c47103a3963b2 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -18,248 +18,6 @@ Module Name:
 
 #include "mlasi.h"
 #include <arm_neon.h>
 
-#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
-#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
-#else
-#define C10_LIKELY(expr) (expr)
-#endif
-
-
-#if defined(_MSC_VER) && defined(MLAS_TARGET_ARM64)
-MLAS_FORCEINLINE float32x4_t operator+(float32x4_t lhs, float32x4_t rhs)
-{
-    return vaddq_f32(lhs, rhs);
-}
-
-MLAS_FORCEINLINE float32x4_t operator-(float32x4_t lhs, float32x4_t rhs)
-{
-    return vsubq_f32(lhs, rhs);
-}
-
-MLAS_FORCEINLINE float32x4_t operator*(float32x4_t lhs, float32x4_t rhs)
-{
-    return vmulq_f32(lhs, rhs);
-}
-
-MLAS_FORCEINLINE float32x4_t operator-(float32x4_t value)
-{
-    return vnegq_f32(value);
-}
-#endif
-
-struct Arguments final {
-    // Input layer dimensions
-    int64_t in_rows;
-    int64_t in_cols;
-
-    // Output layer dimensions
-    int64_t out_rows;
-    int64_t out_cols;
-
-    // Padding info
-    int64_t pad_rows;
-    int64_t pad_cols;
-};
-
-inline void winograd_f2k3_input_transform_inplace__neon(
-    float32x4_t* const d0,
-    float32x4_t* const d1,
-    float32x4_t* const d2,
-    float32x4_t* const d3
-)
-{
-    const float32x4_t wd0 = *d0 - *d2;
-    const float32x4_t wd1 = *d1 + *d2;
-    const float32x4_t wd2 = -*d1 + *d2;
-    const float32x4_t wd3 = *d1 - *d3;
-    *d0 = wd0;
-    *d1 = wd1;
-    *d2 = wd2;
-    *d3 = wd3;
-}
-
-inline void winograd_f2k3_output_transform_inplace__neon(
-    float32x4_t* const m0,
-    float32x4_t* const m1,
-    const float32x4_t* const m2,
-    const float32x4_t* const m3
-)
-{
-    *m0 = *m0 + *m1 + *m2;
-    *m1 = *m1 - *m2 - *m3;
-}
-
-inline float32x4_t vmuladdq_f32(const float32x4_t c, const float32x4_t a, const float32x4_t b)
-{
-    return vfmaq_f32(c, a, b);
-    // TODO: Support ARMv7
-}
-
-inline float32x4_t vmulsubq_f32(const float32x4_t c, const float32x4_t a, const float32x4_t b)
-{
-    return vfmsq_f32(c, a, b);
-    // TODO: Support ARMv7
-}
-
-inline void winograd_f2k3_kernel_transform__neon(
-    const float32x4_t g0,
-    const float32x4_t g1,
-    const float32x4_t g2,
-    float32x4_t* const transform0,
-    float32x4_t* const transform1,
-    float32x4_t* const transform2,
-    float32x4_t* const transform3
-)
-{
-    const float32x4_t const_half = vdupq_n_f32(0.5f);
-    float32x4_t half_g0_plus_g2 = const_half * (g0 + g2);
-    *transform0 = g0;
-    *transform1 = vmuladdq_f32(half_g0_plus_g2, const_half, g1);
-    *transform2 = vmulsubq_f32(half_g0_plus_g2, const_half, g1);
-    *transform3 = g2;
-}
-
-inline float32x4x4_t v4f_transpose4x4__neon(const float32x4x4_t m)
-{
-    float32x4x4_t ret;
-    vst4q_f32((float*)(&ret), m);
-    return ret;
-}
-
-void convolution_depthwise3x3_winograd_impl(
-    const Arguments& args,
-    const float* const input,
-    const float* const kernel,
-    float* const output
-)
-{
-    //const float32x4_t vbias = vsetq_lane_f32(*bias, vdupq_n_f32(0.0), 1);
-    float32x4x4_t kernel_tile;
-
-    {
-        const float32x4_t g0 = vld1q_f32(kernel);
-        const float32x4_t g1 = vld1q_f32(kernel + 3);
-        // g2[3] is junk
-        const float32x4_t g2 =
-            vextq_f32(vld1q_f32(kernel + 5), vld1q_f32(kernel + 5), 1);
-        float32x4x4_t w;
-        winograd_f2k3_kernel_transform__neon(
-            g0, g1, g2, &w.val[0], &w.val[1], &w.val[2], &w.val[3]
-        );
-        w = v4f_transpose4x4__neon(w);
-
-        winograd_f2k3_kernel_transform__neon(
-            w.val[0],
-            w.val[1],
-            w.val[2],
-            &kernel_tile.val[0],
-            &kernel_tile.val[1],
-            &kernel_tile.val[2],
-            &kernel_tile.val[3]
-        );
-    }
-
-    #define TILE \
-        winograd_f2k3_input_transform_inplace__neon( \
-            &input_tile.val[0], \
-            &input_tile.val[1], \
-            &input_tile.val[2], \
-            &input_tile.val[3] \
-        ); \
-        input_tile = v4f_transpose4x4__neon(input_tile); \
-        winograd_f2k3_input_transform_inplace__neon( \
-            &input_tile.val[0], \
-            &input_tile.val[1], \
-            &input_tile.val[2], \
-            &input_tile.val[3] \
-        ); \
- \
-        for (size_t row = 0; row < 4; ++row) { \
-            input_tile.val[row] = \
-                vmulq_f32(input_tile.val[row], kernel_tile.val[row]); \
-        } \
- \
-        winograd_f2k3_output_transform_inplace__neon( \
-            &input_tile.val[0], \
-            &input_tile.val[1], \
-            &input_tile.val[2], \
-            &input_tile.val[3] \
-        ); \
-        input_tile = v4f_transpose4x4__neon(input_tile); \
-        winograd_f2k3_output_transform_inplace__neon( \
-            &input_tile.val[0], \
-            &input_tile.val[1], \
-            &input_tile.val[2], \
-            &input_tile.val[3] \
-        )
-
-    // Non-padded regime.
-
-    // Iterate over non-padded output tiles.
-    // TODO: avoid spilling W by breaking out the non-padded vs padded case.
-    for (int64_t oth = 0; oth < (args.out_rows + 1) / 2; ++oth) {
-        for (int64_t otw = 0; otw < (args.out_cols + 1) / 2; ++otw) {
-            // load input tile for [oth, otw];
-            int64_t ih = oth * 2 - args.pad_rows;
-            int64_t iw = otw * 2 - args.pad_cols;
-            // fast-path, all accesses in-bounds
-            if (C10_LIKELY(
-                    ih >= 0 && iw >= 0 && ih + 3 < args.in_rows &&
-                    iw + 3 < args.in_cols && 2 * oth + 1 < args.out_rows &&
-                    2 * otw + 1 < args.out_cols
-                )) {
-                float32x4x4_t input_tile;
-                for (int64_t row = 0; row < 4; ++row) {
-                    input_tile.val[row] =
-                        vld1q_f32(input + (ih + row) * args.in_cols + iw);
-                }
-
-                TILE;
-
-                for (int64_t row = 0; row < 2; ++row) {
-                    vst1_f32(
-                        output + (oth * 2 + row) * args.out_cols + otw * 2,
-                        vget_low_f32(input_tile.val[row])
-                    );
-                }
-            } else {
-                float block[4][4];
-                for (int64_t row = 0; row < 4; ++row) {
-                    for (int64_t col = 0; col < 4; ++col) {
-                        if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows &&
-                            iw + col < args.in_cols) {
-                            block[row][col] = input[(ih + row) * args.in_cols + iw + col];
-                        } else {
-                            block[row][col] = 0.0;
-                        }
-                    }
-                }
-
-                float32x4x4_t input_tile;
-                for (int64_t row = 0; row < 4; ++row) {
-                    input_tile.val[row] = vld1q_f32(&block[row][0]);
-                }
-
-                TILE;
-
-                float oblock[2][2];
-                for (int64_t row = 0; row < 2; ++row) {
-                    vst1_f32(&oblock[row][0], vget_low_f32(input_tile.val[row]));
-                }
-                for (int64_t row = 0; row < 2; ++row) {
-                    for (int64_t col = 0; col < 2; ++col) {
-                        if (2 * oth + row < args.out_rows &&
-                            2 * otw + col < args.out_cols) {
-                            output[(2 * oth + row) * args.out_cols + 2 * otw + col] =
-                                oblock[row][col];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 MLAS_FORCEINLINE float DepthwiseSampleValue(
     const float* row,
     ptrdiff_t col,
@@ -472,50 +230,6 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
     }
 }
 
-/*
-    static void MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(
-    const MLAS_CONV_PARAMETERS* Parameters,
-    const float* Input,
-    const float* Filter,
-    float* Output,
-    const float* Zeros
-    )
-++
-
-Routine Description:
-
-    This routine is an inner kernel to compute convolution on one channel input with one filter channel.
-
-Arguments:
-
-    Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc.
-
-    Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data.
-
-    Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data.
-
-    Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
-
-    Zeroes - Point to working buffer where all 0.0f are filled.
-
---
-{
-    MLAS_UNREFERENCED_PARAMETER(Zeros);
-
-    Arguments args;
-    args.in_rows = Parameters->InputShape[0];
-    args.in_cols = Parameters->InputShape[1];
-
-    args.out_rows = Parameters->OutputShape[0];
-    args.out_cols = Parameters->OutputShape[1];
-
-    args.pad_rows = Parameters->Padding[0];
-    args.pad_cols = Parameters->Padding[1];
-
-    convolution_depthwise3x3_winograd_impl(args, Input, Filter, Output);
-}
-*/
-
 static
 void
 MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(

From 0ffb811911fbaa38493930ecffcc076b36781bf4 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:21:58 -0800
Subject: [PATCH 24/31] Update onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index c47103a3963b2..c3783d516031f 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -278,7 +278,7 @@ Routine Description:
 
     Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data.
 
-    Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data.
+    Filter - Whole filters are of F x CpG x FH x FW, this filter points to single FH x FW filter data.
 
     Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
 
From 59e2b2dcff290ccebb0aaf9b50ea2354ecc02219 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:22:30 -0800
Subject: [PATCH 25/31] Update onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index c3783d516031f..a009a8f28cb59 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -282,7 +282,7 @@ Routine Description:
 
     Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
 
-    Zeroes - Point to working buffer where all 0.0f are filled.
+    Zeros - Point to working buffer where all 0.0f are filled.
 
     Note: No checking here as it is inner loop. Logic in generating Parameters controls the check.
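Note on the row indexing that [PATCH 17/31] corrects: with stride 1, ih = oh - pad_top is the top row of the receptive field for output row oh, so the three kernel taps read input rows ih, ih + 1 and ih + 2; the pre-fix code treated ih as the middle tap and read one row too high. The standalone C++ sketch below restates that convention; RowOrNull and SampleOrZero are hypothetical helpers written here to mirror the kernel's nullptr-row convention (out-of-image rows stay nullptr and sample as zero), not functions from the MLAS sources.

// Illustrative only: map one output row of a 3x3, stride-1, pad <= 1
// depthwise convolution to its three input rows.
#include <cstddef>

static inline const float* RowOrNull(const float* input, ptrdiff_t r, size_t H, size_t W)
{
    // Rows outside [0, H) have no backing data; callers treat nullptr as all zeros.
    return (r >= 0 && r < static_cast<ptrdiff_t>(H))
               ? input + r * static_cast<ptrdiff_t>(W)
               : nullptr;
}

static inline float SampleOrZero(const float* row, ptrdiff_t c, size_t W)
{
    // A nullptr row or an out-of-range column contributes zero padding.
    return (row != nullptr && c >= 0 && c < static_cast<ptrdiff_t>(W)) ? row[c] : 0.0f;
}

// For each output row oh (stride 1):
//     const ptrdiff_t ih = static_cast<ptrdiff_t>(oh) - static_cast<ptrdiff_t>(pad_top);
//     const float* row0 = RowOrNull(Input, ih,     H, W);  // top tap (the buggy code used ih - 1 here)
//     const float* row1 = RowOrNull(Input, ih + 1, H, W);  // middle tap
//     const float* row2 = RowOrNull(Input, ih + 2, H, W);  // bottom tap
// Each output column then sums SampleOrZero over the 3x3 window.

Keeping out-of-range rows as nullptr confines the bounds checks to the image borders, so the vectorized interior loop can run without per-element tests.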
From e34c930ac5641a267566144f5873b92282a108a9 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:22:49 -0800
Subject: [PATCH 26/31] Update onnxruntime/core/mlas/lib/convolve.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/convolve.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp
index 288109bd8e79f..7c7971061f780 100644
--- a/onnxruntime/core/mlas/lib/convolve.cpp
+++ b/onnxruntime/core/mlas/lib/convolve.cpp
@@ -1111,7 +1111,6 @@ Return Value:
 
     ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool);
 
-
    if (static_cast<size_t>(TargetThreadCount) >= BatchGroupCount) {
        TargetThreadCount = static_cast<ptrdiff_t>(BatchGroupCount);
    }

From 027e74292c071a5d8017c2480e8a9d2f2af6ccbe Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:23:08 -0800
Subject: [PATCH 27/31] Update onnxruntime/core/mlas/inc/mlas.h

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/inc/mlas.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index f59ac92e1a9c2..edf2bb7582a90 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -830,7 +830,7 @@ enum MLAS_CONV_ALGORITHM {
     MlasConvAlgorithmGemmDirect,
     MlasConvAlgorithmExpandThenGemm,
     MlasConvAlgorithmExpandThenGemmSegmented,
-#if defined(MLAS_TARGET_WASM_SCALAR) || defined (MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
     MlasConvAlgorithmDepthwise,
 #endif
 };

From f93ed67a1fdec09c75d84b33fc8eaf9d1af59503 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:23:39 -0800
Subject: [PATCH 28/31] Update onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index a009a8f28cb59..d3ec05ec92fac 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -248,11 +248,11 @@ Routine Description:
 
     Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc.
 
-    Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data.
+    Input - input channel data start. Input is NCHW, so this pointer points to single H x W image data.
 
-    Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data.
+    Filter - Whole filters are of F x CpG x FH x FW, this filter points to single FH x FW filter data.
 
-    Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
+    Output - whole output are of N x F x OH x OW. This pointer points to single OH x OW output image data.
From f5c1b812cf14f5b73b71159ec0ae5e2b94fb1059 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:31:36 -0800
Subject: [PATCH 29/31] Update onnxruntime/core/mlas/lib/convolve.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/convolve.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp
index 7c7971061f780..0349ce418d406 100644
--- a/onnxruntime/core/mlas/lib/convolve.cpp
+++ b/onnxruntime/core/mlas/lib/convolve.cpp
@@ -1457,14 +1457,12 @@ Return Value:
         // and on ARM64, it is further restricted to strides = 1.
         // TODO: support more general depthwise convolution.
 
-        bool depthwise_conv_stride_support_check = [&]() {
             // On ARM64, only support stride = 1 for depthwise conv.
 #if defined(MLAS_TARGET_ARM64)
-            return Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1;
+        bool depthwise_conv_stride_support_check = Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1;
 #else
-            return true;
+        bool depthwise_conv_stride_support_check = true;
 #endif
-        }();
 
         if (Dimensions == 2
             && Parameters->FilterCount == 1 && Parameters->InputChannels == 1

From f15e5540aa8cad568ea26d6be53c0b4e07992494 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 23:34:14 -0800
Subject: [PATCH 30/31] Benchmark updates

---
 onnxruntime/test/mlas/bench/bench_sconv.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp
index 4330b6e47a63d..849911e322214 100644
--- a/onnxruntime/test/mlas/bench/bench_sconv.cpp
+++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp
@@ -328,8 +328,8 @@ static void TeamsModel(benchmark::internal::Benchmark* b) {
   b->Args({2, 1, 1, 12, 72, 48, 80, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1});  // Conv_59 => 24x40
 
   b->Args({2, 1, 256, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
-  b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
-  b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+  b->Args({2, 1, 512, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+  b->Args({2, 1, 960, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
 }
 
 BENCHMARK_CAPTURE(SCONV_NCHW, TeamsModel, "")->Apply(TeamsModel)->UseRealTime();

From 60a2b81b5d2da6e04cdc34ee2184ee8287eec580 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Tue, 16 Dec 2025 07:10:30 -0800
Subject: [PATCH 31/31] Use MLAS helpers instead of using NEON intrinsics directly

---
 .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index d3ec05ec92fac..14b6b30c85bda 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -63,13 +63,13 @@ MLAS_FORCEINLINE void DepthwiseAccumulateRowVector(
     }
 
     const float* r = row + base;
-    const float32x4_t c0 = vld1q_f32(r);
-    const float32x4_t c1 = vld1q_f32(r + 1);
-    const float32x4_t c2 = vld1q_f32(r + 2);
+    const float32x4_t c0 = MlasLoadFloat32x4(r);
+    const float32x4_t c1 = MlasLoadFloat32x4(r + 1);
+    const float32x4_t c2 = MlasLoadFloat32x4(r + 2);
 
-    acc = vmlaq_n_f32(acc, c0, w0);
-    acc = vmlaq_n_f32(acc, c1, w1);
-    acc = vmlaq_n_f32(acc, c2, w2);
+    acc = MlasMultiplyAddFloat32x4(c0, w0, acc);
+    acc = MlasMultiplyAddFloat32x4(c1, w1, acc);
+    acc = MlasMultiplyAddFloat32x4(c2, w2, acc);
 }
 
 MLAS_FORCEINLINE float DepthwiseComputeEdge(
@@ -185,18 +185,18 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
         }
 
             const size_t base = static_cast<size_t>(iw);
-            float32x4_t acc = vdupq_n_f32(0.0f);
+            float32x4_t acc = MlasZeroFloat32x4();
 
             DepthwiseAccumulateRowVector(acc, row0, base, w00, w01, w02);
             DepthwiseAccumulateRowVector(acc, row1, base, w10, w11, w12);
             DepthwiseAccumulateRowVector(acc, row2, base, w20, w21, w22);
 
             if (accumulate_output) {
-                const float32x4_t prev = vld1q_f32(out_row + ow);
-                acc = vmlaq_n_f32(acc, prev, beta);
+                const float32x4_t prev = MlasLoadFloat32x4(out_row + ow);
+                acc = MlasMultiplyAddFloat32x4(prev, beta, acc);
             }
 
-            vst1q_f32(out_row + ow, acc);
+            MlasStoreFloat32x4(out_row + ow, acc);
             ow += 4;
             processed += 4;
         }
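Closing note on [PATCH 31/31]: the patch swaps raw NEON intrinsics for portable MLAS vector helpers while keeping the kernel's shape, four output columns per iteration with a sliding three-tap window per row. The minimal C++ sketch below restates that inner step; the wrapper definitions are local stand-ins inferred from the NEON mapping the diff implies (vld1q_f32, vdupq_n_f32, vmlaq_n_f32, vst1q_f32), not the actual definitions in mlasi.h.

// Illustrative only: the per-row accumulation that DepthwiseAccumulateRowVector
// performs, written against the wrapper names introduced in the patch.
// Compile for an ARM64 target.
#include <arm_neon.h>

static inline float32x4_t MlasZeroFloat32x4(void) { return vdupq_n_f32(0.0f); }
static inline float32x4_t MlasLoadFloat32x4(const float* p) { return vld1q_f32(p); }
static inline void MlasStoreFloat32x4(float* p, float32x4_t v) { vst1q_f32(p, v); }
static inline float32x4_t MlasMultiplyAddFloat32x4(float32x4_t v, float s, float32x4_t acc)
{
    return vmlaq_n_f32(acc, v, s);  // acc + v * s, lane by lane
}

// acc[j] += w0 * r[j] + w1 * r[j + 1] + w2 * r[j + 2] for j = 0..3.
// Three overlapping unaligned loads at offsets 0, 1 and 2 slide the 3-tap
// window across four consecutive output columns at once.
static inline float32x4_t AccumulateRow(float32x4_t acc, const float* r,
                                        float w0, float w1, float w2)
{
    acc = MlasMultiplyAddFloat32x4(MlasLoadFloat32x4(r), w0, acc);
    acc = MlasMultiplyAddFloat32x4(MlasLoadFloat32x4(r + 1), w1, acc);
    acc = MlasMultiplyAddFloat32x4(MlasLoadFloat32x4(r + 2), w2, acc);
    return acc;
}

// One vectorized output step, mirroring the patched kernel:
//     float32x4_t acc = MlasZeroFloat32x4();
//     acc = AccumulateRow(acc, row0 + iw, w00, w01, w02);
//     acc = AccumulateRow(acc, row1 + iw, w10, w11, w12);
//     acc = AccumulateRow(acc, row2 + iw, w20, w21, w22);
//     MlasStoreFloat32x4(out_row + ow, acc);

The overlapping loads only line up when the convolution stride is 1, which is one reason the ARM64 vector path is restricted to stride-1 depthwise convolutions.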