From 160c71e22c7bbfcfff09a3528198839d626dd1f6 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Sun, 30 Nov 2025 22:16:00 -0800 Subject: [PATCH 01/31] Initial commit --- onnxruntime/core/mlas/lib/convolve.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 9518134631f2d..fab0f742a125b 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -985,7 +985,7 @@ Return Value: return; } -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) if (Algorithm == MlasConvAlgorithmDepthwise) { // Fill the Working Buffer with Zero for use by the depthwise kernel. @@ -1082,7 +1082,7 @@ Return Value: break; } -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) case MlasConvAlgorithmDepthwise: { @@ -1337,9 +1337,9 @@ Return Value: } else { -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) - // Scalar direct conv for depthwise convolution. + // Scalar (WASM_SCALAR) / vectorized (ARM64) direct conv for depthwise convolution. // Currently only support 3x3 kernel with padding <=1 and dilations = 1. // TODO: support more general depthwise convolution. @@ -1411,8 +1411,8 @@ Return Value: if (Parameters->BatchCount > 1 || Parameters->GroupCount > 1) { - size_t WorkingBufferSizePerThread = std::max({Parameters->OutputSize * Parameters->K, - Parameters->FilterCount * Parameters->OutputSize, + size_t WorkingBufferSizePerThread = std::max({Parameters->OutputSize * Parameters->K, + Parameters->FilterCount * Parameters->OutputSize, static_cast(MLAS_CONV_WORKING_BUFFER_SIZE_PER_THREAD)}); TargetThreadCount = MaximumThreadCount; if (static_cast(TargetThreadCount) >= Parameters->BatchCount * Parameters->GroupCount) { From a134ea00ff98ff2a3c425f1f880cec201beeeb7a Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Sun, 30 Nov 2025 22:24:59 -0800 Subject: [PATCH 02/31] More changes --- onnxruntime/core/mlas/inc/mlas.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 9d98a15d8457a..edf2bb7582a90 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -830,7 +830,7 @@ enum MLAS_CONV_ALGORITHM { MlasConvAlgorithmGemmDirect, MlasConvAlgorithmExpandThenGemm, MlasConvAlgorithmExpandThenGemmSegmented, -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) MlasConvAlgorithmDepthwise, #endif }; From a44f7085ea60d3b79d4de4f762387722f060ad92 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 02:06:52 -0800 Subject: [PATCH 03/31] More changes --- cmake/onnxruntime_mlas.cmake | 2 +- .../core/mlas/lib/sconv_kernel_neon.cpp | 180 +++++++++++++++++- 2 files changed, 180 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index c0ab948b41fff..e113a9f0cc81c 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -307,7 +307,6 @@ endfunction() function (setup_arm_neon_nchwc) target_sources(onnxruntime_mlas PRIVATE ${MLAS_SRC_DIR}/sconv.h - ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp ${MLAS_SRC_DIR}/spool_kernel_neon.cpp ) list(APPEND mlas_private_compile_definitions MLAS_USE_ARM_NEON_NCHWC) @@ -460,6 +459,7 @@ else() ${MLAS_SRC_DIR}/eltwise_kernel_neon.h 
${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp + ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp ) # Conditionally add the SVE implementation if compiler supports it diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp index 4c5f50adb929c..3909495f24743 100644 --- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp @@ -14,9 +14,187 @@ Module Name: --*/ -#if defined(MLAS_USE_ARM_NEON_NCHWC) #include "mlasi.h" + +static +void +MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute convolution on one channel input with one filter channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. + + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +--*/ +{ + const size_t W = Parameters->InputShape[1]; + const float beta = Parameters->Beta; + + if (W > 1) { + + const float w00 = Filter[0]; + const float w01 = Filter[1]; + const float w02 = Filter[2]; + const float w10 = Filter[3]; + const float w11 = Filter[4]; + const float w12 = Filter[5]; + const float w20 = Filter[6]; + const float w21 = Filter[7]; + const float w22 = Filter[8]; + + const size_t H = Parameters->InputShape[0]; + const size_t pad_top = Parameters->Padding[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t stride_h = Parameters->StrideShape[0]; + const size_t stride_w = Parameters->StrideShape[1]; + + // We treat pad_left, pad_top are hard require. + // While pad_right and pad_bottom could be adjusted if they do not 100% match other parameters. + const size_t pad_right = (((Parameters->OutputShape[1] - 1) * stride_w + 3) > (pad_left + W)) ? 1 : 0; + + const float* row0 = (pad_top > 0) ? Zeros : (Input - pad_left); + // Need to handle effective pad_bottom is 2 when H == 1 + const float* row1 = (H + pad_top <= 1) ? Zeros : (Input + (1 - pad_top) * W) - pad_left; + const float* row2 = (H + pad_top <= 2) ? Zeros : (row1 + W); + + for (size_t h = 0, out_row = Parameters->OutputShape[0]; out_row > 0; --out_row) { + auto out_col = Parameters->OutputShape[1]; + + if (pad_left == 1) { + float dotsum = w01 * row0[1] + w02 * row0[2] + w11 * row1[1] + w12 * row1[2] + + w21 * row2[1] + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + out_col--; + row0 += stride_w; + row1 += stride_w; + row2 += stride_w; + } + + for (; out_col > pad_right; out_col--) { + float dotsum = w00 * row0[0] + w01 * row0[1] + w02 * row0[2] + w10 * row1[0] + + w11 * row1[1] + w12 * row1[2] + w20 * row2[0] + w21 * row2[1] + + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + row0 += stride_w; + row1 += stride_w; + row2 += stride_w; + } + + if (out_col == 1) { // pad_right == 1 + float dotsum = w00 * row0[0] + w01 * row0[1] + w10 * row1[0] + w11 * row1[1] + + w20 * row2[0] + w21 * row2[1] + (beta == 0.f ? 
0.f : *Output * beta); + *Output++ = dotsum; + } + + h += stride_h; + row0 = (Input + (h - pad_top) * W) - pad_left; + row1 = row0 + W; + row2 = (h + 2 >= H + pad_top) ? Zeros : (row1 + W); + } + + } else { // W == 1 + + const size_t H = Parameters->InputShape[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t pad_top = Parameters->Padding[0]; + const size_t stride_h = Parameters->StrideShape[0]; + size_t out_row = Parameters->OutputShape[0]; + + // Make sure pad_bottom is consistent with other parameters. + size_t pad_bottom = ((out_row - 1) * stride_h + 3) > (pad_top + H) ? + ((out_row - 1) * stride_h + 3) - (pad_top + H) : 0; + + const float w0 = Filter[pad_left ? 1 : 0]; + const float w1 = Filter[pad_left ? 4 : 3]; + const float w2 = Filter[pad_left ? 7 : 6]; + auto init_v = (beta == 0.f ? 0.f : *Output * beta); + + if (pad_top == 1) { + *Output++ = w1 * Input[0] + w2 * ((H + pad_top <= 2) ? 0.0f : Input[1]) + init_v; + out_row--; + } + + for (const float* row = Input + pad_top * stride_h - pad_top; out_row > pad_bottom; --out_row) { + // All pixels are in the input col + auto init = (beta == 0.f ? 0.f : *Output * beta); + *Output++ = w0 * row[0] + w1 * row[1] + w2 * row[2] + init; + row += stride_h; + } + + if (out_row > 0) { + // last 1 or 2 rows are from the padding zero row. + // out_row == 1 when arrive here + if (pad_bottom == 1) { + const float* row = Input + H - 2; + *Output++ = w0 * row[0] + w1 * row[1] + init_v; + } else { // pad_bottom == 2 and H == 1 and padding_top == 0 + *Output++ = w0 * Input[0] + init_v; + } + } + } + +} + + +void +MlasConvDepthwiseFloat_CHW( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute depthwise convolution for one filter channel on one input channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. + + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +Note: + No checking here as it is inner loop. Logic in generating Parameters controls the check. + + Currently only support 2d kernel 3x3. + Will add general case and more special case if needed later. 
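+
+    As a worked sketch of the intended contract (hypothetical shapes, not
+    taken from this change): a 1 x 1 x 5 x 5 NCHW input with a 3 x 3 filter,
+    pads = {1, 1, 1, 1}, strides = {1, 1} and dilations = {1, 1} yields a
+    5 x 5 output. The prepare step sizes the working buffer as
+    InputShape[1] + 2 zero floats, and this kernel substitutes that buffer
+    for the input rows that the padding places out of range.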
+ +--*/ +{ + MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros); +} + +#if defined(MLAS_USE_ARM_NEON_NCHWC) +// Everything below these are NCHWC related kernels + #include "sconv.h" constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE; From 3a3ccf76783e1b301ead69db91d6a61af80fcc8e Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 07:38:57 -0800 Subject: [PATCH 04/31] Fix builds --- cmake/onnxruntime_mlas.cmake | 1 + onnxruntime/core/mlas/inc/mlas.h | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index e113a9f0cc81c..0ba48c2b094c5 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -113,6 +113,7 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp + ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp ) set(mlas_platform_preprocess_srcs diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index edf2bb7582a90..f59ac92e1a9c2 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -830,7 +830,7 @@ enum MLAS_CONV_ALGORITHM { MlasConvAlgorithmGemmDirect, MlasConvAlgorithmExpandThenGemm, MlasConvAlgorithmExpandThenGemmSegmented, -#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined (MLAS_TARGET_ARM64) MlasConvAlgorithmDepthwise, #endif }; From 212dbf1e9f89b533654f06660cefd5b3db312b1e Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 07:42:49 -0800 Subject: [PATCH 05/31] Fix builds 2 --- onnxruntime/core/mlas/lib/mlasi.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index ad62cccbfb9c7..386570454b2fd 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -1601,7 +1601,8 @@ MlasFp32FromBits( #pragma warning(pop) #endif -#if defined(MLAS_TARGET_WASM_SCALAR) +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) + void MLASCALL From fceae09b1ac6e820c567b89d35478974b137cbcb Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 08:27:52 -0800 Subject: [PATCH 06/31] Threaded --- onnxruntime/core/mlas/lib/convolve.cpp | 105 +++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index fab0f742a125b..f18632829e662 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -805,6 +805,86 @@ Return Value: } } +void +MlasDepthwiseThreaded( + void* Context, + ptrdiff_t Index +) + +/*++ + +Routine Description: + + This routine is invoked from a worker thread to execute a segment of a + convolution operation. + + If using this, the entire convolution operation is parallelized on the + (batch size * group count) parameter and this routine has logic to + perform a specific thread's shard of the entire Convolution operation. + +Arguments: + + Context - Supplies the pointer to the context for the threaded operation. + + Index - Supplies the current index of the threaded operation. + +Return Value: + + None. 
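+
+    As a sketch of the partitioning arithmetic (hypothetical values): with
+    BatchCount * GroupCount = 10 work items and TargetThreadCount = 4, the
+    base share is 10 / 4 = 2 with remainder 10 % 4 = 2, so threads 0..3
+    cover the ranges [0, 3), [3, 6), [6, 8) and [8, 10) respectively.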
+ +--*/ + +{ + + MLAS_CONV_WORK_BLOCK* WorkBlock = (MLAS_CONV_WORK_BLOCK*)Context; + + const MLAS_CONV_PARAMETERS* Parameters = WorkBlock->Parameters; + + const size_t GroupCount = Parameters->GroupCount; + const size_t BatchGroupCount = Parameters->BatchCount * GroupCount; + + const size_t TargetThreadCount = WorkBlock->TargetThreadCount; + + const size_t BatchGroupCountPerThread = BatchGroupCount / TargetThreadCount; + const size_t BatchGroupCountExtra = BatchGroupCount % TargetThreadCount; + + size_t BatchGroupStart; + size_t BatchGroupEnd; + + if (static_cast(Index) < BatchGroupCountExtra) { + BatchGroupStart = (BatchGroupCountPerThread + 1) * Index; + BatchGroupEnd = BatchGroupStart + BatchGroupCountPerThread + 1; + } else { + BatchGroupStart = BatchGroupCountPerThread * Index + BatchGroupCountExtra; + BatchGroupEnd = BatchGroupStart + BatchGroupCountPerThread; + } + + const size_t FilterCount = Parameters->FilterCount; + const size_t OutputSize = Parameters->OutputSize; + const size_t K = Parameters->K; + + const size_t InputGroupSize = Parameters->InputChannels * Parameters->InputSize; + const size_t OutputGroupSize = FilterCount * OutputSize; + const size_t FilterGroupSize = FilterCount * K; + + for (size_t bg = BatchGroupStart; bg < BatchGroupEnd; bg++) { + size_t group = bg % GroupCount; + + const float* input = WorkBlock->Input + bg * InputGroupSize; + const float* filter = WorkBlock->Filter + group * FilterGroupSize; + float* output = WorkBlock->Output + bg * OutputGroupSize; + const float* bias = WorkBlock->Bias; + if (bias != nullptr) { + bias += group * FilterCount; + } + + float* WorkingBuffer = WorkBlock->WorkingBuffer; + + MlasConvDepthwiseFloat_CHW(Parameters, input, filter, output, WorkingBuffer); + MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); + } +} + inline bool MlasConvTryMultithread( @@ -1019,6 +1099,31 @@ Return Value: return; } + if (Algorithm == MlasConvAlgorithmDepthwise && ((BatchCount > 1) || (GroupCount > 1))) { + const size_t BatchGroupCount = BatchCount * GroupCount; + + ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool); + + + if (static_cast(TargetThreadCount) >= BatchGroupCount) { + TargetThreadCount = static_cast(BatchGroupCount); + } + + MLAS_CONV_WORK_BLOCK WorkBlock; + + WorkBlock.Parameters = Parameters; + WorkBlock.Input = Input; + WorkBlock.Filter = Filter; + WorkBlock.Bias = Bias; + WorkBlock.WorkingBuffer = WorkingBuffer; + WorkBlock.Output = Output; + WorkBlock.TargetThreadCount = TargetThreadCount; + + MlasExecuteThreaded(MlasDepthwiseThreaded, &WorkBlock, TargetThreadCount, ThreadPool); + + return; + } + // // Iterate over each batch and group. 
// From 3793d70f9afd953c7b8814d782f7a4968de2f938 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 08:42:38 -0800 Subject: [PATCH 07/31] Fix x64 builds --- onnxruntime/core/mlas/lib/convolve.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index f18632829e662..24c4f9ef8be5d 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -805,6 +805,8 @@ Return Value: } } +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) + void MlasDepthwiseThreaded( void* Context, @@ -885,6 +887,8 @@ Return Value: } } +#endif + inline bool MlasConvTryMultithread( @@ -1099,6 +1103,9 @@ Return Value: return; } + +#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64) + if (Algorithm == MlasConvAlgorithmDepthwise && ((BatchCount > 1) || (GroupCount > 1))) { const size_t BatchGroupCount = BatchCount * GroupCount; @@ -1124,6 +1131,8 @@ Return Value: return; } +#endif + // // Iterate over each batch and group. // From 481a7f668ca11853a126d22aa495e03a8601b5ab Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 18:07:41 -0800 Subject: [PATCH 08/31] Experiment --- onnxruntime/core/mlas/lib/convolve.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 24c4f9ef8be5d..c9916f9ef580b 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -883,7 +883,7 @@ Return Value: float* WorkingBuffer = WorkBlock->WorkingBuffer; MlasConvDepthwiseFloat_CHW(Parameters, input, filter, output, WorkingBuffer); - MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); + //MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); } } From 8993a0a0c0bcfccc4b24dd77322d699828e92549 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Mon, 1 Dec 2025 19:11:29 -0800 Subject: [PATCH 09/31] Experiment revert --- onnxruntime/core/mlas/lib/convolve.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index c9916f9ef580b..24c4f9ef8be5d 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -883,7 +883,7 @@ Return Value: float* WorkingBuffer = WorkBlock->WorkingBuffer; MlasConvDepthwiseFloat_CHW(Parameters, input, filter, output, WorkingBuffer); - //MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); + MlasActivation(Parameters->Activation, output, bias, FilterCount, OutputSize, OutputSize); } } From d765c1a145b1a9b4922c468179dbb79e5ba4c0e0 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Tue, 2 Dec 2025 07:28:42 -0800 Subject: [PATCH 10/31] Refactor --- cmake/onnxruntime_mlas.cmake | 9 +- .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 194 ++++++++++++++++++ ...l_neon.cpp => sconv_nchwc_kernel_neon.cpp} | 185 +---------------- .../{sconv.h => sconv_nchwc_kernel_neon.h} | 0 ...l_neon.cpp => spool_nchwc_kernel_neon.cpp} | 0 5 files changed, 202 insertions(+), 186 deletions(-) create mode 100644 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp rename onnxruntime/core/mlas/lib/{sconv_kernel_neon.cpp => sconv_nchwc_kernel_neon.cpp} (77%) rename onnxruntime/core/mlas/lib/{sconv.h => sconv_nchwc_kernel_neon.h} (100%) rename onnxruntime/core/mlas/lib/{spool_kernel_neon.cpp => 
spool_nchwc_kernel_neon.cpp} (100%) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 0ba48c2b094c5..11c32dac1df04 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -113,7 +113,7 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp - ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp + ${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp ) set(mlas_platform_preprocess_srcs @@ -307,8 +307,9 @@ endfunction() function (setup_arm_neon_nchwc) target_sources(onnxruntime_mlas PRIVATE - ${MLAS_SRC_DIR}/sconv.h - ${MLAS_SRC_DIR}/spool_kernel_neon.cpp + ${MLAS_SRC_DIR}/sconv_nchwc_kernel_neon.h + ${MLAS_SRC_DIR}/sconv_nchwc_kernel_neon.cpp + ${MLAS_SRC_DIR}/spool_nchwc_kernel_neon.cpp ) list(APPEND mlas_private_compile_definitions MLAS_USE_ARM_NEON_NCHWC) set(mlas_private_compile_definitions ${mlas_private_compile_definitions} PARENT_SCOPE) @@ -460,7 +461,7 @@ else() ${MLAS_SRC_DIR}/eltwise_kernel_neon.h ${MLAS_SRC_DIR}/eltwise_kernel_neon.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8_i8mm.cpp - ${MLAS_SRC_DIR}/sconv_kernel_neon.cpp + ${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp ) # Conditionally add the SVE implementation if compiler supports it diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp new file mode 100644 index 0000000000000..906d54fc2d4c5 --- /dev/null +++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp @@ -0,0 +1,194 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + sconv_nchw_kernel_neon.cpp + +Abstract: + + This module implements the single precision NCHW convolution kernels for ARM NEON. + +--*/ + + +#include "mlasi.h" + +static +void +MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute convolution on one channel input with one filter channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. + + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +--*/ +{ + const size_t W = Parameters->InputShape[1]; + const float beta = Parameters->Beta; + + if (W > 1) { + + const float w00 = Filter[0]; + const float w01 = Filter[1]; + const float w02 = Filter[2]; + const float w10 = Filter[3]; + const float w11 = Filter[4]; + const float w12 = Filter[5]; + const float w20 = Filter[6]; + const float w21 = Filter[7]; + const float w22 = Filter[8]; + + const size_t H = Parameters->InputShape[0]; + const size_t pad_top = Parameters->Padding[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t stride_h = Parameters->StrideShape[0]; + const size_t stride_w = Parameters->StrideShape[1]; + + // We treat pad_left, pad_top are hard require. + // While pad_right and pad_bottom could be adjusted if they do not 100% match other parameters. 
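+        // Derivation (sketch): with kernel width 3 and dilation 1 the output
+        // width satisfies
+        //     OutputShape[1] = (pad_left + W + pad_right - 3) / stride_w + 1,
+        // so the rightmost tap falls at (OutputShape[1] - 1) * stride_w + 3
+        // measured from the padded row start; whenever that exceeds
+        // pad_left + W, exactly one implicit zero column is needed on the
+        // right (pad_right is at most 1 under the dispatch checks).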
+ const size_t pad_right = (((Parameters->OutputShape[1] - 1) * stride_w + 3) > (pad_left + W)) ? 1 : 0; + + const float* row0 = (pad_top > 0) ? Zeros : (Input - pad_left); + // Need to handle effective pad_bottom is 2 when H == 1 + const float* row1 = (H + pad_top <= 1) ? Zeros : (Input + (1 - pad_top) * W) - pad_left; + const float* row2 = (H + pad_top <= 2) ? Zeros : (row1 + W); + + for (size_t h = 0, out_row = Parameters->OutputShape[0]; out_row > 0; --out_row) { + auto out_col = Parameters->OutputShape[1]; + + if (pad_left == 1) { + float dotsum = w01 * row0[1] + w02 * row0[2] + w11 * row1[1] + w12 * row1[2] + + w21 * row2[1] + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + out_col--; + row0 += stride_w; + row1 += stride_w; + row2 += stride_w; + } + + for (; out_col > pad_right; out_col--) { + float dotsum = w00 * row0[0] + w01 * row0[1] + w02 * row0[2] + w10 * row1[0] + + w11 * row1[1] + w12 * row1[2] + w20 * row2[0] + w21 * row2[1] + + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + row0 += stride_w; + row1 += stride_w; + row2 += stride_w; + } + + if (out_col == 1) { // pad_right == 1 + float dotsum = w00 * row0[0] + w01 * row0[1] + w10 * row1[0] + w11 * row1[1] + + w20 * row2[0] + w21 * row2[1] + (beta == 0.f ? 0.f : *Output * beta); + *Output++ = dotsum; + } + + h += stride_h; + row0 = (Input + (h - pad_top) * W) - pad_left; + row1 = row0 + W; + row2 = (h + 2 >= H + pad_top) ? Zeros : (row1 + W); + } + + } else { // W == 1 + + const size_t H = Parameters->InputShape[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t pad_top = Parameters->Padding[0]; + const size_t stride_h = Parameters->StrideShape[0]; + size_t out_row = Parameters->OutputShape[0]; + + // Make sure pad_bottom is consistent with other parameters. + size_t pad_bottom = ((out_row - 1) * stride_h + 3) > (pad_top + H) ? + ((out_row - 1) * stride_h + 3) - (pad_top + H) : 0; + + const float w0 = Filter[pad_left ? 1 : 0]; + const float w1 = Filter[pad_left ? 4 : 3]; + const float w2 = Filter[pad_left ? 7 : 6]; + auto init_v = (beta == 0.f ? 0.f : *Output * beta); + + if (pad_top == 1) { + *Output++ = w1 * Input[0] + w2 * ((H + pad_top <= 2) ? 0.0f : Input[1]) + init_v; + out_row--; + } + + for (const float* row = Input + pad_top * stride_h - pad_top; out_row > pad_bottom; --out_row) { + // All pixels are in the input col + auto init = (beta == 0.f ? 0.f : *Output * beta); + *Output++ = w0 * row[0] + w1 * row[1] + w2 * row[2] + init; + row += stride_h; + } + + if (out_row > 0) { + // last 1 or 2 rows are from the padding zero row. + // out_row == 1 when arrive here + if (pad_bottom == 1) { + const float* row = Input + H - 2; + *Output++ = w0 * row[0] + w1 * row[1] + init_v; + } else { // pad_bottom == 2 and H == 1 and padding_top == 0 + *Output++ = w0 * Input[0] + init_v; + } + } + } + +} + + +void +MlasConvDepthwiseFloat_CHW( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute depthwise convolution for one filter channel on one input channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. 
+ + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +Note: + No checking here as it is inner loop. Logic in generating Parameters controls the check. + + Currently only support 2d kernel 3x3. + Will add general case and more special case if needed later. + +--*/ +{ + MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros); +} + diff --git a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp similarity index 77% rename from onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp rename to onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp index 3909495f24743..6f0e6c07eb8c0 100644 --- a/onnxruntime/core/mlas/lib/sconv_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp @@ -6,196 +6,17 @@ Licensed under the MIT License. Module Name: - sconv_kernel_neon.cpp + sconv_nchwc_kernel_neon.cpp Abstract: - This module implements the single precision convolution kernels for ARM NEON. + This module implements the single precision NCHWC convolution kernels for ARM NEON. --*/ - -#include "mlasi.h" - -static -void -MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( - const MLAS_CONV_PARAMETERS* Parameters, - const float* Input, - const float* Filter, - float* Output, - const float* Zeros - ) -/*++ - -Routine Description: - - This routine is an inner kernel to compute convolution on one channel input with one filter channel. - -Arguments: - - Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. - - Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. - - Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. - - Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. - - Zeroes - Point to working buffer where all 0.0f are filled. - ---*/ -{ - const size_t W = Parameters->InputShape[1]; - const float beta = Parameters->Beta; - - if (W > 1) { - - const float w00 = Filter[0]; - const float w01 = Filter[1]; - const float w02 = Filter[2]; - const float w10 = Filter[3]; - const float w11 = Filter[4]; - const float w12 = Filter[5]; - const float w20 = Filter[6]; - const float w21 = Filter[7]; - const float w22 = Filter[8]; - - const size_t H = Parameters->InputShape[0]; - const size_t pad_top = Parameters->Padding[0]; - const size_t pad_left = Parameters->Padding[1]; - const size_t stride_h = Parameters->StrideShape[0]; - const size_t stride_w = Parameters->StrideShape[1]; - - // We treat pad_left, pad_top are hard require. - // While pad_right and pad_bottom could be adjusted if they do not 100% match other parameters. - const size_t pad_right = (((Parameters->OutputShape[1] - 1) * stride_w + 3) > (pad_left + W)) ? 1 : 0; - - const float* row0 = (pad_top > 0) ? Zeros : (Input - pad_left); - // Need to handle effective pad_bottom is 2 when H == 1 - const float* row1 = (H + pad_top <= 1) ? Zeros : (Input + (1 - pad_top) * W) - pad_left; - const float* row2 = (H + pad_top <= 2) ? Zeros : (row1 + W); - - for (size_t h = 0, out_row = Parameters->OutputShape[0]; out_row > 0; --out_row) { - auto out_col = Parameters->OutputShape[1]; - - if (pad_left == 1) { - float dotsum = w01 * row0[1] + w02 * row0[2] + w11 * row1[1] + w12 * row1[2] + - w21 * row2[1] + w22 * row2[2] + (beta == 0.f ? 
0.f : *Output * beta); - *Output++ = dotsum; - out_col--; - row0 += stride_w; - row1 += stride_w; - row2 += stride_w; - } - - for (; out_col > pad_right; out_col--) { - float dotsum = w00 * row0[0] + w01 * row0[1] + w02 * row0[2] + w10 * row1[0] + - w11 * row1[1] + w12 * row1[2] + w20 * row2[0] + w21 * row2[1] + - w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - row0 += stride_w; - row1 += stride_w; - row2 += stride_w; - } - - if (out_col == 1) { // pad_right == 1 - float dotsum = w00 * row0[0] + w01 * row0[1] + w10 * row1[0] + w11 * row1[1] + - w20 * row2[0] + w21 * row2[1] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - } - - h += stride_h; - row0 = (Input + (h - pad_top) * W) - pad_left; - row1 = row0 + W; - row2 = (h + 2 >= H + pad_top) ? Zeros : (row1 + W); - } - - } else { // W == 1 - - const size_t H = Parameters->InputShape[0]; - const size_t pad_left = Parameters->Padding[1]; - const size_t pad_top = Parameters->Padding[0]; - const size_t stride_h = Parameters->StrideShape[0]; - size_t out_row = Parameters->OutputShape[0]; - - // Make sure pad_bottom is consistent with other parameters. - size_t pad_bottom = ((out_row - 1) * stride_h + 3) > (pad_top + H) ? - ((out_row - 1) * stride_h + 3) - (pad_top + H) : 0; - - const float w0 = Filter[pad_left ? 1 : 0]; - const float w1 = Filter[pad_left ? 4 : 3]; - const float w2 = Filter[pad_left ? 7 : 6]; - auto init_v = (beta == 0.f ? 0.f : *Output * beta); - - if (pad_top == 1) { - *Output++ = w1 * Input[0] + w2 * ((H + pad_top <= 2) ? 0.0f : Input[1]) + init_v; - out_row--; - } - - for (const float* row = Input + pad_top * stride_h - pad_top; out_row > pad_bottom; --out_row) { - // All pixels are in the input col - auto init = (beta == 0.f ? 0.f : *Output * beta); - *Output++ = w0 * row[0] + w1 * row[1] + w2 * row[2] + init; - row += stride_h; - } - - if (out_row > 0) { - // last 1 or 2 rows are from the padding zero row. - // out_row == 1 when arrive here - if (pad_bottom == 1) { - const float* row = Input + H - 2; - *Output++ = w0 * row[0] + w1 * row[1] + init_v; - } else { // pad_bottom == 2 and H == 1 and padding_top == 0 - *Output++ = w0 * Input[0] + init_v; - } - } - } - -} - - -void -MlasConvDepthwiseFloat_CHW( - const MLAS_CONV_PARAMETERS* Parameters, - const float* Input, - const float* Filter, - float* Output, - const float* Zeros - ) -/*++ - -Routine Description: - - This routine is an inner kernel to compute depthwise convolution for one filter channel on one input channel. - -Arguments: - - Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. - - Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. - - Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. - - Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. - - Zeroes - Point to working buffer where all 0.0f are filled. - -Note: - No checking here as it is inner loop. Logic in generating Parameters controls the check. - - Currently only support 2d kernel 3x3. - Will add general case and more special case if needed later. 
- ---*/ -{ - MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros); -} - #if defined(MLAS_USE_ARM_NEON_NCHWC) -// Everything below these are NCHWC related kernels -#include "sconv.h" +#include "sconv_nchwc_kernel_neon.h" constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE; diff --git a/onnxruntime/core/mlas/lib/sconv.h b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.h similarity index 100% rename from onnxruntime/core/mlas/lib/sconv.h rename to onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.h diff --git a/onnxruntime/core/mlas/lib/spool_kernel_neon.cpp b/onnxruntime/core/mlas/lib/spool_nchwc_kernel_neon.cpp similarity index 100% rename from onnxruntime/core/mlas/lib/spool_kernel_neon.cpp rename to onnxruntime/core/mlas/lib/spool_nchwc_kernel_neon.cpp From a428d509675c34f7aa0156a5d4dd281f35c35723 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Tue, 2 Dec 2025 07:30:51 -0800 Subject: [PATCH 11/31] More changes --- onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp index 6f0e6c07eb8c0..a387c6a07992a 100644 --- a/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchwc_kernel_neon.cpp @@ -16,6 +16,7 @@ Module Name: #if defined(MLAS_USE_ARM_NEON_NCHWC) +#include "mlasi.h" #include "sconv_nchwc_kernel_neon.h" constexpr size_t BlockSize = MLAS_PLATFORM::MLAS_NEON_NCHWC_BLOCK_SIZE; From d53dd15b58ca18afdb5f14f7ba04c18a257a1d12 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Tue, 2 Dec 2025 21:33:21 -0800 Subject: [PATCH 12/31] a --- onnxruntime/core/mlas/lib/convolve.cpp | 4 +- .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 363 ++++++++++++------ 2 files changed, 257 insertions(+), 110 deletions(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 24c4f9ef8be5d..098b800b81353 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1460,8 +1460,8 @@ Return Value: if (Dimensions == 2 && Parameters->FilterCount == 1 && Parameters->InputChannels == 1 && Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3 - && Parameters->Padding[0] <= 1 && Parameters->Padding[1] <= 1 - && Parameters->Padding[2] <= 1 && Parameters->Padding[3] <= 1 + && Parameters->Padding[0] <= 0 && Parameters->Padding[1] <= 0 + && Parameters->Padding[2] <= 0 && Parameters->Padding[3] <= 0 && Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) { *WorkingBufferSize = Parameters->InputShape[1] + 2; diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp index 906d54fc2d4c5..c559399cdcfd3 100644 --- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp @@ -16,10 +16,251 @@ Module Name: #include "mlasi.h" +#include -static -void -MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( +#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) +#define C10_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) +#else +#define C10_LIKELY(expr) (expr) +#endif + + +#if defined(_MSC_VER) && defined(MLAS_TARGET_ARM64) +MLAS_FORCEINLINE float32x4_t operator+(float32x4_t lhs, float32x4_t rhs) +{ + return vaddq_f32(lhs, rhs); +} + +MLAS_FORCEINLINE float32x4_t operator-(float32x4_t lhs, float32x4_t rhs) +{ + return 
vsubq_f32(lhs, rhs); +} + +MLAS_FORCEINLINE float32x4_t operator*(float32x4_t lhs, float32x4_t rhs) +{ + return vmulq_f32(lhs, rhs); +} + +MLAS_FORCEINLINE float32x4_t operator-(float32x4_t value) +{ + return vnegq_f32(value); +} +#endif + +struct Arguments final { + // Input layer dimensions + int64_t in_rows; + int64_t in_cols; + + // Output layer dimensions + int64_t out_rows; + int64_t out_cols; + + // Padding info + int64_t pad_rows; + int64_t pad_cols; +}; + +inline void winograd_f2k3_input_transform_inplace__neon( + float32x4_t* const d0, + float32x4_t* const d1, + float32x4_t* const d2, + float32x4_t* const d3 +) +{ + const float32x4_t wd0 = *d0 - *d2; + const float32x4_t wd1 = *d1 + *d2; + const float32x4_t wd2 = -*d1 + *d2; + const float32x4_t wd3 = *d1 - *d3; + *d0 = wd0; + *d1 = wd1; + *d2 = wd2; + *d3 = wd3; +} + +inline void winograd_f2k3_output_transform_inplace__neon( + float32x4_t* const m0, + float32x4_t* const m1, + const float32x4_t* const m2, + const float32x4_t* const m3 +) +{ + *m0 = *m0 + *m1 + *m2; + *m1 = *m1 - *m2 - *m3; +} + +inline float32x4_t vmuladdq_f32(const float32x4_t c, const float32x4_t a, const float32x4_t b) +{ + return vfmaq_f32(c, a, b); + // TODO: Support ARMv7 +} + +inline float32x4_t vmulsubq_f32(const float32x4_t c, const float32x4_t a, const float32x4_t b) +{ + return vfmsq_f32(c, a, b); + // TODO: Support ARMv7 +} + +inline void winograd_f2k3_kernel_transform__neon( + const float32x4_t g0, + const float32x4_t g1, + const float32x4_t g2, + float32x4_t* const transform0, + float32x4_t* const transform1, + float32x4_t* const transform2, + float32x4_t* const transform3 +) +{ + const float32x4_t const_half = vdupq_n_f32(0.5f); + float32x4_t half_g0_plus_g2 = const_half * (g0 + g2); + *transform0 = g0; + *transform1 = vmuladdq_f32(half_g0_plus_g2, const_half, g1); + *transform2 = vmulsubq_f32(half_g0_plus_g2, const_half, g1); + *transform3 = g2; +} + +inline float32x4x4_t v4f_transpose4x4__neon(const float32x4x4_t m) +{ + float32x4x4_t ret; + vst4q_f32((float*)(&ret), m); + return ret; +} + +void convolution_depthwise3x3_winograd_impl( + const Arguments& args, + const float* const input, + const float* const kernel, + float* const output +) +{ + //const float32x4_t vbias = vsetq_lane_f32(*bias, vdupq_n_f32(0.0), 1); + float32x4x4_t kernel_tile; + + { + const float32x4_t g0 = vld1q_f32(kernel); + const float32x4_t g1 = vld1q_f32(kernel + 3); + // g2[3] is junk + const float32x4_t g2 = + vextq_f32(vld1q_f32(kernel + 5), vld1q_f32(kernel + 5), 1); + float32x4x4_t w; + winograd_f2k3_kernel_transform__neon( + g0, g1, g2, &w.val[0], &w.val[1], &w.val[2], &w.val[3] + ); + w = v4f_transpose4x4__neon(w); + + winograd_f2k3_kernel_transform__neon( + w.val[0], + w.val[1], + w.val[2], + &kernel_tile.val[0], + &kernel_tile.val[1], + &kernel_tile.val[2], + &kernel_tile.val[3] + ); + } + + #define TILE \ + winograd_f2k3_input_transform_inplace__neon( \ + &input_tile.val[0], \ + &input_tile.val[1], \ + &input_tile.val[2], \ + &input_tile.val[3] \ + ); \ + input_tile = v4f_transpose4x4__neon(input_tile); \ + winograd_f2k3_input_transform_inplace__neon( \ + &input_tile.val[0], \ + &input_tile.val[1], \ + &input_tile.val[2], \ + &input_tile.val[3] \ + ); \ + \ + for (size_t row = 0; row < 4; ++row) { \ + input_tile.val[row] = \ + vmulq_f32(input_tile.val[row], kernel_tile.val[row]); \ + } \ + \ + winograd_f2k3_output_transform_inplace__neon( \ + &input_tile.val[0], \ + &input_tile.val[1], \ + &input_tile.val[2], \ + &input_tile.val[3] \ + ); \ + input_tile = 
v4f_transpose4x4__neon(input_tile); \ + winograd_f2k3_output_transform_inplace__neon( \ + &input_tile.val[0], \ + &input_tile.val[1], \ + &input_tile.val[2], \ + &input_tile.val[3] \ + ) + + // Non-padded regime. + + // Iterate over non-padded output tiles. + // TODO: avoid spilling W by breaking out the non-padded vs padded case. + for (int64_t oth = 0; oth < (args.out_rows + 1) / 2; ++oth) { + for (int64_t otw = 0; otw < (args.out_cols + 1) / 2; ++otw) { + // load input tile for [oth, otw]; + int64_t ih = oth * 2 - args.pad_rows; + int64_t iw = otw * 2 - args.pad_cols; + // fast-path, all accesses in-bounds + if (C10_LIKELY( + ih >= 0 && iw >= 0 && ih + 3 < args.in_rows && + iw + 3 < args.in_cols && 2 * oth + 1 < args.out_rows && + 2 * otw + 1 < args.out_cols + )) { + float32x4x4_t input_tile; + for (int64_t row = 0; row < 4; ++row) { + input_tile.val[row] = + vld1q_f32(input + (ih + row) * args.in_cols + iw); + } + + TILE; + + for (int64_t row = 0; row < 2; ++row) { + vst1_f32( + output + (oth * 2 + row) * args.out_cols + otw * 2, + vget_low_f32(input_tile.val[row]) + ); + } + } else { + float block[4][4]; + for (int64_t row = 0; row < 4; ++row) { + for (int64_t col = 0; col < 4; ++col) { + if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows && + iw + col < args.in_cols) { + block[row][col] = input[(ih + row) * args.in_cols + iw + col]; + } else { + block[row][col] = 0.0; + } + } + } + + float32x4x4_t input_tile; + for (int64_t row = 0; row < 4; ++row) { + input_tile.val[row] = vld1q_f32(&block[row][0]); + } + + TILE; + + float oblock[2][2]; + for (int64_t row = 0; row < 2; ++row) { + vst1_f32(&oblock[row][0], vget_low_f32(input_tile.val[row])); + } + for (int64_t row = 0; row < 2; ++row) { + for (int64_t col = 0; col < 2; ++col) { + if (2 * oth + row < args.out_rows && + 2 * otw + col < args.out_cols) { + output[(2 * oth + row) * args.out_cols + 2 * otw + col] = + oblock[row][col]; + } + } + } + } + } + } +} + + static void MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( const MLAS_CONV_PARAMETERS* Parameters, const float* Input, const float* Filter, @@ -46,117 +287,23 @@ Routine Description: --*/ { - const size_t W = Parameters->InputShape[1]; - const float beta = Parameters->Beta; - - if (W > 1) { - - const float w00 = Filter[0]; - const float w01 = Filter[1]; - const float w02 = Filter[2]; - const float w10 = Filter[3]; - const float w11 = Filter[4]; - const float w12 = Filter[5]; - const float w20 = Filter[6]; - const float w21 = Filter[7]; - const float w22 = Filter[8]; - - const size_t H = Parameters->InputShape[0]; - const size_t pad_top = Parameters->Padding[0]; - const size_t pad_left = Parameters->Padding[1]; - const size_t stride_h = Parameters->StrideShape[0]; - const size_t stride_w = Parameters->StrideShape[1]; - - // We treat pad_left, pad_top are hard require. - // While pad_right and pad_bottom could be adjusted if they do not 100% match other parameters. - const size_t pad_right = (((Parameters->OutputShape[1] - 1) * stride_w + 3) > (pad_left + W)) ? 1 : 0; - - const float* row0 = (pad_top > 0) ? Zeros : (Input - pad_left); - // Need to handle effective pad_bottom is 2 when H == 1 - const float* row1 = (H + pad_top <= 1) ? Zeros : (Input + (1 - pad_top) * W) - pad_left; - const float* row2 = (H + pad_top <= 2) ? 
Zeros : (row1 + W); - - for (size_t h = 0, out_row = Parameters->OutputShape[0]; out_row > 0; --out_row) { - auto out_col = Parameters->OutputShape[1]; - - if (pad_left == 1) { - float dotsum = w01 * row0[1] + w02 * row0[2] + w11 * row1[1] + w12 * row1[2] + - w21 * row2[1] + w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - out_col--; - row0 += stride_w; - row1 += stride_w; - row2 += stride_w; - } - - for (; out_col > pad_right; out_col--) { - float dotsum = w00 * row0[0] + w01 * row0[1] + w02 * row0[2] + w10 * row1[0] + - w11 * row1[1] + w12 * row1[2] + w20 * row2[0] + w21 * row2[1] + - w22 * row2[2] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - row0 += stride_w; - row1 += stride_w; - row2 += stride_w; - } + MLAS_UNREFERENCED_PARAMETER(Zeros); - if (out_col == 1) { // pad_right == 1 - float dotsum = w00 * row0[0] + w01 * row0[1] + w10 * row1[0] + w11 * row1[1] + - w20 * row2[0] + w21 * row2[1] + (beta == 0.f ? 0.f : *Output * beta); - *Output++ = dotsum; - } - - h += stride_h; - row0 = (Input + (h - pad_top) * W) - pad_left; - row1 = row0 + W; - row2 = (h + 2 >= H + pad_top) ? Zeros : (row1 + W); - } - - } else { // W == 1 - - const size_t H = Parameters->InputShape[0]; - const size_t pad_left = Parameters->Padding[1]; - const size_t pad_top = Parameters->Padding[0]; - const size_t stride_h = Parameters->StrideShape[0]; - size_t out_row = Parameters->OutputShape[0]; - - // Make sure pad_bottom is consistent with other parameters. - size_t pad_bottom = ((out_row - 1) * stride_h + 3) > (pad_top + H) ? - ((out_row - 1) * stride_h + 3) - (pad_top + H) : 0; - - const float w0 = Filter[pad_left ? 1 : 0]; - const float w1 = Filter[pad_left ? 4 : 3]; - const float w2 = Filter[pad_left ? 7 : 6]; - auto init_v = (beta == 0.f ? 0.f : *Output * beta); + Arguments args; + args.in_rows = Parameters->InputShape[0]; + args.in_cols = Parameters->InputShape[1]; + + args.out_rows = Parameters->OutputShape[0]; + args.out_cols = Parameters->OutputShape[1]; - if (pad_top == 1) { - *Output++ = w1 * Input[0] + w2 * ((H + pad_top <= 2) ? 0.0f : Input[1]) + init_v; - out_row--; - } - - for (const float* row = Input + pad_top * stride_h - pad_top; out_row > pad_bottom; --out_row) { - // All pixels are in the input col - auto init = (beta == 0.f ? 0.f : *Output * beta); - *Output++ = w0 * row[0] + w1 * row[1] + w2 * row[2] + init; - row += stride_h; - } - - if (out_row > 0) { - // last 1 or 2 rows are from the padding zero row. 
- // out_row == 1 when arrive here - if (pad_bottom == 1) { - const float* row = Input + H - 2; - *Output++ = w0 * row[0] + w1 * row[1] + init_v; - } else { // pad_bottom == 2 and H == 1 and padding_top == 0 - *Output++ = w0 * Input[0] + init_v; - } - } - } + args.pad_rows = Parameters->Padding[0]; + args.pad_cols = Parameters->Padding[1]; + convolution_depthwise3x3_winograd_impl(args, Input, Filter, Output); } -void -MlasConvDepthwiseFloat_CHW( +void MlasConvDepthwiseFloat_CHW( const MLAS_CONV_PARAMETERS* Parameters, const float* Input, const float* Filter, From 8f12c51ce910e95fac66e6e1c0dda3ee0bcf2bc1 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Tue, 2 Dec 2025 23:56:58 -0800 Subject: [PATCH 13/31] Try --- onnxruntime/test/optimizer/conv_add_act_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc index f61f9b29d9cce..a557f7a15be2b 100644 --- a/onnxruntime/test/optimizer/conv_add_act_test.cc +++ b/onnxruntime/test/optimizer/conv_add_act_test.cc @@ -18,8 +18,8 @@ void TestConvPath(const std::vector& input_shape, const std::vector(input_shape, -31, 31); auto* output_arg = builder.MakeOutput(); - auto* bias_arg = builder.MakeInitializer({weights_shape[0]}, -20.f, 20.f); - auto* add_arg = builder.MakeInput(output_shape, -20.f, 20.f); + auto* bias_arg = builder.MakeInitializer({weights_shape[0]}, 0.f, 0.f); + auto* add_arg = builder.MakeInput(output_shape, 0.f, 0.f); auto* weight_arg = builder.MakeInitializer(weights_shape, -2.f, 2.f); auto* conv_out_arg = builder.MakeIntermediate(); From 67b680130bb1ef43fda7026b6a4ed624a8292edc Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Wed, 3 Dec 2025 05:42:05 -0800 Subject: [PATCH 14/31] More changes --- onnxruntime/core/mlas/lib/convolve.cpp | 3 +++ onnxruntime/test/optimizer/conv_add_act_test.cc | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 098b800b81353..628a99a424384 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1462,6 +1462,9 @@ Return Value: && Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3 && Parameters->Padding[0] <= 0 && Parameters->Padding[1] <= 0 && Parameters->Padding[2] <= 0 && Parameters->Padding[3] <= 0 +#if defined(MLAS_TARGET_ARM64) + && Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1 +#endif && Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) { *WorkingBufferSize = Parameters->InputShape[1] + 2; diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc index a557f7a15be2b..1797ede3d24c5 100644 --- a/onnxruntime/test/optimizer/conv_add_act_test.cc +++ b/onnxruntime/test/optimizer/conv_add_act_test.cc @@ -18,7 +18,7 @@ void TestConvPath(const std::vector& input_shape, const std::vector(input_shape, -31, 31); auto* output_arg = builder.MakeOutput(); - auto* bias_arg = builder.MakeInitializer({weights_shape[0]}, 0.f, 0.f); + auto* bias_arg = builder.MakeInitializer({weights_shape[0]}, -20.f, 20.f); auto* add_arg = builder.MakeInput(output_shape, 0.f, 0.f); auto* weight_arg = builder.MakeInitializer(weights_shape, -2.f, 2.f); auto* conv_out_arg = builder.MakeIntermediate(); From 01b43fb68a0c1740577a8693c55d606bb4f71e23 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Wed, 3 Dec 2025 19:55:15 -0800 Subject: [PATCH 15/31] Relax padding 
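
Restores the pads <= 1 acceptance that PATCH 12 tightened to pads == 0: the
padded output tiles are handled by the bounds-checked block fallback in
convolution_depthwise3x3_winograd_impl, which zero-fills out-of-range taps,
so only the ARM64 stride == 1 restriction from the previous patch needs to
stay.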
--- onnxruntime/core/mlas/lib/convolve.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp index 628a99a424384..7cbc7de86238a 100644 --- a/onnxruntime/core/mlas/lib/convolve.cpp +++ b/onnxruntime/core/mlas/lib/convolve.cpp @@ -1460,8 +1460,8 @@ Return Value: if (Dimensions == 2 && Parameters->FilterCount == 1 && Parameters->InputChannels == 1 && Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3 - && Parameters->Padding[0] <= 0 && Parameters->Padding[1] <= 0 - && Parameters->Padding[2] <= 0 && Parameters->Padding[3] <= 0 + && Parameters->Padding[0] <= 1 && Parameters->Padding[1] <= 1 + && Parameters->Padding[2] <= 1 && Parameters->Padding[3] <= 1 #if defined(MLAS_TARGET_ARM64) && Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1 #endif From ea8339475baad05ef3a51059a501a94df6b628e3 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Thu, 4 Dec 2025 00:48:10 -0800 Subject: [PATCH 16/31] Vanilla NEON Depthwise --- .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 255 +++++++++++++++++- 1 file changed, 251 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp index c559399cdcfd3..d147ea6276566 100644 --- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp @@ -260,6 +260,222 @@ void convolution_depthwise3x3_winograd_impl( } } +MLAS_FORCEINLINE float DepthwiseSampleValue( + const float* row, + ptrdiff_t col, + size_t width +) +{ + if (row == nullptr || col < 0 || col >= static_cast(width)) { + return 0.0f; + } + return row[col]; +} + +MLAS_FORCEINLINE float DepthwiseAccumulateRowScalar( + float acc, + const float* row, + size_t base, + float w0, + float w1, + float w2 +) +{ + if (row == nullptr) { + return acc; + } + + acc += row[base] * w0; + acc += row[base + 1] * w1; + acc += row[base + 2] * w2; + return acc; +} + +MLAS_FORCEINLINE void DepthwiseAccumulateRowVector( + float32x4_t& acc, + const float* row, + size_t base, + float w0, + float w1, + float w2 +) +{ + if (row == nullptr) { + return; + } + + const float* r = row + base; + const float32x4_t c0 = vld1q_f32(r); + const float32x4_t c1 = vld1q_f32(r + 1); + const float32x4_t c2 = vld1q_f32(r + 2); + + acc = vmlaq_n_f32(acc, c0, w0); + acc = vmlaq_n_f32(acc, c1, w1); + acc = vmlaq_n_f32(acc, c2, w2); +} + +MLAS_FORCEINLINE float DepthwiseComputeEdge( + const float* row0, + const float* row1, + const float* row2, + ptrdiff_t iw, + size_t width, + const float w00, + const float w01, + const float w02, + const float w10, + const float w11, + const float w12, + const float w20, + const float w21, + const float w22 +) +{ + float acc = 0.0f; + const ptrdiff_t c0 = iw; + const ptrdiff_t c1 = iw + 1; + const ptrdiff_t c2 = iw + 2; + + acc += DepthwiseSampleValue(row0, c0, width) * w00; + acc += DepthwiseSampleValue(row0, c1, width) * w01; + acc += DepthwiseSampleValue(row0, c2, width) * w02; + acc += DepthwiseSampleValue(row1, c0, width) * w10; + acc += DepthwiseSampleValue(row1, c1, width) * w11; + acc += DepthwiseSampleValue(row1, c2, width) * w12; + acc += DepthwiseSampleValue(row2, c0, width) * w20; + acc += DepthwiseSampleValue(row2, c1, width) * w21; + acc += DepthwiseSampleValue(row2, c2, width) * w22; + + return acc; +} + +static void DepthwiseConv3x3Stride1PadLe1Neon( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, 
+ float* Output, + const float* Zeros +) +{ + MLAS_UNREFERENCED_PARAMETER(Zeros); + + const size_t H = Parameters->InputShape[0]; + const size_t W = Parameters->InputShape[1]; + const size_t out_rows = Parameters->OutputShape[0]; + const size_t out_cols = Parameters->OutputShape[1]; + + const size_t pad_top = Parameters->Padding[0]; + const size_t pad_left = Parameters->Padding[1]; + const size_t pad_right = Parameters->Padding[3]; + + const float beta = Parameters->Beta; + const bool accumulate_output = beta != 0.0f; + + const float w00 = Filter[0]; + const float w01 = Filter[1]; + const float w02 = Filter[2]; + const float w10 = Filter[3]; + const float w11 = Filter[4]; + const float w12 = Filter[5]; + const float w20 = Filter[6]; + const float w21 = Filter[7]; + const float w22 = Filter[8]; + + for (size_t oh = 0; oh < out_rows; ++oh) { + const ptrdiff_t ih = static_cast(oh) - static_cast(pad_top); + + const ptrdiff_t row0_index = ih - 1; + const ptrdiff_t row1_index = ih; + const ptrdiff_t row2_index = ih + 1; + + const float* row0 = nullptr; + const float* row1 = nullptr; + const float* row2 = nullptr; + + if (row0_index >= 0 && row0_index < static_cast(H)) { + row0 = Input + static_cast(row0_index) * W; + } + if (row1_index >= 0 && row1_index < static_cast(H)) { + row1 = Input + static_cast(row1_index) * W; + } + if (row2_index >= 0 && row2_index < static_cast(H)) { + row2 = Input + static_cast(row2_index) * W; + } + + float* out_row = Output + oh * out_cols; + size_t ow = 0; + + if (pad_left && ow < out_cols) { + const ptrdiff_t iw = static_cast(ow) - static_cast(pad_left); + float acc = DepthwiseComputeEdge( + row0, row1, row2, iw, W, + w00, w01, w02, w10, w11, w12, w20, w21, w22 + ); + if (accumulate_output) { + acc += beta * out_row[ow]; + } + out_row[ow++] = acc; + } + + size_t interior_cols = 0; + if (out_cols > pad_left + pad_right) { + interior_cols = out_cols - pad_left - pad_right; + } + + size_t processed = 0; + while (processed + 4 <= interior_cols) { + const ptrdiff_t iw = static_cast(ow) - static_cast(pad_left); + if ((iw + 5) >= static_cast(W)) { + break; + } + + const size_t base = static_cast(iw); + float32x4_t acc = vdupq_n_f32(0.0f); + + DepthwiseAccumulateRowVector(acc, row0, base, w00, w01, w02); + DepthwiseAccumulateRowVector(acc, row1, base, w10, w11, w12); + DepthwiseAccumulateRowVector(acc, row2, base, w20, w21, w22); + + if (accumulate_output) { + const float32x4_t prev = vld1q_f32(out_row + ow); + acc = vmlaq_n_f32(acc, prev, beta); + } + + vst1q_f32(out_row + ow, acc); + ow += 4; + processed += 4; + } + + for (; processed < interior_cols; ++processed) { + const ptrdiff_t iw = static_cast(ow) - static_cast(pad_left); + const size_t base = static_cast(iw); + + float acc = 0.0f; + acc = DepthwiseAccumulateRowScalar(acc, row0, base, w00, w01, w02); + acc = DepthwiseAccumulateRowScalar(acc, row1, base, w10, w11, w12); + acc = DepthwiseAccumulateRowScalar(acc, row2, base, w20, w21, w22); + + if (accumulate_output) { + acc += beta * out_row[ow]; + } + out_row[ow++] = acc; + } + + if (pad_right && ow < out_cols) { + const ptrdiff_t iw = static_cast(ow) - static_cast(pad_left); + float acc = DepthwiseComputeEdge( + row0, row1, row2, iw, W, + w00, w01, w02, w10, w11, w12, w20, w21, w22 + ); + if (accumulate_output) { + acc += beta * out_row[ow]; + } + out_row[ow++] = acc; + } + } +} + +/* static void MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( const MLAS_CONV_PARAMETERS* Parameters, const float* Input, @@ -267,7 +483,7 @@ void 
convolution_depthwise3x3_winograd_impl( float* Output, const float* Zeros ) -/*++ +++ Routine Description: @@ -285,14 +501,14 @@ Routine Description: Zeroes - Point to working buffer where all 0.0f are filled. ---*/ +-- { MLAS_UNREFERENCED_PARAMETER(Zeros); Arguments args; args.in_rows = Parameters->InputShape[0]; args.in_cols = Parameters->InputShape[1]; - + args.out_rows = Parameters->OutputShape[0]; args.out_cols = Parameters->OutputShape[1]; @@ -301,7 +517,39 @@ Routine Description: convolution_depthwise3x3_winograd_impl(args, Input, Filter, Output); } +*/ +static +void +MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1( + const MLAS_CONV_PARAMETERS* Parameters, + const float* Input, + const float* Filter, + float* Output, + const float* Zeros + ) +/*++ + +Routine Description: + + This routine is an inner kernel to compute convolution on one channel input with one filter channel. + +Arguments: + + Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc. + + Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data. + + Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data. + + Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data. + + Zeroes - Point to working buffer where all 0.0f are filled. + +--*/ +{ + DepthwiseConv3x3Stride1PadLe1Neon(Parameters, Input, Filter, Output, Zeros); +} void MlasConvDepthwiseFloat_CHW( const MLAS_CONV_PARAMETERS* Parameters, @@ -338,4 +586,3 @@ Routine Description: { MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros); } - From dd94a3b0883f584d9b6b02ea3eeda46b9ddfa377 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Thu, 4 Dec 2025 02:23:58 -0800 Subject: [PATCH 17/31] Fix indexing --- onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp index d147ea6276566..02c34bf83990c 100644 --- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp @@ -384,9 +384,9 @@ static void DepthwiseConv3x3Stride1PadLe1Neon( for (size_t oh = 0; oh < out_rows; ++oh) { const ptrdiff_t ih = static_cast(oh) - static_cast(pad_top); - const ptrdiff_t row0_index = ih - 1; - const ptrdiff_t row1_index = ih; - const ptrdiff_t row2_index = ih + 1; + const ptrdiff_t row0_index = ih; + const ptrdiff_t row1_index = ih + 1; + const ptrdiff_t row2_index = ih + 2; const float* row0 = nullptr; const float* row1 = nullptr; From ffd291adfe1b25464416ebc24cae97aaf33e50c9 Mon Sep 17 00:00:00 2001 From: Hari Seshadri Date: Thu, 4 Dec 2025 06:54:23 -0800 Subject: [PATCH 18/31] Add benchmark --- onnxruntime/test/mlas/bench/bench_sconv.cpp | 5 +++++ onnxruntime/test/optimizer/conv_add_act_test.cc | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp index dc37980002978..2a41a6a6667b2 100644 --- a/onnxruntime/test/mlas/bench/bench_sconv.cpp +++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp @@ -326,6 +326,11 @@ static void TeamsModel(benchmark::internal::Benchmark* b) { b->Args({2, 1, 1, 12, 12, 48, 80, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}); // fused Conv_376 => 48x80 b->Args({2, 1, 1, 12, 72, 48, 80, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}); // Conv_59 => 24x40 
+
+  b->Args({2, 1, 256, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+  b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+  b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+
 }
 
 BENCHMARK_CAPTURE(SCONV_NCHW, TeamsModel, "")->Apply(TeamsModel)->UseRealTime();
 
diff --git a/onnxruntime/test/optimizer/conv_add_act_test.cc b/onnxruntime/test/optimizer/conv_add_act_test.cc
index 1797ede3d24c5..f61f9b29d9cce 100644
--- a/onnxruntime/test/optimizer/conv_add_act_test.cc
+++ b/onnxruntime/test/optimizer/conv_add_act_test.cc
@@ -19,7 +19,7 @@ void TestConvPath(const std::vector<int64_t>& input_shape, const std::vector<int64_t>& weights_shape,
   auto* input_arg = builder.MakeInput<float>(input_shape, -31, 31);
   auto* output_arg = builder.MakeOutput();
   auto* bias_arg = builder.MakeInitializer<float>({weights_shape[0]}, -20.f, 20.f);
-  auto* add_arg = builder.MakeInput<float>(output_shape, 0.f, 0.f);
+  auto* add_arg = builder.MakeInput<float>(output_shape, -20.f, 20.f);
   auto* weight_arg = builder.MakeInitializer<float>(weights_shape, -2.f, 2.f);
   auto* conv_out_arg = builder.MakeIntermediate();

From 92fb60450745beb41690a738460f1303775e654b Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 07:03:53 -0800
Subject: [PATCH 19/31] Add lambda

---
 onnxruntime/core/mlas/lib/convolve.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp
index 7cbc7de86238a..288109bd8e79f 100644
--- a/onnxruntime/core/mlas/lib/convolve.cpp
+++ b/onnxruntime/core/mlas/lib/convolve.cpp
@@ -1454,17 +1454,25 @@ Return Value:
 #if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
 
         // Scalar (WASM_SCALAR) / vectorized (ARM64) direct conv for depthwise convolution.
-        // Currently only support 3x3 kernel with padding <=1 and dilations = 1.
+        // Currently only support 3x3 kernel with padding <=1 and dilations = 1
+        // and on ARM64, it is further restricted to strides = 1.
         // TODO: support more general depthwise convolution.
 
+        bool depthwise_conv_stride_support_check = [&]() {
+            // On ARM64, only support stride = 1 for depthwise conv.
+#if defined(MLAS_TARGET_ARM64)
+            return Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1;
+#else
+            return true;
+#endif
+        }();
+
         if (Dimensions == 2
             && Parameters->FilterCount == 1 && Parameters->InputChannels == 1
             && Parameters->KernelShape[0] == 3 && Parameters->KernelShape[1] == 3
             && Parameters->Padding[0] <= 1 && Parameters->Padding[1] <= 1
             && Parameters->Padding[2] <= 1 && Parameters->Padding[3] <= 1
-#if defined(MLAS_TARGET_ARM64)
-            && Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1
-#endif
+            && depthwise_conv_stride_support_check
             && Parameters->DilationShape[0] == 1 && Parameters->DilationShape[1] == 1) {
 
             *WorkingBufferSize = Parameters->InputShape[1] + 2;

From d15bb9303eef43b06276acac334c8204919c8a58 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 07:06:22 -0800
Subject: [PATCH 20/31] Rework

---
 .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index 02c34bf83990c..c70ae134d6e42 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -357,8 +357,6 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
     const float* Zeros
     )
 {
-    MLAS_UNREFERENCED_PARAMETER(Zeros);
-
     const size_t H = Parameters->InputShape[0];
     const size_t W = Parameters->InputShape[1];
     const size_t out_rows = Parameters->OutputShape[0];
@@ -525,8 +523,7 @@ MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(
     const MLAS_CONV_PARAMETERS* Parameters,
     const float* Input,
     const float* Filter,
-    float* Output,
-    const float* Zeros
+    float* Output
     )
 /*++
 
@@ -544,11 +541,9 @@ Routine Description:
 
     Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
 
-    Zeroes - Point to working buffer where all 0.0f are filled.
-
 --*/
 {
-    DepthwiseConv3x3Stride1PadLe1Neon(Parameters, Input, Filter, Output, Zeros);
+    DepthwiseConv3x3Stride1PadLe1Neon(Parameters, Input, Filter, Output);
 }
 
 void
 MlasConvDepthwiseFloat_CHW(
 
     Note: No checking here as it is inner loop. Logic in generating Parameters controls the check.
 
-    Currently only support 2d kernel 3x3.
+    Currently only support 2d kernel 3x3 with strides=1, dilations=1, pads<=1.
     Will add general case and more special case if needed later.
 --*/
 {
-    MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output, Zeros);
+    MLAS_UNREFERENCED_PARAMETER(Zeros);
+    MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(Parameters, Input, Filter, Output);
 }

From 119ec9a760bf61a58b03c32310cdce7c08c8def3 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 07:14:29 -0800
Subject: [PATCH 21/31] Update onnxruntime/test/mlas/bench/bench_sconv.cpp

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 onnxruntime/test/mlas/bench/bench_sconv.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp
index 2a41a6a6667b2..4330b6e47a63d 100644
--- a/onnxruntime/test/mlas/bench/bench_sconv.cpp
+++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp
@@ -330,7 +330,6 @@ static void TeamsModel(benchmark::internal::Benchmark* b) {
   b->Args({2, 1, 256, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
   b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
   b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
-
 }
 
 BENCHMARK_CAPTURE(SCONV_NCHW, TeamsModel, "")->Apply(TeamsModel)->UseRealTime();

From d0fc1431b26aac4e0639d67ef94adb0c299b0087 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 07:30:29 -0800
Subject: [PATCH 22/31] Fix

---
 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index c70ae134d6e42..9a5d5f1ebae8a 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -353,8 +353,7 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
     const MLAS_CONV_PARAMETERS* Parameters,
     const float* Input,
     const float* Filter,
-    float* Output,
-    const float* Zeros
+    float* Output
     )
 {
     const size_t H = Parameters->InputShape[0];

From 2820a84261123499e6ddb03e734810d8f6ad98ed Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 20:23:31 -0800
Subject: [PATCH 23/31] Remove Winograd implementation

---
 .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 286 ------------------
 1 file changed, 286 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index 9a5d5f1ebae8a..c47103a3963b2 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -18,248 +18,6 @@ Module Name:
 
 #include "mlasi.h"
 #include <arm_neon.h>
 
-#if defined(__GNUC__) || defined(__ICL) || defined(__clang__)
-#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
-#else
-#define C10_LIKELY(expr) (expr)
-#endif
-
-
-#if defined(_MSC_VER) && defined(MLAS_TARGET_ARM64)
-MLAS_FORCEINLINE float32x4_t operator+(float32x4_t lhs, float32x4_t rhs)
-{
-    return vaddq_f32(lhs, rhs);
-}
-
-MLAS_FORCEINLINE float32x4_t operator-(float32x4_t lhs, float32x4_t rhs)
-{
-    return vsubq_f32(lhs, rhs);
-}
-
-MLAS_FORCEINLINE float32x4_t operator*(float32x4_t lhs, float32x4_t rhs)
-{
-    return vmulq_f32(lhs, rhs);
-}
-
-MLAS_FORCEINLINE float32x4_t operator-(float32x4_t value)
-{
-    return vnegq_f32(value);
-}
-#endif
-
-struct Arguments final {
-    // Input layer dimensions
-    int64_t in_rows;
-    int64_t in_cols;
-
-    // Output layer dimensions
-    int64_t out_rows;
-    int64_t out_cols;
-
-    // Padding info
-    int64_t pad_rows;
-    int64_t pad_cols;
-};
-
-inline void winograd_f2k3_input_transform_inplace__neon(
-    float32x4_t* const d0,
-    float32x4_t* const d1,
-    float32x4_t* const d2,
-    float32x4_t* const d3
-)
-{
-    const float32x4_t wd0 = *d0 - *d2;
-    const float32x4_t wd1 = *d1 + *d2;
-    const float32x4_t wd2 = -*d1 + *d2;
-    const float32x4_t wd3 = *d1 - *d3;
-    *d0 = wd0;
-    *d1 = wd1;
-    *d2 = wd2;
-    *d3 = wd3;
-}
-
-inline void winograd_f2k3_output_transform_inplace__neon(
-    float32x4_t* const m0,
-    float32x4_t* const m1,
-    const float32x4_t* const m2,
-    const float32x4_t* const m3
-)
-{
-    *m0 = *m0 + *m1 + *m2;
-    *m1 = *m1 - *m2 - *m3;
-}
-
-inline float32x4_t vmuladdq_f32(const float32x4_t c, const float32x4_t a, const float32x4_t b)
-{
-    return vfmaq_f32(c, a, b);
-    // TODO: Support ARMv7
-}
-
-inline float32x4_t vmulsubq_f32(const float32x4_t c, const float32x4_t a, const float32x4_t b)
-{
-    return vfmsq_f32(c, a, b);
-    // TODO: Support ARMv7
-}
-
-inline void winograd_f2k3_kernel_transform__neon(
-    const float32x4_t g0,
-    const float32x4_t g1,
-    const float32x4_t g2,
-    float32x4_t* const transform0,
-    float32x4_t* const transform1,
-    float32x4_t* const transform2,
-    float32x4_t* const transform3
-)
-{
-    const float32x4_t const_half = vdupq_n_f32(0.5f);
-    float32x4_t half_g0_plus_g2 = const_half * (g0 + g2);
-    *transform0 = g0;
-    *transform1 = vmuladdq_f32(half_g0_plus_g2, const_half, g1);
-    *transform2 = vmulsubq_f32(half_g0_plus_g2, const_half, g1);
-    *transform3 = g2;
-}
-
-inline float32x4x4_t v4f_transpose4x4__neon(const float32x4x4_t m)
-{
-    float32x4x4_t ret;
-    vst4q_f32((float*)(&ret), m);
-    return ret;
-}
-
-void convolution_depthwise3x3_winograd_impl(
-    const Arguments& args,
-    const float* const input,
-    const float* const kernel,
-    float* const output
-)
-{
-    //const float32x4_t vbias = vsetq_lane_f32(*bias, vdupq_n_f32(0.0), 1);
-    float32x4x4_t kernel_tile;
-
-    {
-        const float32x4_t g0 = vld1q_f32(kernel);
-        const float32x4_t g1 = vld1q_f32(kernel + 3);
-        // g2[3] is junk
-        const float32x4_t g2 =
-            vextq_f32(vld1q_f32(kernel + 5), vld1q_f32(kernel + 5), 1);
-        float32x4x4_t w;
-        winograd_f2k3_kernel_transform__neon(
-            g0, g1, g2, &w.val[0], &w.val[1], &w.val[2], &w.val[3]
-        );
-        w = v4f_transpose4x4__neon(w);
-
-        winograd_f2k3_kernel_transform__neon(
-            w.val[0],
-            w.val[1],
-            w.val[2],
-            &kernel_tile.val[0],
-            &kernel_tile.val[1],
-            &kernel_tile.val[2],
-            &kernel_tile.val[3]
-        );
-    }
-
-    #define TILE \
-        winograd_f2k3_input_transform_inplace__neon( \
-            &input_tile.val[0], \
-            &input_tile.val[1], \
-            &input_tile.val[2], \
-            &input_tile.val[3] \
-        ); \
-        input_tile = v4f_transpose4x4__neon(input_tile); \
-        winograd_f2k3_input_transform_inplace__neon( \
-            &input_tile.val[0], \
-            &input_tile.val[1], \
-            &input_tile.val[2], \
-            &input_tile.val[3] \
-        ); \
- \
-        for (size_t row = 0; row < 4; ++row) { \
-            input_tile.val[row] = \
-                vmulq_f32(input_tile.val[row], kernel_tile.val[row]); \
-        } \
- \
-        winograd_f2k3_output_transform_inplace__neon( \
-            &input_tile.val[0], \
-            &input_tile.val[1], \
-            &input_tile.val[2], \
-            &input_tile.val[3] \
-        ); \
-        input_tile = v4f_transpose4x4__neon(input_tile); \
-        winograd_f2k3_output_transform_inplace__neon( \
-            &input_tile.val[0], \
-            &input_tile.val[1], \
-            &input_tile.val[2], \
-            &input_tile.val[3] \
-        )
-
-    // Non-padded regime.
-
-    // Iterate over non-padded output tiles.
-    // TODO: avoid spilling W by breaking out the non-padded vs padded case.
-    for (int64_t oth = 0; oth < (args.out_rows + 1) / 2; ++oth) {
-        for (int64_t otw = 0; otw < (args.out_cols + 1) / 2; ++otw) {
-            // load input tile for [oth, otw];
-            int64_t ih = oth * 2 - args.pad_rows;
-            int64_t iw = otw * 2 - args.pad_cols;
-            // fast-path, all accesses in-bounds
-            if (C10_LIKELY(
-                    ih >= 0 && iw >= 0 && ih + 3 < args.in_rows &&
-                    iw + 3 < args.in_cols && 2 * oth + 1 < args.out_rows &&
-                    2 * otw + 1 < args.out_cols
-                )) {
-                float32x4x4_t input_tile;
-                for (int64_t row = 0; row < 4; ++row) {
-                    input_tile.val[row] =
-                        vld1q_f32(input + (ih + row) * args.in_cols + iw);
-                }
-
-                TILE;
-
-                for (int64_t row = 0; row < 2; ++row) {
-                    vst1_f32(
-                        output + (oth * 2 + row) * args.out_cols + otw * 2,
-                        vget_low_f32(input_tile.val[row])
-                    );
-                }
-            } else {
-                float block[4][4];
-                for (int64_t row = 0; row < 4; ++row) {
-                    for (int64_t col = 0; col < 4; ++col) {
-                        if (ih + row >= 0 && iw + col >= 0 && ih + row < args.in_rows &&
-                            iw + col < args.in_cols) {
-                            block[row][col] = input[(ih + row) * args.in_cols + iw + col];
-                        } else {
-                            block[row][col] = 0.0;
-                        }
-                    }
-                }
-
-                float32x4x4_t input_tile;
-                for (int64_t row = 0; row < 4; ++row) {
-                    input_tile.val[row] = vld1q_f32(&block[row][0]);
-                }
-
-                TILE;
-
-                float oblock[2][2];
-                for (int64_t row = 0; row < 2; ++row) {
-                    vst1_f32(&oblock[row][0], vget_low_f32(input_tile.val[row]));
-                }
-                for (int64_t row = 0; row < 2; ++row) {
-                    for (int64_t col = 0; col < 2; ++col) {
-                        if (2 * oth + row < args.out_rows &&
-                            2 * otw + col < args.out_cols) {
-                            output[(2 * oth + row) * args.out_cols + 2 * otw + col] =
-                                oblock[row][col];
-                        }
-                    }
-                }
-            }
-        }
-    }
-}
-
 MLAS_FORCEINLINE float DepthwiseSampleValue(
     const float* row,
     ptrdiff_t col,
@@ -472,50 +230,6 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
     }
 }
 
-/*
-    static void MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(
-    const MLAS_CONV_PARAMETERS* Parameters,
-    const float* Input,
-    const float* Filter,
-    float* Output,
-    const float* Zeros
-    )
-++
-
-Routine Description:
-
-    This routine is an inner kernel to compute convolution on one channel input with one filter channel.
-
-Arguments:
-
-    Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc.
-
-    Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data.
-
-    Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data.
-
-    Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
-
-    Zeroes - Point to working buffer where all 0.0f are filled.
-
---
-{
-    MLAS_UNREFERENCED_PARAMETER(Zeros);
-
-    Arguments args;
-    args.in_rows = Parameters->InputShape[0];
-    args.in_cols = Parameters->InputShape[1];
-
-    args.out_rows = Parameters->OutputShape[0];
-    args.out_cols = Parameters->OutputShape[1];
-
-    args.pad_rows = Parameters->Padding[0];
-    args.pad_cols = Parameters->Padding[1];
-
-    convolution_depthwise3x3_winograd_impl(args, Input, Filter, Output);
-}
-*/
-
 static
 void
 MlasConv2dSingleChannel_CHW_Kernel3x3_Pad01_Dilation1(

From 0ffb811911fbaa38493930ecffcc076b36781bf4 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:21:58 -0800
Subject: [PATCH 24/31] Update onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index c47103a3963b2..c3783d516031f 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -278,7 +278,7 @@ Routine Description:
 
     Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data.
 
-    Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data.
+    Filter - Whole filters are of F x CpG x FH x FW, this filter points to single FH x FW filter data.
 
     Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
 
From 59e2b2dcff290ccebb0aaf9b50ea2354ecc02219 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:22:30 -0800
Subject: [PATCH 25/31] Update onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index c3783d516031f..a009a8f28cb59 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -282,7 +282,7 @@ Routine Description:
 
     Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
 
-    Zeroes - Point to working buffer where all 0.0f are filled.
+    Zeros - Point to working buffer where all 0.0f are filled.
 
     Note: No checking here as it is inner loop. Logic in generating Parameters controls the check.
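Note on the row indexing that [PATCH 17/31] corrects: with stride 1, ih = oh - pad_top is the top row of the receptive field for output row oh, so the three kernel taps read input rows ih, ih + 1 and ih + 2; the pre-fix code treated ih as the middle tap and read one row too high. The standalone C++ sketch below restates that convention; RowOrNull and SampleOrZero are hypothetical helpers written here to mirror the kernel's nullptr-row convention (out-of-image rows stay nullptr and sample as zero), not functions from the MLAS sources.

// Illustrative only: map one output row of a 3x3, stride-1, pad <= 1
// depthwise convolution to its three input rows.
#include <cstddef>

static inline const float* RowOrNull(const float* input, ptrdiff_t r, size_t H, size_t W)
{
    // Rows outside [0, H) have no backing data; callers treat nullptr as all zeros.
    return (r >= 0 && r < static_cast<ptrdiff_t>(H))
               ? input + r * static_cast<ptrdiff_t>(W)
               : nullptr;
}

static inline float SampleOrZero(const float* row, ptrdiff_t c, size_t W)
{
    // A nullptr row or an out-of-range column contributes zero padding.
    return (row != nullptr && c >= 0 && c < static_cast<ptrdiff_t>(W)) ? row[c] : 0.0f;
}

// For each output row oh (stride 1):
//     const ptrdiff_t ih = static_cast<ptrdiff_t>(oh) - static_cast<ptrdiff_t>(pad_top);
//     const float* row0 = RowOrNull(Input, ih,     H, W);  // top tap (the buggy code used ih - 1 here)
//     const float* row1 = RowOrNull(Input, ih + 1, H, W);  // middle tap
//     const float* row2 = RowOrNull(Input, ih + 2, H, W);  // bottom tap
// Each output column then sums SampleOrZero over the 3x3 window.

Keeping out-of-range rows as nullptr confines the bounds checks to the image borders, so the vectorized interior loop can run without per-element tests.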
From e34c930ac5641a267566144f5873b92282a108a9 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:22:49 -0800
Subject: [PATCH 26/31] Update onnxruntime/core/mlas/lib/convolve.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/convolve.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp
index 288109bd8e79f..7c7971061f780 100644
--- a/onnxruntime/core/mlas/lib/convolve.cpp
+++ b/onnxruntime/core/mlas/lib/convolve.cpp
@@ -1111,7 +1111,6 @@ Return Value:
 
     ptrdiff_t TargetThreadCount = MlasGetMaximumThreadCount(ThreadPool);
 
-
    if (static_cast<size_t>(TargetThreadCount) >= BatchGroupCount) {
        TargetThreadCount = static_cast<ptrdiff_t>(BatchGroupCount);
    }

From 027e74292c071a5d8017c2480e8a9d2f2af6ccbe Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:23:08 -0800
Subject: [PATCH 27/31] Update onnxruntime/core/mlas/inc/mlas.h

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/inc/mlas.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h
index f59ac92e1a9c2..edf2bb7582a90 100644
--- a/onnxruntime/core/mlas/inc/mlas.h
+++ b/onnxruntime/core/mlas/inc/mlas.h
@@ -830,7 +830,7 @@ enum MLAS_CONV_ALGORITHM {
     MlasConvAlgorithmGemmDirect,
     MlasConvAlgorithmExpandThenGemm,
     MlasConvAlgorithmExpandThenGemmSegmented,
-#if defined(MLAS_TARGET_WASM_SCALAR) || defined (MLAS_TARGET_ARM64)
+#if defined(MLAS_TARGET_WASM_SCALAR) || defined(MLAS_TARGET_ARM64)
     MlasConvAlgorithmDepthwise,
 #endif
 };

From f93ed67a1fdec09c75d84b33fc8eaf9d1af59503 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:23:39 -0800
Subject: [PATCH 28/31] Update onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index a009a8f28cb59..d3ec05ec92fac 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -248,11 +248,11 @@ Routine Description:
 
     Parameters - conv parameters calculated based on conv parameters like padding, strides, dilations, etc.
 
-    Input - input channel data start. Input is NCHW, so this pointer point to single H x W image data.
+    Input - input channel data start. Input is NCHW, so this pointer points to single H x W image data.
 
-    Filter - Whole filters are of F x CpG x FH x FW, this filter point to single FH x FW filter data.
+    Filter - Whole filters are of F x CpG x FH x FW, this filter points to single FH x FW filter data.
 
-    Output - whole output are of N x F x OH x OW. This pointer point to single OH x OW output image data.
+    Output - whole output are of N x F x OH x OW. This pointer points to single OH x OW output image data.
From f5c1b812cf14f5b73b71159ec0ae5e2b94fb1059 Mon Sep 17 00:00:00 2001
From: Hariharan Seshadri
Date: Thu, 4 Dec 2025 23:31:36 -0800
Subject: [PATCH 29/31] Update onnxruntime/core/mlas/lib/convolve.cpp

Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
---
 onnxruntime/core/mlas/lib/convolve.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/convolve.cpp b/onnxruntime/core/mlas/lib/convolve.cpp
index 7c7971061f780..0349ce418d406 100644
--- a/onnxruntime/core/mlas/lib/convolve.cpp
+++ b/onnxruntime/core/mlas/lib/convolve.cpp
@@ -1457,14 +1457,12 @@ Return Value:
         // and on ARM64, it is further restricted to strides = 1.
         // TODO: support more general depthwise convolution.
 
-        bool depthwise_conv_stride_support_check = [&]() {
             // On ARM64, only support stride = 1 for depthwise conv.
 #if defined(MLAS_TARGET_ARM64)
-            return Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1;
+        bool depthwise_conv_stride_support_check = Parameters->StrideShape[0] == 1 && Parameters->StrideShape[1] == 1;
 #else
-            return true;
+        bool depthwise_conv_stride_support_check = true;
 #endif
-        }();
 
         if (Dimensions == 2
             && Parameters->FilterCount == 1 && Parameters->InputChannels == 1

From f15e5540aa8cad568ea26d6be53c0b4e07992494 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Thu, 4 Dec 2025 23:34:14 -0800
Subject: [PATCH 30/31] Benchmark updates

---
 onnxruntime/test/mlas/bench/bench_sconv.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/test/mlas/bench/bench_sconv.cpp b/onnxruntime/test/mlas/bench/bench_sconv.cpp
index 4330b6e47a63d..849911e322214 100644
--- a/onnxruntime/test/mlas/bench/bench_sconv.cpp
+++ b/onnxruntime/test/mlas/bench/bench_sconv.cpp
@@ -328,8 +328,8 @@ static void TeamsModel(benchmark::internal::Benchmark* b) {
   b->Args({2, 1, 1, 12, 72, 48, 80, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1});  // Conv_59 => 24x40
 
   b->Args({2, 1, 256, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
-  b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
-  b->Args({2, 1, 24, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+  b->Args({2, 1, 512, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
+  b->Args({2, 1, 960, 1, 1, 378, 378, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1});  // External customer model
 }
 
 BENCHMARK_CAPTURE(SCONV_NCHW, TeamsModel, "")->Apply(TeamsModel)->UseRealTime();

From 60a2b81b5d2da6e04cdc34ee2184ee8287eec580 Mon Sep 17 00:00:00 2001
From: Hari Seshadri
Date: Tue, 16 Dec 2025 07:10:30 -0800
Subject: [PATCH 31/31] Use MLAS helpers instead of using NEON intrinsics directly

---
 .../core/mlas/lib/sconv_nchw_kernel_neon.cpp | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
index d3ec05ec92fac..14b6b30c85bda 100644
--- a/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
+++ b/onnxruntime/core/mlas/lib/sconv_nchw_kernel_neon.cpp
@@ -63,13 +63,13 @@ MLAS_FORCEINLINE void DepthwiseAccumulateRowVector(
     }
 
     const float* r = row + base;
-    const float32x4_t c0 = vld1q_f32(r);
-    const float32x4_t c1 = vld1q_f32(r + 1);
-    const float32x4_t c2 = vld1q_f32(r + 2);
+    const float32x4_t c0 = MlasLoadFloat32x4(r);
+    const float32x4_t c1 = MlasLoadFloat32x4(r + 1);
+    const float32x4_t c2 = MlasLoadFloat32x4(r + 2);
 
-    acc = vmlaq_n_f32(acc, c0, w0);
-    acc = vmlaq_n_f32(acc, c1, w1);
-    acc = vmlaq_n_f32(acc, c2, w2);
+    acc = MlasMultiplyAddFloat32x4(c0, w0, acc);
+    acc = MlasMultiplyAddFloat32x4(c1, w1, acc);
+    acc = MlasMultiplyAddFloat32x4(c2, w2, acc);
 }
 
 MLAS_FORCEINLINE float DepthwiseComputeEdge(
@@ -185,18 +185,18 @@ static void DepthwiseConv3x3Stride1PadLe1Neon(
         }
 
             const size_t base = static_cast<size_t>(iw);
-            float32x4_t acc = vdupq_n_f32(0.0f);
+            float32x4_t acc = MlasZeroFloat32x4();
 
             DepthwiseAccumulateRowVector(acc, row0, base, w00, w01, w02);
             DepthwiseAccumulateRowVector(acc, row1, base, w10, w11, w12);
             DepthwiseAccumulateRowVector(acc, row2, base, w20, w21, w22);
 
             if (accumulate_output) {
-                const float32x4_t prev = vld1q_f32(out_row + ow);
-                acc = vmlaq_n_f32(acc, prev, beta);
+                const float32x4_t prev = MlasLoadFloat32x4(out_row + ow);
+                acc = MlasMultiplyAddFloat32x4(prev, beta, acc);
             }
 
-            vst1q_f32(out_row + ow, acc);
+            MlasStoreFloat32x4(out_row + ow, acc);
             ow += 4;
             processed += 4;
         }
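Closing note on [PATCH 31/31]: the patch swaps raw NEON intrinsics for portable MLAS vector helpers while keeping the kernel's shape, four output columns per iteration with a sliding three-tap window per row. The minimal C++ sketch below restates that inner step; the wrapper definitions are local stand-ins inferred from the NEON mapping the diff implies (vld1q_f32, vdupq_n_f32, vmlaq_n_f32, vst1q_f32), not the actual definitions in mlasi.h.

// Illustrative only: the per-row accumulation that DepthwiseAccumulateRowVector
// performs, written against the wrapper names introduced in the patch.
// Compile for an ARM64 target.
#include <arm_neon.h>

static inline float32x4_t MlasZeroFloat32x4(void) { return vdupq_n_f32(0.0f); }
static inline float32x4_t MlasLoadFloat32x4(const float* p) { return vld1q_f32(p); }
static inline void MlasStoreFloat32x4(float* p, float32x4_t v) { vst1q_f32(p, v); }
static inline float32x4_t MlasMultiplyAddFloat32x4(float32x4_t v, float s, float32x4_t acc)
{
    return vmlaq_n_f32(acc, v, s);  // acc + v * s, lane by lane
}

// acc[j] += w0 * r[j] + w1 * r[j + 1] + w2 * r[j + 2] for j = 0..3.
// Three overlapping unaligned loads at offsets 0, 1 and 2 slide the 3-tap
// window across four consecutive output columns at once.
static inline float32x4_t AccumulateRow(float32x4_t acc, const float* r,
                                        float w0, float w1, float w2)
{
    acc = MlasMultiplyAddFloat32x4(MlasLoadFloat32x4(r), w0, acc);
    acc = MlasMultiplyAddFloat32x4(MlasLoadFloat32x4(r + 1), w1, acc);
    acc = MlasMultiplyAddFloat32x4(MlasLoadFloat32x4(r + 2), w2, acc);
    return acc;
}

// One vectorized output step, mirroring the patched kernel:
//     float32x4_t acc = MlasZeroFloat32x4();
//     acc = AccumulateRow(acc, row0 + iw, w00, w01, w02);
//     acc = AccumulateRow(acc, row1 + iw, w10, w11, w12);
//     acc = AccumulateRow(acc, row2 + iw, w20, w21, w22);
//     MlasStoreFloat32x4(out_row + ow, acc);

The overlapping loads only line up when the convolution stride is 1, which is one reason the ARM64 vector path is restricted to stride-1 depthwise convolutions.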