diff --git a/CMakeLists.txt b/CMakeLists.txt index 2ed1589cfc..62229859ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -112,6 +112,7 @@ else () endif () option (${PROJ_NAME}_BUILD_TOOLS "Build the command-line tools" ON) option (${PROJ_NAME}_BUILD_TESTS "Build the unit tests" ON) +option (OIIO_USE_HWY "Enable experimental Google Highway SIMD optimizations (if Highway is available)" OFF) set (OIIO_LIBNAME_SUFFIX "" CACHE STRING "Optional name appended to ${PROJECT_NAME} libraries that are built") option (BUILD_OIIOUTIL_ONLY "If ON, will build *only* libOpenImageIO_Util" OFF) diff --git a/docs/dev/Architecture.md b/docs/dev/Architecture.md index 5e52bf4143..72f81d8907 100644 --- a/docs/dev/Architecture.md +++ b/docs/dev/Architecture.md @@ -117,6 +117,10 @@ objects. These algorithms include simple operations like copying, resizing, and compositing images, as well as more complex operations like color conversions, resizing, filtering, etc. +Some performance-critical `ImageBufAlgo` implementations have SIMD-accelerated +paths using Google Highway. For implementation details and guidance for adding +new kernels, see `docs/dev/ImageBufAlgo_Highway.md`. + ## Image caching: TextureSystem and ImageCache There are situations where ImageBuf is still not the right abstraction, diff --git a/docs/dev/ImageBufAlgo_Highway.md b/docs/dev/ImageBufAlgo_Highway.md new file mode 100644 index 0000000000..960766978c --- /dev/null +++ b/docs/dev/ImageBufAlgo_Highway.md @@ -0,0 +1,264 @@ +ImageBufAlgo Highway (hwy) Implementation Guide +============================================== + +This document explains how OpenImageIO uses Google Highway (hwy) to accelerate +selected `ImageBufAlgo` operations, and how to add or modify kernels in a way +that preserves OIIO semantics while keeping the code maintainable. + +This is a developer-facing document about the implementation structure in +`src/libOpenImageIO/`. It does not describe the public API behavior of the +algorithms. + + +Goals and non-goals +------------------- + +Goals: +- Make the hwy-backed code paths easy to read and easy to extend. +- Centralize repetitive boilerplate (type conversion, tails, ROI pointer math). +- Preserve OIIO's numeric semantics (normalized integer model). +- Keep scalar fallbacks as the source of truth for tricky layout cases. + +Non-goals: +- Explain Highway itself. Refer to the upstream Highway documentation. +- Guarantee that every ImageBufAlgo op has a hwy implementation. + + +Where the code lives +-------------------- + +Core helpers: +- `src/libOpenImageIO/imagebufalgo_hwy_pvt.h` + +Typical hwy call sites: +- `src/libOpenImageIO/imagebufalgo_addsub.cpp` +- `src/libOpenImageIO/imagebufalgo_muldiv.cpp` +- `src/libOpenImageIO/imagebufalgo_mad.cpp` +- `src/libOpenImageIO/imagebufalgo_pixelmath.cpp` +- `src/libOpenImageIO/imagebufalgo_xform.cpp` (some ops are hwy-accelerated) + + +Enabling and gating the hwy path +------------------------------- + +The hwy path is only used when: +- Highway usage is enabled at runtime (`OIIO::pvt::enable_hwy`). +- The relevant `ImageBuf` objects have local pixel storage (`localpixels()` is + non-null), meaning the data is in process memory rather than accessed through + an `ImageCache` tile abstraction. +- The operation can be safely expressed as contiguous streams of pixels/channels + for the hot path, or the code falls back to a scalar implementation for + strided/non-contiguous layouts. 
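+
+The runtime gate is the same control that is exposed to users as the
+`enable_hwy` attribute and the `OPENIMAGEIO_ENABLE_HWY` environment variable
+(see `src/doc/imageioapi.rst` in this change). For debugging or performance
+comparison, it can be toggled from application code, for example:
+
+```cpp
+#include <OpenImageIO/imageio.h>
+
+// Force the scalar fallbacks (e.g. while validating a new hwy kernel):
+OIIO::attribute("enable_hwy", 0);
+// ... run the operation under test ...
+OIIO::attribute("enable_hwy", 1);  // restore the default
+```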
+ +The common gating pattern looks like: +- In a typed `*_impl` dispatcher: check `OIIO::pvt::enable_hwy` and `localpixels` + and then call a `*_impl_hwy` function; otherwise call `*_impl_scalar`. + +Important: the hwy path is an optimization. Correctness must not depend on hwy. + + +OIIO numeric semantics: why we promote to float +---------------------------------------------- + +OIIO treats integer image pixels as normalized values: +- Unsigned integers represent [0, 1]. +- Signed integers represent approximately [-1, 1] with clamping for INT_MIN. + +Therefore, most pixel math must be performed in float (or double) space, even +when the stored data is integer. This is why the hwy layer uses the +"LoadPromote/Operate/DemoteStore" pattern. + +For additional discussion (and pitfalls of saturating integer arithmetic), see: +- `HIGHWAY_SATURATING_ANALYSIS.md` + + +The core pattern: LoadPromote -> RunHwy* -> DemoteStore +------------------------------------------------------- + +The helper header `imagebufalgo_hwy_pvt.h` defines the reusable building blocks: + +1) Computation type selection + - `SimdMathType` selects `float` for most types, and `double` only when + the destination type is `double`. + + Rationale: + - Float math is significantly faster on many targets. + - For OIIO, integer images are normalized to [0,1] (or ~[-1,1]), so float + precision is sufficient for typical image processing workloads. + +2) Load and promote (with normalization) + - `LoadPromote(d, ptr)` and `LoadPromoteN(d, ptr, count)` load values and + normalize integer ranges into the computation space. + + Rationale: + - Consolidates all normalization and conversion logic in one place. + - Prevents subtle drift where each operation re-implements integer scaling. + - Ensures tail handling ("N" variants) is correct and consistent. + +3) Demote and store (with denormalization/clamp/round) + - `DemoteStore(d, ptr, v)` and `DemoteStoreN(d, ptr, v, count)` reverse the + normalization and store results in the destination pixel type. + + Rationale: + - Centralizes rounding and clamping behavior for all destination types. + - Ensures output matches OIIO scalar semantics. + +4) Generic kernel runners (streaming arrays) + - `RunHwyUnaryCmd`, `RunHwyCmd` (binary), `RunHwyTernaryCmd` + - These are the primary entry points for most hwy kernels. + + Rationale: + - Encapsulates lane iteration and tail processing once. + - The call sites only provide the per-lane math lambda, not the boilerplate. + + +Native integer runners: when they are valid +------------------------------------------- + +Some operations are "scale-invariant" under OIIO's normalized integer model. +For example, for unsigned integer add: +- `(a/max + b/max)` in float space, then clamped to [0,1], then scaled by max + matches saturated integer add `SaturatedAdd(a, b)` for the same bit depth. + +For those cases, `imagebufalgo_hwy_pvt.h` provides: +- `RunHwyUnaryNativeInt` +- `RunHwyBinaryNativeInt` + +These should only be used when all of the following are true: +- The operation is known to be scale-invariant under the normalization model. +- Input and output types are the same integral type. +- The operation does not depend on mixed types or float-range behavior. + +Rationale: +- Avoids promotion/demotion overhead and can be materially faster. +- Must be opt-in and explicit, because many operations are NOT compatible with + raw integer arithmetic (e.g. multiplication, division, pow). 
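+
+For example, the `add` dispatcher in `imagebufalgo_addsub.cpp` combines the
+runtime gate, the local-pixels check, and the native-int opt-in. A condensed
+sketch (template argument lists reconstructed here for readability; not the
+verbatim source):
+
+```cpp
+template<class Rtype, class Atype, class Btype>
+static bool
+add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, int nthreads)
+{
+#if defined(OIIO_USE_HWY) && OIIO_USE_HWY
+    if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()
+        && B.localpixels()) {
+        // Saturated native-int add is only valid when R, A, B share one integer type.
+        if constexpr (std::is_same_v<Rtype, Atype> && std::is_same_v<Atype, Btype>
+                      && std::is_integral_v<Rtype>)
+            return add_impl_hwy_native_int<Rtype>(R, A, B, roi, nthreads);
+        return add_impl_hwy<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+    }
+#endif
+    return add_impl_scalar<Rtype, Atype, Btype>(R, A, B, roi, nthreads);
+}
+```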
+ + +Local pixel pointer helpers: reducing boilerplate safely +------------------------------------------------------- + +Most hwy call sites need repeated pointer and stride computations: +- Pixel size in bytes. +- Scanline size in bytes. +- Base pointer to local pixels. +- Per-row pointer for a given ROI and scanline. +- Per-pixel pointer for non-contiguous fallbacks. + +To centralize that, `imagebufalgo_hwy_pvt.h` defines: +- `HwyPixels(ImageBuf&)` and `HwyPixels(const ImageBuf&)` + returning a small view (`HwyLocalPixelsView`) with: + - base pointer (`std::byte*` / `const std::byte*`) + - `pixel_bytes`, `scanline_bytes` + - `xbegin`, `ybegin`, `nchannels` +- `RoiNChannels(roi)` for `roi.chend - roi.chbegin` +- `ChannelsContiguous(view, nchannels)`: + true only when the pixel stride exactly equals `nchannels * sizeof(T)` +- `PixelBase(view, x, y)`, `ChannelPtr(view, x, y, ch)` +- `RoiRowPtr(view, y, roi)` for the start of the ROI row at `roi.xbegin` and + `roi.chbegin`. + +Rationale: +- Avoids duplicating fragile byte-offset math across many ops. +- Makes it visually obvious what the code is doing: "get row pointer" vs + "compute offset by hand." +- Makes non-contiguous fallback paths less error-prone by reusing the same + pointer computations. + +Important: these helpers are only valid for `ImageBuf` instances with local +pixels (`localpixels()` non-null). The call sites must check that before using +them. + + +Contiguous fast path vs non-contiguous fallback +----------------------------------------------- + +Most operations implement two paths: + +1) Contiguous fast path: + - Used when pixels are tightly packed for the ROI's channel range. + - The operation is executed as a 1D stream of length: + `roi.width() * (roi.chend - roi.chbegin)` + - Uses `RunHwy*Cmd` (or native-int runner) and benefits from: + - fewer branches + - fewer pointer computations + - auto tail handling + +2) Non-contiguous fallback: + - Used when pixels have padding, unusual strides, or channel subsets that do + not form a dense stream. + - Typically loops pixel-by-pixel and channel-by-channel. + - May still use the `ChannelPtr` helpers to compute correct addresses. + +Rationale: +- The contiguous path is where SIMD delivers large gains. +- Trying to SIMD-optimize arbitrary strided layouts often increases complexity + and risk for marginal benefit. Keeping a scalar fallback preserves + correctness and maintainability. + + +How to add a new hwy kernel +--------------------------- + +Step 1: Choose the kernel shape +- Unary: `R = f(A)` -> use `RunHwyUnaryCmd` +- Binary: `R = f(A, B)` -> use `RunHwyCmd` +- Ternary: `R = f(A, B, C)` -> use `RunHwyTernaryCmd` + +Step 2: Decide if a native-int fast path is valid +- Only for scale-invariant ops and same-type integral inputs/outputs. +- Use `RunHwyUnaryNativeInt` / `RunHwyBinaryNativeInt` when safe. +- Otherwise, always use the promote/demote runners. + +Step 3: Implement the hwy body with a contig check +Typical structure inside `*_impl_hwy`: +- Acquire views once: + - `auto Rv = HwyPixels(R);` + - `auto Av = HwyPixels(A);` etc. +- In the parallel callback: + - compute `nchannels = RoiNChannels(roi)` + - compute `contig = ChannelsContiguous<...>(...)` for each image + - for each scanline y: + - `Rtype* r_row = RoiRowPtr(Rv, y, roi);` + - `const Atype* a_row = RoiRowPtr(Av, y, roi);` etc. 
+ - if contig: call `RunHwy*` with `n = roi.width() * nchannels` + - else: fall back per pixel, per channel + +Step 4: Keep the scalar path as the reference +- The scalar implementation should remain correct for all layouts and types. +- The hwy path should match scalar results for supported cases. + + +Design rationale summary +------------------------ + +This design intentionally separates concerns: +- Type conversion and normalization are centralized (`LoadPromote`, + `DemoteStore`). +- SIMD lane iteration and tail handling are centralized (`RunHwy*` runners). +- Image address computations are centralized (`HwyPixels`, `RoiRowPtr`, + `ChannelPtr`). +- Operation-specific code is reduced to short lambdas expressing the math. + +This makes the hwy layer: +- Easier to maintain: fewer places to fix bugs when semantics change. +- Easier to extend: adding an op mostly means writing the math lambda and the + dispatch glue. +- Safer: correctness for unusual layouts remains in scalar fallbacks. + + +Notes on `half` +--------------- + +The hwy conversion helpers handle `half` by converting through +`hwy::float16_t`. This currently assumes the underlying `half` representation +is compatible with how Highway loads/stores 16-bit floats. + +If this assumption is revisited in the future, it should be changed as a +separate, explicit correctness/performance project. + + + + + + diff --git a/src/cmake/externalpackages.cmake b/src/cmake/externalpackages.cmake index 12467ae6b6..55874fdd90 100644 --- a/src/cmake/externalpackages.cmake +++ b/src/cmake/externalpackages.cmake @@ -225,6 +225,11 @@ if (USE_QT AND OPENGL_FOUND) endif () +# Google Highway for SIMD (optional optimization) +if (OIIO_USE_HWY) + checked_find_package (hwy) +endif () + # Tessil/robin-map checked_find_package (Robinmap REQUIRED VERSION_MIN 1.2.0 diff --git a/src/doc/imagebufalgo.rst b/src/doc/imagebufalgo.rst index b013ce0d20..200417accc 100644 --- a/src/doc/imagebufalgo.rst +++ b/src/doc/imagebufalgo.rst @@ -152,6 +152,68 @@ the computation without spawning additional threads, which might tend to crowd out the other application threads. +SIMD Performance and Data Types +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Many ImageBufAlgo operations use SIMD (Single Instruction, Multiple Data) +optimizations powered by the Google Highway library to achieve significant +performance improvements, particularly for integer image formats. + +**Integer Type Optimizations:** + +OpenImageIO treats all integer images as normalized Standard Dynamic Range +(SDR) data: + +* Unsigned integers (``uint8``, ``uint16``, ``uint32``, ``uint64``) are + normalized to the [0.0, 1.0] range: ``float_value = int_value / max_value`` +* Signed integers (``int8``, ``int16``, ``int32``, ``int64``) are normalized + to approximately the [-1.0, 1.0] range: ``float_value = int_value / max_value`` + +Most ImageBufAlgo operations convert integer data to float, perform the +operation, and convert back. Highway SIMD provides 3-5x speedup for these +operations compared to scalar code. + +**Scale-Invariant Operations:** + +Certain operations are *scale-invariant*, meaning they produce identical +results whether performed on raw integers or normalized floats. 
For these +operations, OpenImageIO uses native integer SIMD paths that avoid float +conversion entirely, achieving 6-12x speedup (2-3x faster than the float +promotion path): + +* ``add``, ``sub`` (with saturation) +* ``min``, ``max`` +* ``abs``, ``absdiff`` + +These optimizations automatically activate when all input and output images +have matching integer types (e.g., all ``uint8``). When types differ or when +mixing integer and float images, the standard float promotion path is used. + +**Controlling SIMD Optimizations:** + +Highway SIMD is enabled by default. To disable it globally:: + + OIIO::attribute("enable_hwy", 0); + +Or via environment variable:: + + export OPENIMAGEIO_ENABLE_HWY=0 + +This is primarily useful for debugging or performance comparison. In normal +use, the optimizations should remain enabled for best performance. + +**Performance Expectations:** + +Typical speedups with Highway SIMD (compared to scalar code): + +* Float operations: 3-5x faster +* Integer operations (with float conversion): 3-5x faster +* Integer scale-invariant operations (native int): 6-12x faster +* Half-float operations: 3-5x faster + +Actual performance depends on the specific operation, image size, data types, +and hardware capabilities (AVX2, AVX-512, ARM NEON, etc.). + .. _sec-iba-patterns: diff --git a/src/doc/imageioapi.rst b/src/doc/imageioapi.rst index d2d6b192b4..dca5a66da5 100644 --- a/src/doc/imageioapi.rst +++ b/src/doc/imageioapi.rst @@ -397,16 +397,36 @@ inside the source code. line, but not the full human-readable command line. (This was added in OpenImageIO 2.5.11.) +.. cpp:var:: OPENIMAGEIO_ENABLE_HWY + + Controls whether to use Google Highway SIMD library optimizations for + ImageBufAlgo operations. If set to "1" (the default), Highway SIMD + optimizations will be enabled for supported operations, providing + significant performance improvements (typically 3-12x faster) on integer + image types. If set to "0", these optimizations will be disabled and fall + back to scalar implementations. + + This can also be controlled at runtime via:: + + OIIO::attribute("enable_hwy", 1); // enable (default) + OIIO::attribute("enable_hwy", 0); // disable + + Note: Highway SIMD optimizations are particularly beneficial for integer + image formats (uint8, uint16, int8, int16, uint32, int32, etc.) and provide + additional speedup for scale-invariant operations (add, sub, min, max, + absdiff) that can operate directly on integer data without float conversion. + (This was added in OpenImageIO 3.1.) + .. cpp:var:: OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH - Windows only. Mimics the DLL-loading behavior of Python 3.7 and earlier. - If set to "1", all directories under ``PATH`` will be added to the DLL load + Windows only. Mimics the DLL-loading behavior of Python 3.7 and earlier. + If set to "1", all directories under ``PATH`` will be added to the DLL load path before attempting to import the OpenImageIO module. (This was added in OpenImageIO 3.0.3.0) - Note: This "opt-in-style" behavior replaces and inverts the "opt-out-style" - Windows DLL-loading behavior governed by the now-defunct `OIIO_LOAD_DLLS_FROM_PATH` - environment variable (added in OpenImageIO 2.4.0/2.3.18). + Note: This "opt-in-style" behavior replaces and inverts the "opt-out-style" + Windows DLL-loading behavior governed by the now-defunct `OIIO_LOAD_DLLS_FROM_PATH` + environment variable (added in OpenImageIO 2.4.0/2.3.18). 
- In other words, to reproduce the default Python-module-loading behavior of + In other words, to reproduce the default Python-module-loading behavior of earlier versions of OIIO, set ``OPENIMAGEIO_PYTHON_LOAD_DLLS_FROM_PATH=1``. diff --git a/src/include/OpenImageIO/platform.h b/src/include/OpenImageIO/platform.h index e4760e5545..aeba989947 100644 --- a/src/include/OpenImageIO/platform.h +++ b/src/include/OpenImageIO/platform.h @@ -305,7 +305,12 @@ /// enough to cause trouble). Consider using the OIIO_ALLOCATE_STACK_OR_HEAP /// idiom rather than a direct OIIO_ALLOCA if you aren't sure the item will /// be small. -#if defined(__GNUC__) +#if defined(__has_include) +# if __has_include() +# include // for alloca (when available) +# endif +#endif +#if defined(__GNUC__) || defined(__clang__) # define OIIO_ALLOCA(type, size) (assert(size < (1<<20)), (size) != 0 ? ((type*)__builtin_alloca((size) * sizeof(type))) : nullptr) #else # define OIIO_ALLOCA(type, size) (assert(size < (1<<20)), (size) != 0 ? ((type*)alloca((size) * sizeof(type))) : nullptr) diff --git a/src/libOpenImageIO/CMakeLists.txt b/src/libOpenImageIO/CMakeLists.txt index f2459b2d32..d813606755 100644 --- a/src/libOpenImageIO/CMakeLists.txt +++ b/src/libOpenImageIO/CMakeLists.txt @@ -168,6 +168,17 @@ target_link_libraries (OpenImageIO ${CMAKE_DL_LIBS} ) +# Google Highway (hwy) is an optional optimization dependency. +set (_oiio_use_hwy 0) +if (OIIO_USE_HWY AND TARGET hwy::hwy) + set (_oiio_use_hwy 1) + target_link_libraries (OpenImageIO PRIVATE hwy::hwy) + if (TARGET hwy::hwy_contrib) + target_link_libraries (OpenImageIO PRIVATE hwy::hwy_contrib) + endif () +endif () +target_compile_definitions (OpenImageIO PRIVATE OIIO_USE_HWY=${_oiio_use_hwy}) + if (WIN32) target_link_libraries (OpenImageIO PRIVATE psapi) endif() diff --git a/src/libOpenImageIO/imagebufalgo_addsub.cpp b/src/libOpenImageIO/imagebufalgo_addsub.cpp index c7a4d83e9c..127cedd858 100644 --- a/src/libOpenImageIO/imagebufalgo_addsub.cpp +++ b/src/libOpenImageIO/imagebufalgo_addsub.cpp @@ -18,6 +18,10 @@ #include #include +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +# include "imagebufalgo_hwy_pvt.h" +#endif + #include "imageio_pvt.h" @@ -26,8 +30,8 @@ OIIO_NAMESPACE_3_1_BEGIN template static bool -add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +add_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -44,7 +48,8 @@ add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +add_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -58,6 +63,298 @@ add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +#if OIIO_USE_HWY +// Native integer add using SaturatedAdd (scale-invariant, no float conversion) +template +static bool +add_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < 
roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer saturated add - much faster than float conversion! + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::SaturatedAdd(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + // Saturating add in scalar + int64_t sum = (int64_t)a_ptr[c] + (int64_t)b_ptr[c]; + if constexpr (std::is_unsigned_v) { + r_ptr[c] = (sum > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (T)sum; + } else { + r_ptr[c] = (sum > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (sum < std::numeric_limits::min()) + ? std::numeric_limits::min() + : (T)sum; + } + } + } + } + } + }); + return true; +} + +template +static bool +add_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Process whole line as one vector stream + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Add(a, b); + }); + } else { + // Process pixel by pixel (scalar fallback for strided channels) + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + + static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} + +template +static bool +add_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)((SimdType)a_ptr[c] + (SimdType)b[c]); + } + } + } + }); + return true; +} +#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY + +template +static bool +add_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native 
integer path for scale-invariant add when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return add_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return add_impl_hwy(R, A, B, roi, nthreads); + } +#endif + return add_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +add_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return add_impl_hwy(R, A, b, roi, nthreads); +#endif + return add_impl_scalar(R, A, b, roi, nthreads); +} + +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +// Native integer sub using SaturatedSub (scale-invariant, no float conversion) +template +static bool +sub_impl_hwy_native_int(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + ROI roi, int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + T* r_row = RoiRowPtr(Rv, y, roi); + const T* a_row = RoiRowPtr(Av, y, roi); + const T* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + // Native integer saturated sub - much faster than float conversion! + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyBinaryNativeInt(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::SaturatedSub(a, b); + }); + } else { + // Scalar fallback + for (int x = roi.xbegin; x < roi.xend; ++x) { + T* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const T* a_ptr = ChannelPtr(Av, x, y, roi.chbegin); + const T* b_ptr = ChannelPtr(Bv, x, y, roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + // Saturating sub in scalar + if constexpr (std::is_unsigned_v) { + r_ptr[c] = (a_ptr[c] > b_ptr[c]) + ? (a_ptr[c] - b_ptr[c]) + : T(0); + } else { + int64_t diff = (int64_t)a_ptr[c] + - (int64_t)b_ptr[c]; + r_ptr[c] = (diff > std::numeric_limits::max()) + ? std::numeric_limits::max() + : (diff < std::numeric_limits::min()) + ? 
std::numeric_limits::min() + : (T)diff; + } + } + } + } + } + }); + return true; +} + +template +static bool +sub_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Sub(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + - static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} +#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY + +template +static bool +sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) { + // Use native integer path for scale-invariant sub when all types match + // and are integer types (much faster: 6-12x vs 3-5x with float conversion) + constexpr bool all_same = std::is_same_v + && std::is_same_v; + constexpr bool is_integer = std::is_integral_v; + if constexpr (all_same && is_integer) { + return sub_impl_hwy_native_int(R, A, B, roi, nthreads); + } + return sub_impl_hwy(R, A, B, roi, nthreads); + } +#endif + return sub_impl_scalar(R, A, B, roi, nthreads); +} + static bool add_impl_deep(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) @@ -155,8 +452,8 @@ ImageBufAlgo::add(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -sub_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +sub_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); diff --git a/src/libOpenImageIO/imagebufalgo_hwy_pvt.h b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h new file mode 100644 index 0000000000..fe4c9b0d8a --- /dev/null +++ b/src/libOpenImageIO/imagebufalgo_hwy_pvt.h @@ -0,0 +1,970 @@ +// Copyright Contributors to the OpenImageIO project. 
+// SPDX-License-Identifier: Apache-2.0 +// https://github.com/AcademySoftwareFoundation/OpenImageIO + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +OIIO_NAMESPACE_BEGIN + +// Alias for Highway's namespace for convenience +namespace hn = hwy::HWY_NAMESPACE; + +// ----------------------------------------------------------------------- +// ImageBuf local pixel helpers (header-only) +// ----------------------------------------------------------------------- + +template struct HwyLocalPixelsView { + ByteT* base = nullptr; + size_t pixel_bytes = 0; + size_t scanline_bytes = 0; + int xbegin = 0; + int ybegin = 0; + int nchannels = 0; +}; + +inline HwyLocalPixelsView +HwyPixels(ImageBuf& img) +{ + const ImageSpec& spec = img.spec(); + return { reinterpret_cast(img.localpixels()), + spec.pixel_bytes(), + spec.scanline_bytes(), + img.xbegin(), + img.ybegin(), + spec.nchannels }; +} + +inline HwyLocalPixelsView +HwyPixels(const ImageBuf& img) +{ + const ImageSpec& spec = img.spec(); + return { reinterpret_cast(img.localpixels()), + spec.pixel_bytes(), + spec.scanline_bytes(), + img.xbegin(), + img.ybegin(), + spec.nchannels }; +} + +inline int +RoiNChannels(const ROI& roi) noexcept +{ + return roi.chend - roi.chbegin; +} + +template +inline bool +ChannelsContiguous(const HwyLocalPixelsView& v, int nchannels) noexcept +{ + return size_t(nchannels) * sizeof(T) == v.pixel_bytes; +} + +template +inline ByteT* +PixelBase(const HwyLocalPixelsView& v, int x, int y) noexcept +{ + return v.base + size_t(y - v.ybegin) * v.scanline_bytes + + size_t(x - v.xbegin) * v.pixel_bytes; +} + +template +inline std::conditional_t, const T*, T*> +ChannelPtr(const HwyLocalPixelsView& v, int x, int y, int ch) noexcept +{ + using RetT = std::conditional_t, const T, T>; + return reinterpret_cast(PixelBase(v, x, y) + size_t(ch) * sizeof(T)); +} + +template +inline std::conditional_t, const T*, T*> +RoiRowPtr(const HwyLocalPixelsView& v, int y, const ROI& roi) noexcept +{ + return ChannelPtr(v, roi.xbegin, y, roi.chbegin); +} + +// ----------------------------------------------------------------------- +// Type Traits +// ----------------------------------------------------------------------- + +/// Determine the appropriate SIMD math type for a given result type. +/// Promotes smaller types to float, keeps double as double. +/// Note: uint32_t uses float (not double) for image processing performance. +/// In OIIO, uint32 images are normalized to 0-1 range like uint8/uint16, +/// so float precision (24-bit mantissa) is sufficient and much faster than double. +template struct SimdMathType { + using type = float; +}; +template<> struct SimdMathType { + using type = double; +}; + +// ----------------------------------------------------------------------- +// Load and Promote +// ----------------------------------------------------------------------- + +/// Load and promote source data to target SIMD type. +/// Handles type conversions from various source formats (uint8_t, int8_t, uint16_t, +/// int16_t, uint32_t, int32_t, uint64_t, int64_t, half, float, double) to the +/// target SIMD computation type. 
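+/// 8-, 16- and 32-bit integer sources are normalized on load (unsigned to
+/// [0, 1], signed to approximately [-1, 1] with the most negative value
+/// clamped); 64-bit integers are converted via 32-bit without renormalization.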
+/// @param d Highway descriptor tag defining the target SIMD type +/// @param ptr Pointer to source data (may be unaligned) +/// @return SIMD vector with promoted values +template +inline auto +LoadPromote(D d, const SrcT* ptr) +{ + using MathT = typename D::T; + + if constexpr (std::is_same_v) { + return hn::Load(d, ptr); + } else if constexpr (std::is_same_v) { + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + auto v16 = hn::Load(d16, reinterpret_cast(ptr)); + return hn::PromoteTo(d, v16); + } else if constexpr (std::is_same_v) { + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::Load(d_u8, ptr); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_u8))); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0))); + } else if constexpr (std::is_same_v) { + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::Load(d_i8, ptr); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_i8))); + // Normalize: map [-128, 127] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::Load(d_u16, ptr); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_u16)); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0))); + } else if constexpr (std::is_same_v) { + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::Load(d_i16, ptr); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_i16)); + // Normalize: map [-32768, 32767] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint32 to float: Load, convert, and normalize to 0-1 range + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::Load(d_u32, ptr); + auto v_promoted = hn::ConvertTo(d, v_u32); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0))); + } else if constexpr (std::is_same_v) { + // int32 to float: Load, convert, and normalize to approximately [-1.0, 1.0] + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::Load(d_i32, ptr); + auto v_promoted = hn::ConvertTo(d, v_i32); + // Normalize: map [-2147483648, 2147483647] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, + hn::Set(d, (MathT)(1.0 / 2147483647.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint64 to float: Load and demote to uint32, then convert + // Note: Precision loss expected for large values (>24 bits) + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::Load(d_u64, ptr); + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::DemoteTo(d_u32, v_u64); + return hn::ConvertTo(d, v_u32); + } else if constexpr (std::is_same_v) { + // int64 to float: Load and demote to int32, then convert + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::Load(d_i64, ptr); + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::DemoteTo(d_i32, v_i64); + return hn::ConvertTo(d, v_i32); + } else { + return hn::Zero(d); + } +} + +/// Load and promote partial source data to target SIMD type. 
+/// Same as LoadPromote but handles partial vectors (< full lane count). +/// @param d Highway descriptor tag defining the target SIMD type +/// @param ptr Pointer to source data (may be unaligned) +/// @param count Number of elements to load (must be <= lane count) +/// @return SIMD vector with promoted values (undefined in unused lanes) +template +inline auto +LoadPromoteN(D d, const SrcT* ptr, size_t count) +{ + using MathT = typename D::T; + + if constexpr (std::is_same_v) { + return hn::LoadN(d, ptr, count); + } else if constexpr (std::is_same_v) { + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + auto v16 = hn::LoadN(d16, reinterpret_cast(ptr), count); + return hn::PromoteTo(d, v16); + } else if constexpr (std::is_same_v) { + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::LoadN(d_u8, ptr, count); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_u8))); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 255.0))); + } else if constexpr (std::is_same_v) { + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::LoadN(d_i8, ptr, count); + auto v_promoted = hn::ConvertTo( + d, hn::PromoteTo(hn::Rebind(), + hn::PromoteTo(hn::Rebind(), v_i8))); + // Normalize: map [-128, 127] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 127.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::LoadN(d_u16, ptr, count); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_u16)); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 65535.0))); + } else if constexpr (std::is_same_v) { + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::LoadN(d_i16, ptr, count); + auto v_promoted + = hn::ConvertTo(d, hn::PromoteTo(hn::Rebind(), v_i16)); + // Normalize: map [-32768, 32767] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. + auto v_norm = hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 32767.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint32 to float: Load, convert, and normalize to 0-1 range + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::LoadN(d_u32, ptr, count); + auto v_promoted = hn::ConvertTo(d, v_u32); + // Normalize to 0-1 range for image operations + return hn::Mul(v_promoted, hn::Set(d, (MathT)(1.0 / 4294967295.0))); + } else if constexpr (std::is_same_v) { + // int32 to float: Load, convert, and normalize to approximately [-1.0, 1.0] + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::LoadN(d_i32, ptr, count); + auto v_promoted = hn::ConvertTo(d, v_i32); + // Normalize: map [-2147483648, 2147483647] to approximately [-1.0, 1.0] + // Clamp INT_MIN so we never produce values < -1.0. 
+ auto v_norm = hn::Mul(v_promoted, + hn::Set(d, (MathT)(1.0 / 2147483647.0))); + return hn::Max(v_norm, hn::Set(d, (MathT)-1.0)); + } else if constexpr (std::is_same_v) { + // uint64 to float: Load and demote to uint32, then convert + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::LoadN(d_u64, ptr, count); + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::DemoteTo(d_u32, v_u64); + return hn::ConvertTo(d, v_u32); + } else if constexpr (std::is_same_v) { + // int64 to float: Load and demote to int32, then convert + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::LoadN(d_i64, ptr, count); + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::DemoteTo(d_i32, v_i64); + return hn::ConvertTo(d, v_i32); + } else { + return hn::Zero(d); + } +} + +// ----------------------------------------------------------------------- +// Demote and Store +// ----------------------------------------------------------------------- + +/// Demote SIMD values and store to destination type. +/// Handles type conversions from SIMD computation type (float/double) back to +/// various destination formats with proper rounding and clamping for integer types. +/// @param d Highway descriptor tag for the source SIMD type +/// @param ptr Pointer to destination data (may be unaligned) +/// @param v SIMD vector to demote and store +template +inline void +DemoteStore(D d, DstT* ptr, VecT v) +{ + using MathT = typename D::T; + using VecD = hn::Vec; + + if constexpr (std::is_same_v) { + hn::Store(v, d, ptr); + } else if constexpr (std::is_same_v) { + auto d16 = hn::Rebind(); + auto v16 = hn::DemoteTo(d16, v); + hn::Store(v16, d16, reinterpret_cast(ptr)); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-255 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)255.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)255.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::DemoteTo(d_u8, v_i16); + hn::Store(v_u8, d_u8, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -128-127 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)127.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
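+        // Bias negative lanes by -0.5 and non-negative lanes by +0.5 so that
+        // the truncating ConvertTo below rounds to nearest.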
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-128.0); + VecD v_max = hn::Set(d, (MathT)127.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::DemoteTo(d_i8, v_i16); + hn::Store(v_i8, d_i8, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-65535 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)65535.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)65535.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::DemoteTo(d_u16, vi32); + hn::Store(v_u16, d_u16, ptr); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -32768-32767 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)32767.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). + auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-32768.0); + VecD v_max = hn::Set(d, (MathT)32767.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + hn::Store(v_i16, d_i16, ptr); + } else if constexpr (std::is_same_v) { + // float -> uint32: Denormalize from 0-1 to 0-4294967295, round and convert + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-4294967295 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)4294967295.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)4294967295.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + hn::Store(v_u32, d_u32, ptr); + } else if constexpr (std::is_same_v) { + // float -> int32: Denormalize from approximately [-1.0, 1.0] to int32 range + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to -2147483648-2147483647 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)2147483647.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-2147483648.0); + VecD v_max = hn::Set(d, (MathT)2147483647.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_clamped); + hn::Store(v_i32, d_i32, ptr); + } else if constexpr (std::is_same_v) { + // float -> uint64: Promote via uint32 + // Note: Precision loss expected (float has only 24-bit mantissa) + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_clamped = hn::Max(v_rounded, v_zero); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::PromoteTo(d_u64, v_u32); + hn::Store(v_u64, d_u64, ptr); + } else if constexpr (std::is_same_v) { + // float -> int64: Promote via int32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_rounded); + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::PromoteTo(d_i64, v_i32); + hn::Store(v_i64, d_i64, ptr); + } +} + +/// Demote and store partial SIMD values to destination type. +/// Same as DemoteStore but handles partial vectors (< full lane count). +/// @param d Highway descriptor tag for the source SIMD type +/// @param ptr Pointer to destination data (may be unaligned) +/// @param v SIMD vector to demote and store +/// @param count Number of elements to store (must be <= lane count) +template +inline void +DemoteStoreN(D d, DstT* ptr, VecT v, size_t count) +{ + using MathT = typename D::T; + using VecD = hn::Vec; + + if constexpr (std::is_same_v) { + hn::StoreN(v, d, ptr, count); + } else if constexpr (std::is_same_v) { + auto d16 = hn::Rebind(); + auto v16 = hn::DemoteTo(d16, v); + hn::StoreN(v16, d16, reinterpret_cast(ptr), count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-255 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)255.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)255.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_u8 = hn::Rebind(); + auto v_u8 = hn::DemoteTo(d_u8, v_i16); + hn::StoreN(v_u8, d_u8, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to [-128, 127] range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)127.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-128.0); + VecD v_max = hn::Set(d, (MathT)127.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + auto d_i8 = hn::Rebind(); + auto v_i8 = hn::DemoteTo(d_i8, v_i16); + hn::StoreN(v_i8, d_i8, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-65535 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)65535.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)65535.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_u16 = hn::Rebind(); + auto v_u16 = hn::DemoteTo(d_u16, vi32); + hn::StoreN(v_u16, d_u16, ptr, count); + } else if constexpr (std::is_same_v) { + VecD v_val = (VecD)v; + // Denormalize from approximately [-1.0, 1.0] range to [-32768, 32767] range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)32767.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). + auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-32768.0); + VecD v_max = hn::Set(d, (MathT)32767.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d32 = hn::Rebind(); + auto vi32 = hn::ConvertTo(d32, v_clamped); + auto d_i16 = hn::Rebind(); + auto v_i16 = hn::DemoteTo(d_i16, vi32); + hn::StoreN(v_i16, d_i16, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> uint32: Denormalize from 0-1 to 0-4294967295, round and convert + VecD v_val = (VecD)v; + // Denormalize from 0-1 range to 0-4294967295 range + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)4294967295.0)); + VecD v_rounded = hn::Add(v_denorm, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_max = hn::Set(d, (MathT)4294967295.0); + VecD v_clamped = hn::Max(v_rounded, v_zero); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + hn::StoreN(v_u32, d_u32, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> int32: Denormalize from approximately [-1.0, 1.0] range to [-2147483648, 2147483647] range + VecD v_val = (VecD)v; + VecD v_denorm = hn::Mul(v_val, hn::Set(d, (MathT)2147483647.0)); + // Symmetric round-to-nearest for signed values (assumes ConvertTo truncates). 
+ auto is_neg = hn::Lt(v_denorm, hn::Zero(d)); + auto v_bias = hn::IfThenElse(is_neg, hn::Set(d, (MathT)-0.5), + hn::Set(d, (MathT)0.5)); + VecD v_rounded = hn::Add(v_denorm, v_bias); + VecD v_min = hn::Set(d, (MathT)-2147483648.0); + VecD v_max = hn::Set(d, (MathT)2147483647.0); + VecD v_clamped = hn::Max(v_rounded, v_min); + v_clamped = hn::Min(v_clamped, v_max); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_clamped); + hn::StoreN(v_i32, d_i32, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> uint64: Promote via uint32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + VecD v_zero = hn::Zero(d); + VecD v_clamped = hn::Max(v_rounded, v_zero); + + auto d_u32 = hn::Rebind(); + auto v_u32 = hn::ConvertTo(d_u32, v_clamped); + auto d_u64 = hn::Rebind(); + auto v_u64 = hn::PromoteTo(d_u64, v_u32); + hn::StoreN(v_u64, d_u64, ptr, count); + } else if constexpr (std::is_same_v) { + // float -> int64: Promote via int32 + VecD v_val = (VecD)v; + VecD v_rounded = hn::Add(v_val, hn::Set(d, (MathT)0.5)); + + auto d_i32 = hn::Rebind(); + auto v_i32 = hn::ConvertTo(d_i32, v_rounded); + auto d_i64 = hn::Rebind(); + auto v_i64 = hn::PromoteTo(d_i64, v_i32); + hn::StoreN(v_i64, d_i64, ptr, count); + } +} + +// ----------------------------------------------------------------------- +// Native Integer Kernel Runners (No Type Conversion) +// ----------------------------------------------------------------------- + +/// Execute a unary SIMD operation on native integer arrays (no type promotion). +/// For scale-invariant operations like abs, where int_op(a) == denorm(float_op(norm(a))). +/// Much faster than promotion path - operates directly on integer SIMD vectors. +/// @param r Destination array (same type as source) +/// @param a Source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector) and returning result vector +/// Example: [](auto d, auto va) { return hn::Abs(va); } +template +inline void +RunHwyUnaryNativeInt(T* r, const T* a, size_t n, OpFunc op) +{ + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = hn::Load(d, a + x); + auto res = op(d, va); + hn::Store(res, d, r + x); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = hn::LoadN(d, a + x, remaining); + auto res = op(d, va); + hn::StoreN(res, d, r + x, remaining); + } +} + +/// Execute a binary SIMD operation on native integer arrays (no type promotion). +/// For scale-invariant operations like saturated add, min, max, where: +/// int_op(a, b) == denorm(float_op(norm(a), norm(b))). +/// Much faster than promotion path - no conversion overhead. 
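+/// The caller is responsible for ensuring the operation is scale-invariant for
+/// the given type; this runner performs no normalization, clamping, or rounding.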
+/// @param r Destination array (same type as sources) +/// @param a First source array +/// @param b Second source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector_a, vector_b) and returning result +/// Example: [](auto d, auto va, auto vb) { return hn::SaturatedAdd(va, vb); } +template +inline void +RunHwyBinaryNativeInt(T* r, const T* a, const T* b, size_t n, OpFunc op) +{ + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = hn::Load(d, a + x); + auto vb = hn::Load(d, b + x); + auto res = op(d, va, vb); + hn::Store(res, d, r + x); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = hn::LoadN(d, a + x, remaining); + auto vb = hn::LoadN(d, b + x, remaining); + auto res = op(d, va, vb); + hn::StoreN(res, d, r + x, remaining); + } +} + +// ----------------------------------------------------------------------- +// Generic Kernel Runners (With Type Conversion) +// ----------------------------------------------------------------------- + +/// Execute a unary SIMD operation on an array. +/// Processes array elements in SIMD batches, handling type promotion/demotion +/// and partial vectors at the end. +/// @param r Destination array +/// @param a Source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector) and returning result vector +/// Example: [](auto d, auto va) { return hn::Sqrt(va); } +template +inline void +RunHwyUnaryCmd(Rtype* r, const Atype* a, size_t n, OpFunc op) +{ + using MathT = typename SimdMathType::type; + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = LoadPromote(d, a + x); + auto res = op(d, va); + DemoteStore(d, r + x, res); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = LoadPromoteN(d, a + x, remaining); + auto res = op(d, va); + DemoteStoreN(d, r + x, res, remaining); + } +} + +/// Execute a binary SIMD operation on two arrays. +/// Processes array elements in SIMD batches, handling type promotion/demotion +/// and partial vectors at the end. +/// @param r Destination array +/// @param a First source array +/// @param b Second source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector_a, vector_b) and returning result +/// Example: [](auto d, auto va, auto vb) { return hn::Add(va, vb); } +template +inline void +RunHwyCmd(Rtype* r, const Atype* a, const Btype* b, size_t n, OpFunc op) +{ + using MathT = typename SimdMathType::type; + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = LoadPromote(d, a + x); + auto vb = LoadPromote(d, b + x); + auto res = op(d, va, vb); + DemoteStore(d, r + x, res); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = LoadPromoteN(d, a + x, remaining); + auto vb = LoadPromoteN(d, b + x, remaining); + auto res = op(d, va, vb); + DemoteStoreN(d, r + x, res, remaining); + } +} + +/// Execute a ternary SIMD operation on three arrays. +/// Processes array elements in SIMD batches, handling type promotion/demotion +/// and partial vectors at the end. 
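+/// Note: all three source arrays must share the same element type (ABCtype).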
+/// @param r Destination array +/// @param a First source array +/// @param b Second source array +/// @param c Third source array +/// @param n Number of elements to process +/// @param op Lambda/functor taking (descriptor, vector_a, vector_b, vector_c) and returning result +/// Example: [](auto d, auto va, auto vb, auto vc) { return hn::MulAdd(va, vb, vc); } +template +inline void +RunHwyTernaryCmd(Rtype* r, const ABCtype* a, const ABCtype* b, const ABCtype* c, + size_t n, OpFunc op) +{ + using MathT = typename SimdMathType::type; + const hn::ScalableTag d; + size_t x = 0; + size_t lanes = hn::Lanes(d); + for (; x + lanes <= n; x += lanes) { + auto va = LoadPromote(d, a + x); + auto vb = LoadPromote(d, b + x); + auto vc = LoadPromote(d, c + x); + auto res = op(d, va, vb, vc); + DemoteStore(d, r + x, res); + } + size_t remaining = n - x; + if (remaining > 0) { + auto va = LoadPromoteN(d, a + x, remaining); + auto vb = LoadPromoteN(d, b + x, remaining); + auto vc = LoadPromoteN(d, c + x, remaining); + auto res = op(d, va, vb, vc); + DemoteStoreN(d, r + x, res, remaining); + } +} + +// ----------------------------------------------------------------------- +// Interleaved Channel Load/Store Helpers +// ----------------------------------------------------------------------- + +/// Load 4 interleaved channels (RGBA) with type promotion. +/// For matching types, uses Highway's native LoadInterleaved4. +/// For type promotion, loads and manually deinterleaves. +/// @param d Highway descriptor tag for the target SIMD type +/// @param ptr Pointer to interleaved RGBA data (R0,G0,B0,A0,R1,G1,B1,A1,...) +/// @return Tuple of (R, G, B, A) SIMD vectors in promoted type +template +inline auto +LoadInterleaved4Promote(D d, const SrcT* ptr) +{ + using MathT = typename D::T; + using Vec = hn::Vec; + + if constexpr (std::is_same_v) { + // No promotion needed - use Highway's optimized LoadInterleaved4 + Vec r, g, b, a; + hn::LoadInterleaved4(d, ptr, r, g, b, a); + return std::make_tuple(r, g, b, a); + } else if constexpr (std::is_same_v) { + // Special handling for half type - convert through hwy::float16_t + using T16 = hwy::float16_t; + auto d16 = hn::Rebind(); + + // Load interleaved half data as float16_t + hn::Vec r16, g16, b16, a16; + hn::LoadInterleaved4(d16, reinterpret_cast(ptr), r16, g16, + b16, a16); + + // Promote to computation type + Vec r_vec = hn::PromoteTo(d, r16); + Vec g_vec = hn::PromoteTo(d, g16); + Vec b_vec = hn::PromoteTo(d, b16); + Vec a_vec = hn::PromoteTo(d, a16); + + return std::make_tuple(r_vec, g_vec, b_vec, a_vec); + } else { + // Generic type promotion - deinterleave manually with normalization + const size_t N = hn::Lanes(d); + SrcT r_src[hn::MaxLanes(d)]; + SrcT g_src[hn::MaxLanes(d)]; + SrcT b_src[hn::MaxLanes(d)]; + SrcT a_src[hn::MaxLanes(d)]; + + for (size_t i = 0; i < N; ++i) { + r_src[i] = ptr[i * 4 + 0]; + g_src[i] = ptr[i * 4 + 1]; + b_src[i] = ptr[i * 4 + 2]; + a_src[i] = ptr[i * 4 + 3]; + } + + // Use LoadPromote for proper normalization of integer types + auto r_vec = LoadPromote(d, r_src); + auto g_vec = LoadPromote(d, g_src); + auto b_vec = LoadPromote(d, b_src); + auto a_vec = LoadPromote(d, a_src); + + return std::make_tuple(r_vec, g_vec, b_vec, a_vec); + } +} + +/// Store 4 interleaved channels (RGBA) with type demotion. +/// For matching types, uses Highway's native StoreInterleaved4. +/// For type demotion, manually interleaves and stores. 
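+/// Writes 4 * Lanes(d) interleaved values starting at ptr.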
+/// @param d Highway descriptor tag for the source SIMD type
+/// @param ptr Pointer to destination interleaved RGBA data
+/// @param r Red channel SIMD vector
+/// @param g Green channel SIMD vector
+/// @param b Blue channel SIMD vector
+/// @param a Alpha channel SIMD vector
+template<typename D, typename DstT, typename VecT>
+inline void
+StoreInterleaved4Demote(D d, DstT* ptr, VecT r, VecT g, VecT b, VecT a)
+{
+    using MathT = typename D::T;
+
+    if constexpr (std::is_same_v<DstT, MathT>) {
+        // No demotion needed - use Highway's optimized StoreInterleaved4
+        hn::StoreInterleaved4(r, g, b, a, d, ptr);
+    } else if constexpr (std::is_same_v<DstT, half>) {
+        // Special handling for half type - convert through hwy::float16_t
+        using T16 = hwy::float16_t;
+        auto d16  = hn::Rebind<T16, D>();
+
+        // Demote to float16_t
+        auto r16 = hn::DemoteTo(d16, r);
+        auto g16 = hn::DemoteTo(d16, g);
+        auto b16 = hn::DemoteTo(d16, b);
+        auto a16 = hn::DemoteTo(d16, a);
+
+        // Store interleaved float16_t data
+        hn::StoreInterleaved4(r16, g16, b16, a16, d16,
+                              reinterpret_cast<T16*>(ptr));
+    } else {
+        // Generic type demotion - use DemoteStore for each channel then interleave
+        const size_t N = hn::Lanes(d);
+
+        // Temporary arrays for demoted values
+        DstT r_demoted[hn::MaxLanes(d)];
+        DstT g_demoted[hn::MaxLanes(d)];
+        DstT b_demoted[hn::MaxLanes(d)];
+        DstT a_demoted[hn::MaxLanes(d)];
+
+        // Use DemoteStoreN to properly denormalize integer types
+        DemoteStoreN(d, r_demoted, r, N);
+        DemoteStoreN(d, g_demoted, g, N);
+        DemoteStoreN(d, b_demoted, b, N);
+        DemoteStoreN(d, a_demoted, a, N);
+
+        // Interleave the demoted values
+        for (size_t i = 0; i < N; ++i) {
+            ptr[i * 4 + 0] = r_demoted[i];
+            ptr[i * 4 + 1] = g_demoted[i];
+            ptr[i * 4 + 2] = b_demoted[i];
+            ptr[i * 4 + 3] = a_demoted[i];
+        }
+    }
+}
+
+// -----------------------------------------------------------------------
+// Rangecompress/Rangeexpand SIMD Kernels
+// -----------------------------------------------------------------------
+
+/// Apply rangecompress formula to a SIMD vector.
+/// Formula (courtesy Sony Pictures Imageworks):
+///     if (|x| <= 0.18) return x
+///     else return copysign(a + b * log(c * |x| + 1), x)
+///     where a = -0.545768857, b = 0.183516696, c = 284.357788
+/// @param d Highway descriptor tag
+/// @param x Input SIMD vector
+/// @return Compressed SIMD vector
+template<typename D, typename VecT>
+inline auto
+rangecompress_simd(D d, VecT x)
+{
+    using T = typename D::T;
+
+    // Constants from Sony Pictures Imageworks
+    constexpr T x1 = static_cast<T>(0.18);
+    constexpr T a  = static_cast<T>(-0.54576885700225830078);
+    constexpr T b  = static_cast<T>(0.18351669609546661377);
+    constexpr T c  = static_cast<T>(284.3577880859375);
+
+    auto abs_x            = hn::Abs(x);
+    auto mask_passthrough = hn::Le(abs_x, hn::Set(d, x1));
+
+    // compressed = a + b * log(c * |x| + 1.0)
+    auto c_vec      = hn::Set(d, c);
+    auto one        = hn::Set(d, static_cast<T>(1.0));
+    auto temp       = hn::MulAdd(c_vec, abs_x, one);  // c * |x| + 1.0
+    auto log_val    = hn::Log(d, temp);
+    auto b_vec      = hn::Set(d, b);
+    auto a_vec      = hn::Set(d, a);
+    auto compressed = hn::MulAdd(b_vec, log_val, a_vec);  // a + b * log
+
+    // Apply sign of original x
+    auto result = hn::CopySign(compressed, x);
+
+    // If |x| <= x1, return x; else return compressed
+    return hn::IfThenElse(mask_passthrough, x, result);
+}
+
+/// Apply rangeexpand formula to a SIMD vector (inverse of rangecompress).
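+/// Note: this kernel (and rangecompress_simd above) assumes hn::Exp / hn::Log
+/// are available, i.e. that this header pulls in Highway's contrib math
+/// routines (hwy/contrib/math/math-inl.h).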
+/// Formula: +/// if (|y| <= 0.18) return y +/// else x = exp((|y| - a) / b); x = (x - 1) / c +/// if x < 0.18 then x = (-x_intermediate - 1) / c +/// return copysign(x, y) +/// @param d Highway descriptor tag +/// @param y Input SIMD vector (compressed values) +/// @return Expanded SIMD vector +template +inline auto +rangeexpand_simd(D d, VecT y) +{ + using T = typename D::T; + + // Constants (same as rangecompress) + constexpr T x1 = static_cast(0.18); + constexpr T a = static_cast(-0.54576885700225830078); + constexpr T b = static_cast(0.18351669609546661377); + constexpr T c = static_cast(284.3577880859375); + + auto abs_y = hn::Abs(y); + auto mask_passthrough = hn::Le(abs_y, hn::Set(d, x1)); + + // x_intermediate = exp((|y| - a) / b) + auto a_vec = hn::Set(d, a); + auto b_vec = hn::Set(d, b); + auto intermediate = hn::Div(hn::Sub(abs_y, a_vec), b_vec); // (|y| - a) / b + auto x_intermediate = hn::Exp(d, intermediate); + + // x = (x_intermediate - 1.0) / c + auto one = hn::Set(d, static_cast(1.0)); + auto c_vec = hn::Set(d, c); + auto x = hn::Div(hn::Sub(x_intermediate, one), c_vec); + + // If x < x1, use alternate solution: (-x_intermediate - 1.0) / c + auto mask_alternate = hn::Lt(x, hn::Set(d, x1)); + auto x_alternate = hn::Div(hn::Sub(hn::Neg(x_intermediate), one), c_vec); + x = hn::IfThenElse(mask_alternate, x_alternate, x); + + // Apply sign of input y + auto result = hn::CopySign(x, y); + + return hn::IfThenElse(mask_passthrough, y, result); +} + +OIIO_NAMESPACE_END diff --git a/src/libOpenImageIO/imagebufalgo_mad.cpp b/src/libOpenImageIO/imagebufalgo_mad.cpp index 5707fcd6ac..d8038a74a7 100644 --- a/src/libOpenImageIO/imagebufalgo_mad.cpp +++ b/src/libOpenImageIO/imagebufalgo_mad.cpp @@ -6,12 +6,19 @@ #include #include +#if defined(_WIN32) +# include // for alloca +#endif + #include #include #include #include #include +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +# include "imagebufalgo_hwy_pvt.h" +#endif #include "imageio_pvt.h" @@ -21,66 +28,91 @@ OIIO_NAMESPACE_3_1_BEGIN template static bool -mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C, - ROI roi, int nthreads) +mad_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + const ImageBuf& C, ROI roi, int nthreads) +{ + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + ImageBuf::Iterator r(R, roi); + ImageBuf::ConstIterator a(A, roi); + ImageBuf::ConstIterator b(B, roi); + ImageBuf::ConstIterator c(C, roi); + for (; !r.done(); ++r, ++a, ++b, ++c) { + for (int ch = roi.chbegin; ch < roi.chend; ++ch) + r[ch] = a[ch] * b[ch] + c[ch]; + } + }); + return true; +} + + + +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +template +static bool +mad_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, + const ImageBuf& C, ROI roi, int nthreads) { + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + auto Cv = HwyPixels(C); ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { - if ((std::is_same::value - || std::is_same::value) - && (std::is_same::value - || std::is_same::value) - // && R.localpixels() // has to be, because it's writable - && A.localpixels() && B.localpixels() - && C.localpixels() - // && R.contains_roi(roi) // has to be, because IBAPrep - && A.contains_roi(roi) && B.contains_roi(roi) && C.contains_roi(roi) - && roi.chbegin == 0 && roi.chend == R.nchannels() - && roi.chend == A.nchannels() && roi.chend == B.nchannels() - && roi.chend == C.nchannels()) { - // Special case when all inputs are either float or half, with in- - // memory 
contiguous data and we're operating on the full channel - // range: skip iterators: For these circumstances, we can operate on - // the raw memory very efficiently. Otherwise, we will need the - // magic of the the Iterators (and pay the price). - int nxvalues = roi.width() * R.nchannels(); - for (int z = roi.zbegin; z < roi.zend; ++z) - for (int y = roi.ybegin; y < roi.yend; ++y) { - Rtype* rraw = (Rtype*)R.pixeladdr(roi.xbegin, y, z); - const ABCtype* araw - = (const ABCtype*)A.pixeladdr(roi.xbegin, y, z); - const ABCtype* braw - = (const ABCtype*)B.pixeladdr(roi.xbegin, y, z); - const ABCtype* craw - = (const ABCtype*)C.pixeladdr(roi.xbegin, y, z); - OIIO_DASSERT(araw && braw && craw); - // The straightforward loop auto-vectorizes very well, - // there's no benefit to using explicit SIMD here. - for (int x = 0; x < nxvalues; ++x) - rraw[x] = araw[x] * braw[x] + craw[x]; - // But if you did want to explicitly vectorize, this is - // how it would look: - // int simdend = nxvalues & (~3); // how many float4's? - // for (int x = 0; x < simdend; x += 4) { - // simd::float4 a_simd(araw+x), b_simd(braw+x), c_simd(craw+x); - // simd::float4 r_simd = a_simd * b_simd + c_simd; - // r_simd.store (rraw+x); - // } - // for (int x = simdend; x < nxvalues; ++x) - // rraw[x] = araw[x] * braw[x] + craw[x]; + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels) + && ChannelsContiguous(Cv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const ABCtype* a_row = RoiRowPtr(Av, y, roi); + const ABCtype* b_row = RoiRowPtr(Bv, y, roi); + const ABCtype* c_row = RoiRowPtr(Cv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + // Use Highway SIMD for a*b+c (fused multiply-add) + RunHwyTernaryCmd(r_row, a_row, b_row, c_row, n, + [](auto d, auto a, auto b, + auto c) { + return hn::MulAdd(a, b, c); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const ABCtype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const ABCtype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + const ABCtype* c_ptr = ChannelPtr(Cv, x, y, + roi.chbegin); + for (int ch = 0; ch < nchannels; ++ch) { + r_ptr[ch] = static_cast( + static_cast(a_ptr[ch]) + * static_cast(b_ptr[ch]) + + static_cast(c_ptr[ch])); + } } - } else { - ImageBuf::Iterator r(R, roi); - ImageBuf::ConstIterator a(A, roi); - ImageBuf::ConstIterator b(B, roi); - ImageBuf::ConstIterator c(C, roi); - for (; !r.done(); ++r, ++a, ++b, ++c) { - for (int ch = roi.chbegin; ch < roi.chend; ++ch) - r[ch] = a[ch] * b[ch] + c[ch]; } } }); return true; } +#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY + +template +static bool +mad_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, const ImageBuf& C, + ROI roi, int nthreads) +{ +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels() && C.localpixels()) + return mad_impl_hwy(R, A, B, C, roi, nthreads); +#endif + return mad_impl_scalar(R, A, B, C, roi, nthreads); +} diff --git a/src/libOpenImageIO/imagebufalgo_muldiv.cpp b/src/libOpenImageIO/imagebufalgo_muldiv.cpp index 4fa1a6cba0..45c166907a 100644 --- a/src/libOpenImageIO/imagebufalgo_muldiv.cpp +++ b/src/libOpenImageIO/imagebufalgo_muldiv.cpp @@ -10,8 +10,15 @@ #include #include +#if defined(_WIN32) +# include // for alloca 
+#endif + #include +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +# include "imagebufalgo_hwy_pvt.h" +#endif #include #include #include @@ -86,8 +93,8 @@ ImageBufAlgo::scale(const ImageBuf& A, const ImageBuf& B, KWArgs options, template static bool -mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +mul_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -104,7 +111,8 @@ mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, template static bool -mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +mul_impl_scalar(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::ConstIterator a(A, roi); @@ -117,6 +125,106 @@ mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +template +static bool +mul_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd(r_row, a_row, b_row, n, + [](auto d, auto a, auto b) { + return hn::Mul(a, b); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + r_ptr[c] = static_cast( + static_cast(a_ptr[c]) + * static_cast(b_ptr[c])); + } + } + } + } + }); + return true; +} + +template +static bool +mul_impl_hwy(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + for (int y = roi.ybegin; y < roi.yend; ++y) { + std::byte* r_row = PixelBase(Rv, roi.xbegin, y); + const std::byte* a_row = PixelBase(Av, roi.xbegin, y); + for (int x = roi.xbegin; x < roi.xend; ++x) { + const size_t xoff = static_cast(x - roi.xbegin); + Rtype* r_ptr = reinterpret_cast( + r_row + xoff * Rv.pixel_bytes); + const Atype* a_ptr = reinterpret_cast( + a_row + xoff * Av.pixel_bytes); + + for (int c = roi.chbegin; c < roi.chend; ++c) { + r_ptr[c] = (Rtype)((SimdType)a_ptr[c] * (SimdType)b[c]); + } + } + } + }); + return true; +} +#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY + +template +static bool +mul_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) + return mul_impl_hwy(R, A, B, roi, nthreads); +#endif + return mul_impl_scalar(R, A, B, roi, nthreads); +} + +template +static bool +mul_impl(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) +{ +#if 
defined(OIIO_USE_HWY) && OIIO_USE_HWY + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels()) + return mul_impl_hwy(R, A, b, roi, nthreads); +#endif + return mul_impl_scalar(R, A, b, roi, nthreads); +} + static bool mul_impl_deep(ImageBuf& R, const ImageBuf& A, cspan b, ROI roi, int nthreads) @@ -198,8 +306,8 @@ ImageBufAlgo::mul(Image_or_Const A, Image_or_Const B, ROI roi, int nthreads) template static bool -div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, - int nthreads) +div_impl_scalar(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) { ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { ImageBuf::Iterator r(R, roi); @@ -216,6 +324,73 @@ div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +template +static bool +div_impl_hwy(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ + auto Rv = HwyPixels(R); + auto Av = HwyPixels(A); + auto Bv = HwyPixels(B); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const int nchannels = RoiNChannels(roi); + const bool contig = ChannelsContiguous(Rv, nchannels) + && ChannelsContiguous(Av, nchannels) + && ChannelsContiguous(Bv, nchannels); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + Rtype* r_row = RoiRowPtr(Rv, y, roi); + const Atype* a_row = RoiRowPtr(Av, y, roi); + const Btype* b_row = RoiRowPtr(Bv, y, roi); + + if (contig) { + size_t n = static_cast(roi.width()) + * static_cast(nchannels); + RunHwyCmd( + r_row, a_row, b_row, n, [](auto d, auto a, auto b) { + // Check for zero division: if b == 0, return 0 + auto zero = hn::Zero(d); + auto mask = hn::Eq(b, zero); + return hn::IfThenElse(mask, zero, hn::Div(a, b)); + }); + } else { + for (int x = roi.xbegin; x < roi.xend; ++x) { + Rtype* r_ptr = ChannelPtr(Rv, x, y, roi.chbegin); + const Atype* a_ptr = ChannelPtr(Av, x, y, + roi.chbegin); + const Btype* b_ptr = ChannelPtr(Bv, x, y, + roi.chbegin); + for (int c = 0; c < nchannels; ++c) { + float v = static_cast(b_ptr[c]); + r_ptr[c] = (v == 0.0f) + ? 
static_cast(0.0f) + : static_cast( + static_cast(a_ptr[c]) / v); + } + } + } + } + }); + return true; +} +#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY + +template +static bool +div_impl(ImageBuf& R, const ImageBuf& A, const ImageBuf& B, ROI roi, + int nthreads) +{ +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY + if (OIIO::pvt::enable_hwy && R.localpixels() && A.localpixels() + && B.localpixels()) + return div_impl_hwy(R, A, B, roi, nthreads); +#endif + return div_impl_scalar(R, A, B, roi, nthreads); +} + + + bool ImageBufAlgo::div(ImageBuf& dst, Image_or_Const A_, Image_or_Const B_, ROI roi, int nthreads) diff --git a/src/libOpenImageIO/imagebufalgo_xform.cpp b/src/libOpenImageIO/imagebufalgo_xform.cpp index 0abbb1ace8..ca3fa3e960 100644 --- a/src/libOpenImageIO/imagebufalgo_xform.cpp +++ b/src/libOpenImageIO/imagebufalgo_xform.cpp @@ -21,6 +21,10 @@ #include +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +# include "imagebufalgo_hwy_pvt.h" +#endif + OIIO_NAMESPACE_3_1_BEGIN @@ -932,11 +936,7 @@ ImageBufAlgo::fit(ImageBuf& dst, const ImageBuf& src, KWArgs options, ROI roi, OIIO::pvt::LoggedTimer logtime("IBA::fit"); static const ustring recognized[] = { - filtername_us, - filterwidth_us, - filterptr_us, - fillmode_us, - exact_us, + filtername_us, filterwidth_us, filterptr_us, fillmode_us, exact_us, #if 0 /* Not currently recognized */ wrap_us, edgeclamp_us, @@ -1072,15 +1072,39 @@ ImageBufAlgo::fit(const ImageBuf& src, KWArgs options, ROI roi, int nthreads) template static bool -resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, - int nthreads) +resample_scalar(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) { + // This operates just like the internals of ImageBuf::interppixel(), but + // reuses the provided iterator to avoid the overhead of constructing a new + // one each time. This speeds it up by 20x! The iterator `it` must already + // be associated with `img`, but it need not be positioned correctly. 
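+    // In outline (a sketch of what the lambda below does): shift (x,y) by -0.5
+    // to texel centers, split into integer texel and fraction via floorfrac,
+    // rerange the iterator over the 2x2 neighborhood, gather each channel into
+    // a small scratch buffer, then bilerp with the two fractions.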
+ auto interppixel = + [](const ImageBuf& img, ImageBuf::ConstIterator& it, float x, + float y, span pixel, ImageBuf::WrapMode wrap) -> bool { + int n = std::min(int(pixel.size()), img.spec().nchannels); + float* localpixel = OIIO_ALLOCA(float, n * 4); + float* p[4] = { localpixel, localpixel + n, localpixel + 2 * n, + localpixel + 3 * n }; + x -= 0.5f; + y -= 0.5f; + int xtexel, ytexel; + float xfrac, yfrac; + xfrac = floorfrac(x, &xtexel); + yfrac = floorfrac(y, &ytexel); + it.rerange(xtexel, xtexel + 2, ytexel, ytexel + 2, 0, 1, wrap); + for (int i = 0; i < 4; ++i, ++it) + for (int c = 0; c < n; ++c) + p[i][c] = it[c]; //NOSONAR + bilerp(p[0], p[1], p[2], p[3], xfrac, yfrac, n, pixel.data()); + return true; + }; + OIIO_ASSERT(src.deep() == dst.deep()); ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { const ImageSpec& srcspec(src.spec()); const ImageSpec& dstspec(dst.spec()); int nchannels = src.nchannels(); - bool deep = src.deep(); // Local copies of the source image window, converted to float float srcfx = srcspec.full_x; @@ -1109,25 +1133,10 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, float s = (x - dstfx + 0.5f) * dstpixelwidth; float src_xf = srcfx + s * srcfw; int src_x = ifloor(src_xf); - if (deep) { - srcpel.pos(src_x, src_y, 0); - int nsamps = srcpel.deep_samples(); - OIIO_DASSERT(nsamps == out.deep_samples()); - if (!nsamps || nsamps != out.deep_samples()) - continue; - for (int c = 0; c < nchannels; ++c) { - if (dstspec.channelformat(c) == TypeDesc::UINT32) - for (int samp = 0; samp < nsamps; ++samp) - out.set_deep_value( - c, samp, srcpel.deep_value_uint(c, samp)); - else - for (int samp = 0; samp < nsamps; ++samp) - out.set_deep_value(c, samp, - srcpel.deep_value(c, samp)); - } - } else if (interpolate) { + if (interpolate) { // Non-deep image, bilinearly interpolate - src.interppixel(src_xf, src_yf, pel, ImageBuf::WrapClamp); + interppixel(src, srcpel, src_xf, src_yf, pel, + ImageBuf::WrapClamp); for (int c = roi.chbegin; c < roi.chend; ++c) out[c] = pel[c]; } else { @@ -1142,6 +1151,265 @@ resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, return true; } +static bool +resample_deep(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ + // If it's deep, figure out the sample allocations first, because + // it's not thread-safe to do that simultaneously with copying the + // values. 
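+    // Consequently this allocation pass runs serially on the calling thread;
+    // only the per-sample value copy further below uses parallel_image.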
+ const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + float dstpixelwidth = 1.0f / dstspec.full_width; + float dstpixelheight = 1.0f / dstspec.full_height; + ImageBuf::ConstIterator srcpel(src, roi); + ImageBuf::Iterator dstpel(dst, roi); + for (; !dstpel.done(); ++dstpel, ++srcpel) { + float s = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth; + float t = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight; + int src_y = ifloor(srcfy + t * srcfh); + int src_x = ifloor(srcfx + s * srcfw); + srcpel.pos(src_x, src_y, 0); + dstpel.set_deep_samples(srcpel.deep_samples()); + } + + OIIO_ASSERT(src.deep() == dst.deep()); + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + int nchannels = src.nchannels(); + + // Local copies of the source image window, converted to float + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + + float dstfx = dstspec.full_x; + float dstfy = dstspec.full_y; + float dstfw = dstspec.full_width; + float dstfh = dstspec.full_height; + float dstpixelwidth = 1.0f / dstfw; + float dstpixelheight = 1.0f / dstfh; + + ImageBuf::Iterator out(dst, roi); + ImageBuf::ConstIterator srcpel(src); + for (int y = roi.ybegin; y < roi.yend; ++y) { + // s,t are NDC space + float t = (y - dstfy + 0.5f) * dstpixelheight; + // src_xf, src_xf are image space float coordinates + float src_yf = srcfy + t * srcfh; + // src_x, src_y are image space integer coordinates of the floor + int src_y = ifloor(src_yf); + for (int x = roi.xbegin; x < roi.xend; ++x, ++out) { + float s = (x - dstfx + 0.5f) * dstpixelwidth; + float src_xf = srcfx + s * srcfw; + int src_x = ifloor(src_xf); + srcpel.pos(src_x, src_y, 0); + int nsamps = srcpel.deep_samples(); + OIIO_DASSERT(nsamps == out.deep_samples()); + if (!nsamps || nsamps != out.deep_samples()) + continue; + for (int c = 0; c < nchannels; ++c) { + if (dstspec.channelformat(c) == TypeDesc::UINT32) + for (int samp = 0; samp < nsamps; ++samp) + out.set_deep_value(c, samp, + srcpel.deep_value_uint(c, samp)); + else + for (int samp = 0; samp < nsamps; ++samp) + out.set_deep_value(c, samp, + srcpel.deep_value(c, samp)); + } + } + } + }); + + return true; +} + + + +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY +template +static bool +resample_hwy(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ + using SimdType + = std::conditional_t, double, float>; + using D = hn::ScalableTag; + using Rebind = hn::Rebind; + + ImageBufAlgo::parallel_image(roi, nthreads, [&](ROI roi) { + const ImageSpec& srcspec(src.spec()); + const ImageSpec& dstspec(dst.spec()); + + // Local copies of the source image window, converted to SimdType + float srcfx = srcspec.full_x; + float srcfy = srcspec.full_y; + float srcfw = srcspec.full_width; + float srcfh = srcspec.full_height; + + float dstfx = dstspec.full_x; + float dstfy = dstspec.full_y; + float dstfw = dstspec.full_width; + float dstfh = dstspec.full_height; + float dstpixelwidth = 1.0f / dstfw; + float dstpixelheight = 1.0f / dstfh; + + const size_t src_scanline_bytes = srcspec.scanline_bytes(); + const size_t dst_scanline_bytes = dstspec.scanline_bytes(); + const size_t src_pixel_bytes = srcspec.pixel_bytes(); + const size_t dst_pixel_bytes = 
dstspec.pixel_bytes(); + + const uint8_t* src_base = (const uint8_t*)src.localpixels(); + uint8_t* dst_base = (uint8_t*)dst.localpixels(); + + D d; + Rebind d_i32; + int N = hn::Lanes(d); + + for (int y = roi.ybegin; y < roi.yend; ++y) { + float t = (y - dstfy + 0.5f) * dstpixelheight; + float src_yf = srcfy + t * srcfh; + // Pixel-center convention: subtract 0.5 before interpolation + src_yf -= 0.5f; + int src_y = ifloor(src_yf); + SimdType fy = (SimdType)(src_yf - src_y); + + // Clamp Y to valid range + int src_y_clamped = clamp(src_y, src.ybegin(), src.yend() - 1); + // Neighbor Y (for bilinear) + int src_y_next_clamped = clamp(src_y + 1, src.ybegin(), + src.yend() - 1); + + // Pre-calculate row pointers + const uint8_t* row0 = src_base + + (src_y_clamped - src.ybegin()) + * src_scanline_bytes; + const uint8_t* row1 = src_base + + (src_y_next_clamped - src.ybegin()) + * src_scanline_bytes; + + uint8_t* dst_row = dst_base + + (y - dst.ybegin()) * dst_scanline_bytes; + + for (int x = roi.xbegin; x < roi.xend; x += N) { + // Handle remaining pixels if less than N + int n = std::min(N, roi.xend - x); + + // Compute src_xf for N pixels + auto idx_i32 = hn::Iota(d_i32, (float)x); + + auto x_simd = hn::ConvertTo(d, idx_i32); + auto s = hn::Mul(hn::Sub(hn::Add(x_simd, + hn::Set(d, (SimdType)0.5f)), + hn::Set(d, (SimdType)dstfx)), + hn::Set(d, (SimdType)dstpixelwidth)); + auto src_xf_vec = hn::MulAdd(s, hn::Set(d, (SimdType)srcfw), + hn::Set(d, (SimdType)srcfx)); + // Pixel-center convention: subtract 0.5 before interpolation + src_xf_vec = hn::Sub(src_xf_vec, hn::Set(d, (SimdType)0.5f)); + + auto src_x_vec = hn::Floor(src_xf_vec); + auto fx = hn::Sub(src_xf_vec, src_x_vec); + auto ix = hn::ConvertTo(d_i32, src_x_vec); + + // Clamp X + auto min_x = hn::Set(d_i32, src.xbegin()); + auto max_x = hn::Set(d_i32, src.xend() - 1); + auto ix0 = hn::Min(hn::Max(ix, min_x), max_x); + auto ix1 + = hn::Min(hn::Max(hn::Add(ix, hn::Set(d_i32, 1)), min_x), + max_x); + + // Adjust to 0-based offset from buffer start + auto x_offset = hn::Sub(ix0, min_x); + auto x1_offset = hn::Sub(ix1, min_x); + + // Loop over channels + for (int c = roi.chbegin; c < roi.chend; ++c) { + // Manual gather loop for now to be safe with types and offsets + SimdType v00_arr[16], v01_arr[16], v10_arr[16], v11_arr[16]; + int32_t x0_arr[16], x1_arr[16]; + hn::Store(x_offset, d_i32, x0_arr); + hn::Store(x1_offset, d_i32, x1_arr); + + for (int i = 0; i < n; ++i) { + size_t off0 = (size_t)x0_arr[i] * src_pixel_bytes + + (size_t)c * sizeof(SRCTYPE); + size_t off1 = (size_t)x1_arr[i] * src_pixel_bytes + + (size_t)c * sizeof(SRCTYPE); + + auto load_val = [](const uint8_t* ptr) -> SimdType { + return (SimdType)(*(const SRCTYPE*)ptr); + }; + + v00_arr[i] = load_val(row0 + off0); + v01_arr[i] = load_val(row0 + off1); + v10_arr[i] = load_val(row1 + off0); + v11_arr[i] = load_val(row1 + off1); + } + + auto val00 = hn::Load(d, v00_arr); + auto val01 = hn::Load(d, v01_arr); + auto val10 = hn::Load(d, v10_arr); + auto val11 = hn::Load(d, v11_arr); + + // Bilinear Interpolation + auto one = hn::Set(d, (SimdType)1.0f); + auto w00 = hn::Mul(hn::Sub(one, fx), + hn::Sub(one, hn::Set(d, fy))); + auto w01 = hn::Mul(fx, hn::Sub(one, hn::Set(d, fy))); + auto w10 = hn::Mul(hn::Sub(one, fx), hn::Set(d, fy)); + auto w11 = hn::Mul(fx, hn::Set(d, fy)); + + // Use FMA (Fused Multiply-Add) for better performance + auto res = hn::Mul(val00, w00); + res = hn::MulAdd(val01, w01, + res); // res = res + val01 * w01 + res = hn::MulAdd(val10, w10, + res); // res = res + 
val10 * w10 + res = hn::MulAdd(val11, w11, + res); // res = res + val11 * w11 + + // Store + SimdType res_arr[16]; + hn::Store(res, d, res_arr); + for (int i = 0; i < n; ++i) { + DSTTYPE* dptr + = (DSTTYPE*)(dst_row + + (size_t)(x - roi.xbegin + i) + * dst_pixel_bytes + + (size_t)c * sizeof(DSTTYPE)); + *dptr = (DSTTYPE)res_arr[i]; + } + } + } + } + }); + return true; +} +#endif // defined(OIIO_USE_HWY) && OIIO_USE_HWY + +template +static bool +resample_(ImageBuf& dst, const ImageBuf& src, bool interpolate, ROI roi, + int nthreads) +{ +#if defined(OIIO_USE_HWY) && OIIO_USE_HWY + if (OIIO::pvt::enable_hwy && dst.localpixels() && src.localpixels()) + return resample_hwy(dst, src, interpolate, roi, + nthreads); +#endif + + return resample_scalar(dst, src, interpolate, roi, + nthreads); +} bool @@ -1155,27 +1423,7 @@ ImageBufAlgo::resample(ImageBuf& dst, const ImageBuf& src, bool interpolate, return false; if (dst.deep()) { - // If it's deep, figure out the sample allocations first, because - // it's not thread-safe to do that simultaneously with copying the - // values. - const ImageSpec& srcspec(src.spec()); - const ImageSpec& dstspec(dst.spec()); - float srcfx = srcspec.full_x; - float srcfy = srcspec.full_y; - float srcfw = srcspec.full_width; - float srcfh = srcspec.full_height; - float dstpixelwidth = 1.0f / dstspec.full_width; - float dstpixelheight = 1.0f / dstspec.full_height; - ImageBuf::ConstIterator srcpel(src, roi); - ImageBuf::Iterator dstpel(dst, roi); - for (; !dstpel.done(); ++dstpel, ++srcpel) { - float s = (dstpel.x() - dstspec.full_x + 0.5f) * dstpixelwidth; - float t = (dstpel.y() - dstspec.full_y + 0.5f) * dstpixelheight; - int src_y = ifloor(srcfy + t * srcfh); - int src_x = ifloor(srcfx + s * srcfw); - srcpel.pos(src_x, src_y, 0); - dstpel.set_deep_samples(srcpel.deep_samples()); - } + return resample_deep(dst, src, interpolate, roi, nthreads); } bool ok; diff --git a/src/libOpenImageIO/imageio.cpp b/src/libOpenImageIO/imageio.cpp index 909f8529d4..aa8babf9b4 100644 --- a/src/libOpenImageIO/imageio.cpp +++ b/src/libOpenImageIO/imageio.cpp @@ -53,6 +53,7 @@ int png_linear_premult(0); int tiff_half(0); int tiff_multithread(1); int dds_bc5normal(0); +int enable_hwy(1); // Enable Google Highway SIMD optimizations by default int limit_channels(1024); int limit_imagesize_MB(std::min(32 * 1024, int(Sysutil::physical_memory() >> 20))); @@ -406,6 +407,10 @@ attribute(string_view name, TypeDesc type, const void* val) dds_bc5normal = *(const int*)val; return true; } + if (name == "enable_hwy" && type == TypeInt) { + enable_hwy = *(const int*)val; + return true; + } if (name == "limits:channels" && type == TypeInt) { limit_channels = *(const int*)val; return true; @@ -612,6 +617,10 @@ getattribute(string_view name, TypeDesc type, void* val) *(int*)val = dds_bc5normal; return true; } + if (name == "enable_hwy" && type == TypeInt) { + *(int*)val = enable_hwy; + return true; + } if (name == "oiio:print_uncaught_errors" && type == TypeInt) { *(int*)val = oiio_print_uncaught_errors; return true;