From 1fa81433ee1027cadc5fb2f4522b34b20a32aa80 Mon Sep 17 00:00:00 2001
From: Gioele Cantoni
Date: Wed, 29 Oct 2025 19:24:08 +0100
Subject: [PATCH 1/2] fix(android): add support for armeabi-v7a (arm NEON 32bit); fixes #30

---
 .github/workflows/main.yml |  10 ++-
 Makefile                   |  19 ++++-
 src/distance-neon.c        | 123 +++++++++++++++++++++++++++++++++++++
 3 files changed, 144 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7653ebe..d80183d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -11,7 +11,7 @@ jobs:
   build:
     runs-on: ${{ matrix.os }}
     container: ${{ matrix.container && matrix.container || '' }}
-    name: ${{ matrix.name }}${{ matrix.arch && format('-{0}', matrix.arch) || '' }} build${{ matrix.arch != 'arm64-v8a' && matrix.name != 'ios-sim' && matrix.name != 'ios' && matrix.name != 'apple-xcframework' && matrix.name != 'android-aar' && ( matrix.name != 'macos' || matrix.arch != 'x86_64' ) && ' + test' || ''}}
+    name: ${{ matrix.name }}${{ matrix.arch && format('-{0}', matrix.arch) || '' }} build${{ matrix.arch != 'arm64-v8a' && matrix.arch != 'armeabi-v7a' && matrix.name != 'ios-sim' && matrix.name != 'ios' && matrix.name != 'apple-xcframework' && matrix.name != 'android-aar' && ( matrix.name != 'macos' || matrix.arch != 'x86_64' ) && ' + test' || ''}}
     timeout-minutes: 20
     strategy:
       fail-fast: false
@@ -47,6 +47,10 @@ jobs:
         arch: arm64-v8a
         name: android
         make: PLATFORM=android ARCH=arm64-v8a
+      - os: ubuntu-22.04
+        arch: armeabi-v7a
+        name: android
+        make: PLATFORM=android ARCH=armeabi-v7a
       - os: ubuntu-22.04
         arch: x86_64
         name: android
@@ -140,7 +144,7 @@ jobs:
           security delete-keychain build.keychain

     - name: android setup test environment
-      if: matrix.name == 'android' && matrix.arch != 'arm64-v8a'
+      if: matrix.name == 'android' && matrix.arch != 'arm64-v8a' && matrix.arch != 'armeabi-v7a'
       run: |
         echo "::group::enable kvm group perms"

@@ -168,7 +172,7 @@ jobs:
         echo "::endgroup::"

     - name: android test sqlite-vector
-      if: matrix.name == 'android' && matrix.arch != 'arm64-v8a'
+      if: matrix.name == 'android' && matrix.arch != 'arm64-v8a' && matrix.arch != 'armeabi-v7a'
       uses: reactivecircus/android-emulator-runner@v2.34.0
       with:
         api-level: 26
diff --git a/Makefile b/Makefile
index e02aaba..70ea497 100644
--- a/Makefile
+++ b/Makefile
@@ -59,7 +59,7 @@ else ifeq ($(PLATFORM),macos)
 	STRIP = strip -x -S $@
 else ifeq ($(PLATFORM),android)
 	ifndef ARCH # Set ARCH to find Android NDK's Clang compiler, the user should set the ARCH
-		$(error "Android ARCH must be set to ARCH=x86_64 or ARCH=arm64-v8a")
+		$(error "Android ARCH must be set to ARCH=x86_64, ARCH=arm64-v8a, or ARCH=armeabi-v7a")
 	endif
 	ifndef ANDROID_NDK # Set ANDROID_NDK path to find android build tools; e.g. on MacOS: export ANDROID_NDK=/Users/username/Library/Android/sdk/ndk/25.2.9519653
 		$(error "Android NDK must be set")
@@ -67,8 +67,14 @@ else ifeq ($(PLATFORM),android)
 	BIN = $(ANDROID_NDK)/toolchains/llvm/prebuilt/$(HOST)-x86_64/bin
 	ifneq (,$(filter $(ARCH),arm64 arm64-v8a))
 		override ARCH := aarch64
+		ANDROID_ABI := android26
+	else ifeq ($(ARCH),armeabi-v7a)
+		override ARCH := armv7a
+		ANDROID_ABI := androideabi26
+	else
+		ANDROID_ABI := android26
 	endif
-	CC = $(BIN)/$(ARCH)-linux-android26-clang
+	CC = $(BIN)/$(ARCH)-linux-$(ANDROID_ABI)-clang
 	TARGET := $(DIST_DIR)/vector.so
 	LDFLAGS += -lm -shared
 	STRIP = $(BIN)/llvm-strip --strip-unneeded $@
@@ -184,11 +190,14 @@ $(DIST_DIR)/%.xcframework: $(LIB_NAMES)

 xcframework: $(DIST_DIR)/vector.xcframework

-AAR_ARM = packages/android/src/main/jniLibs/arm64-v8a/
+AAR_ARM64 = packages/android/src/main/jniLibs/arm64-v8a/
+AAR_ARM = packages/android/src/main/jniLibs/armeabi-v7a/
 AAR_X86 = packages/android/src/main/jniLibs/x86_64/
 aar:
-	mkdir -p $(AAR_ARM) $(AAR_X86)
+	mkdir -p $(AAR_ARM64) $(AAR_ARM) $(AAR_X86)
 	$(MAKE) clean && $(MAKE) PLATFORM=android ARCH=arm64-v8a
+	mv $(DIST_DIR)/vector.so $(AAR_ARM64)
+	$(MAKE) clean && $(MAKE) PLATFORM=android ARCH=armeabi-v7a
 	mv $(DIST_DIR)/vector.so $(AAR_ARM)
 	$(MAKE) clean && $(MAKE) PLATFORM=android ARCH=x86_64
 	mv $(DIST_DIR)/vector.so $(AAR_X86)
@@ -208,7 +217,7 @@ help:
 	@echo "  linux (default on Linux)"
 	@echo "  macos (default on macOS)"
 	@echo "  windows (default on Windows)"
-	@echo "  android (needs ARCH to be set to x86_64 or arm64-v8a and ANDROID_NDK to be set)"
+	@echo "  android (needs ARCH to be set to x86_64, arm64-v8a, or armeabi-v7a and ANDROID_NDK to be set)"
 	@echo "  ios (only on macOS)"
 	@echo "  ios-sim (only on macOS)"
 	@echo ""
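Note on the src/distance-neon.c hunks that follow: ARMv7 NEON provides neither the horizontal-max intrinsic vmaxv_u16 nor the float64x2_t type, so the 32-bit path emulates the former with two pairwise-max passes and replaces the f64 vector accumulators with scalar doubles. A minimal standalone check of the pairwise-max reduction (a hypothetical test harness, not part of the patch; it assumes only an ARM target with NEON enabled, e.g. compiled with -mfpu=neon):

// Hypothetical standalone test, not part of this patch: verifies that two
// vpmax_u16 passes reduce a uint16x4_t to its maximum lane, which is what
// vmaxv_u16_compat relies on where AArch64's vmaxv_u16 is unavailable.
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static inline uint16_t vmaxv_u16_compat(uint16x4_t v) {
    uint16x4_t m = vpmax_u16(v, v); // lanes: [max(v0,v1), max(v2,v3), max(v0,v1), max(v2,v3)]
    m = vpmax_u16(m, m);            // every lane now holds the overall max
    return vget_lane_u16(m, 0);
}

int main(void) {
    const uint16_t data[4] = {7, 42, 3, 19};
    uint16x4_t v = vld1_u16(data);
    printf("max = %u\n", (unsigned)vmaxv_u16_compat(v)); // prints: max = 42
    return 0;
}

vpmax_u16(v, v) computes pairwise maxima of adjacent lanes; applying it twice leaves the overall maximum in every lane, so any lane can be extracted.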
diff --git a/src/distance-neon.c b/src/distance-neon.c
index d2c227e..dd3b2f0 100644
--- a/src/distance-neon.c
+++ b/src/distance-neon.c
@@ -18,6 +18,17 @@
 extern distance_function_t dispatch_distance_table[VECTOR_DISTANCE_MAX][VECTOR_TYPE_MAX];
 extern char *distance_backend_name;

+// Helper function for 32-bit ARM: vmaxv_u16 is not available in ARMv7 NEON
+#if __SIZEOF_POINTER__ == 4
+static inline uint16_t vmaxv_u16_compat(uint16x4_t v) {
+    // Use pairwise max to reduce vector
+    uint16x4_t m = vpmax_u16(v, v); // [max(v0,v1), max(v2,v3), max(v0,v1), max(v2,v3)]
+    m = vpmax_u16(m, m);            // [max(all), max(all), max(all), max(all)]
+    return vget_lane_u16(m, 0);
+}
+#define vmaxv_u16 vmaxv_u16_compat
+#endif
+
 // MARK: FLOAT32 -

 float float32_distance_l2_impl_neon (const void *v1, const void *v2, int n, bool use_sqrt) {
@@ -158,6 +169,31 @@ float bfloat16_distance_l2_impl_neon (const void *v1, const void *v2, int n, boo
     const uint16_t *a = (const uint16_t *)v1;
     const uint16_t *b = (const uint16_t *)v2;

+#if __SIZEOF_POINTER__ == 4
+    // 32-bit ARM: use scalar double accumulation (no float64x2_t in NEON)
+    double sum = 0.0;
+    int i = 0;
+
+    for (; i <= n - 4; i += 4) {
+        uint16x4_t av16 = vld1_u16(a + i);
+        uint16x4_t bv16 = vld1_u16(b + i);
+
+        float32x4_t va = bf16x4_to_f32x4_u16(av16);
+        float32x4_t vb = bf16x4_to_f32x4_u16(bv16);
+        float32x4_t d = vsubq_f32(va, vb);
+        // mask-out NaNs: m = (d==d)
+        uint32x4_t m = vceqq_f32(d, d);
+        d = vbslq_f32(m, d, vdupq_n_f32(0.0f));
+
+        // Store and accumulate in scalar double
+        float tmp[4];
+        vst1q_f32(tmp, d);
+        for (int j = 0; j < 4; j++) {
+            double dj = (double)tmp[j];
+            sum = fma(dj, dj, sum);
+        }
+    }
+#else
     // Accumulate in f64 to avoid overflow from huge bf16 values.
     float64x2_t acc0 = vdupq_n_f64(0.0), acc1 = vdupq_n_f64(0.0);
     int i = 0;
@@ -205,6 +241,7 @@ float bfloat16_distance_l2_impl_neon (const void *v1, const void *v2, int n, boo
     }

     double sum = vaddvq_f64(vaddq_f64(acc0, acc1));
+#endif

     // scalar tail; treat NaN as 0, Inf as +Inf result
     for (; i < n; ++i) {
@@ -409,8 +446,15 @@ float float16_distance_l2_impl_neon (const void *v1, const void *v2, int n, bool
     const uint16x4_t SIGN_MASK = vdup_n_u16(0x8000u);
     const uint16x4_t ZERO16 = vdup_n_u16(0);

+#if __SIZEOF_POINTER__ == 4
+    // 32-bit ARM: use scalar double accumulation
+    double sum = 0.0;
+    int i = 0;
+#else
+    // 64-bit ARM: use float64x2_t NEON intrinsics
     float64x2_t acc0 = vdupq_n_f64(0.0), acc1 = vdupq_n_f64(0.0);
     int i = 0;
+#endif

     for (; i <= n - 4; i += 4) {
         uint16x4_t av16 = vld1_u16(a + i);
@@ -443,6 +487,16 @@ float float16_distance_l2_impl_neon (const void *v1, const void *v2, int n, bool
         uint32x4_t m = vceqq_f32(d32, d32); /* true where not-NaN */
         d32 = vbslq_f32(m, d32, vdupq_n_f32(0.0f));

+#if __SIZEOF_POINTER__ == 4
+        // 32-bit ARM: accumulate in scalar double
+        float tmp[4];
+        vst1q_f32(tmp, d32);
+        for (int j = 0; j < 4; j++) {
+            double dj = (double)tmp[j];
+            sum = fma(dj, dj, sum);
+        }
+#else
+        // 64-bit ARM: use NEON f64 operations
         float64x2_t dlo = vcvt_f64_f32(vget_low_f32(d32));
         float64x2_t dhi = vcvt_f64_f32(vget_high_f32(d32));
 #if defined(__ARM_FEATURE_FMA)
@@ -451,10 +505,13 @@ float float16_distance_l2_impl_neon (const void *v1, const void *v2, int n, bool
 #else
         acc0 = vaddq_f64(acc0, vmulq_f64(dlo, dlo));
         acc1 = vaddq_f64(acc1, vmulq_f64(dhi, dhi));
+#endif
 #endif
     }

+#if __SIZEOF_POINTER__ != 4
     double sum = vaddvq_f64(vaddq_f64(acc0, acc1));
+#endif

     /* tail (scalar; same Inf/NaN policy) */
     for (; i < n; ++i) {
@@ -487,10 +544,17 @@ float float16_distance_cosine_neon (const void *v1, const void *v2, int n) {
     const uint16x4_t FRAC_MASK = vdup_n_u16(0x03FFu);
     const uint16x4_t ZERO16 = vdup_n_u16(0);

+#if __SIZEOF_POINTER__ == 4
+    // 32-bit ARM: use scalar double accumulation
+    double dot = 0.0, normx = 0.0, normy = 0.0;
+    int i = 0;
+#else
+    // 64-bit ARM: use float64x2_t NEON intrinsics
     float64x2_t acc_dot_lo = vdupq_n_f64(0.0), acc_dot_hi = vdupq_n_f64(0.0);
     float64x2_t acc_a2_lo = vdupq_n_f64(0.0), acc_a2_hi = vdupq_n_f64(0.0);
     float64x2_t acc_b2_lo = vdupq_n_f64(0.0), acc_b2_hi = vdupq_n_f64(0.0);
     int i = 0;
+#endif

     for (; i <= n - 4; i += 4) {
         uint16x4_t av16 = vld1_u16(a + i);
@@ -512,6 +576,19 @@ float float16_distance_cosine_neon (const void *v1, const void *v2, int n) {
         ax = vbslq_f32(mx, ax, vdupq_n_f32(0.0f));
         by = vbslq_f32(my, by, vdupq_n_f32(0.0f));

+#if __SIZEOF_POINTER__ == 4
+        // 32-bit ARM: accumulate in scalar double
+        float ax_tmp[4], by_tmp[4];
+        vst1q_f32(ax_tmp, ax);
+        vst1q_f32(by_tmp, by);
+        for (int j = 0; j < 4; j++) {
+            double x = (double)ax_tmp[j];
+            double y = (double)by_tmp[j];
+            dot += x * y;
+            normx += x * x;
+            normy += y * y;
+        }
+#else
         /* widen to f64 and accumulate */
         float64x2_t ax_lo = vcvt_f64_f32(vget_low_f32(ax)), ax_hi = vcvt_f64_f32(vget_high_f32(ax));
         float64x2_t by_lo = vcvt_f64_f32(vget_low_f32(by)), by_hi = vcvt_f64_f32(vget_high_f32(by));
@@ -530,12 +607,15 @@ float float16_distance_cosine_neon (const void *v1, const void *v2, int n) {
         acc_a2_hi = vaddq_f64(acc_a2_hi, vmulq_f64(ax_hi, ax_hi));
         acc_b2_lo = vaddq_f64(acc_b2_lo, vmulq_f64(by_lo, by_lo));
         acc_b2_hi = vaddq_f64(acc_b2_hi, vmulq_f64(by_hi, by_hi));
+#endif
 #endif
     }

+#if __SIZEOF_POINTER__ != 4
     double dot = vaddvq_f64(vaddq_f64(acc_dot_lo, acc_dot_hi));
     double normx= vaddvq_f64(vaddq_f64(acc_a2_lo, acc_a2_hi));
     double normy= vaddvq_f64(vaddq_f64(acc_b2_lo, acc_b2_hi));
+#endif

     /* tail (scalar) */
     for (; i < n; ++i) {
@@ -569,8 +649,15 @@ float float16_distance_dot_neon (const void *v1, const void *v2, int n) {
     const uint16x4_t FRAC_MASK = vdup_n_u16(0x03FFu);
     const uint16x4_t ZERO16 = vdup_n_u16(0);

+#if __SIZEOF_POINTER__ == 4
+    // 32-bit ARM: use scalar double accumulation
+    double dot = 0.0;
+    int i = 0;
+#else
+    // 64-bit ARM: use float64x2_t NEON intrinsics
     float64x2_t acc_lo = vdupq_n_f64(0.0), acc_hi = vdupq_n_f64(0.0);
     int i = 0;
+#endif

     for (; i <= n - 4; i += 4) {
         uint16x4_t av16 = vld1_u16(a + i);
@@ -588,7 +675,11 @@ float float16_distance_dot_neon (const void *v1, const void *v2, int n) {
                 if (isnan(x) || isnan(y)) continue;
                 double p = (double)x * (double)y;
                 if (isinf(p)) return (p>0)? -INFINITY : INFINITY;
+#if __SIZEOF_POINTER__ == 4
+                dot += p;
+#else
                 acc_lo = vsetq_lane_f64(vgetq_lane_f64(acc_lo,0)+p, acc_lo, 0); /* cheap add */
+#endif
             }
             continue;
         }
@@ -603,13 +694,26 @@ float float16_distance_dot_neon (const void *v1, const void *v2, int n) {
         by = vbslq_f32(my, by, vdupq_n_f32(0.0f));

         float32x4_t prod = vmulq_f32(ax, by);
+
+#if __SIZEOF_POINTER__ == 4
+        // 32-bit ARM: accumulate in scalar double
+        float prod_tmp[4];
+        vst1q_f32(prod_tmp, prod);
+        for (int j = 0; j < 4; j++) {
+            dot += (double)prod_tmp[j];
+        }
+#else
+        // 64-bit ARM: use NEON f64 operations
         float64x2_t lo = vcvt_f64_f32(vget_low_f32(prod));
         float64x2_t hi = vcvt_f64_f32(vget_high_f32(prod));
         acc_lo = vaddq_f64(acc_lo, lo);
         acc_hi = vaddq_f64(acc_hi, hi);
+#endif
     }

+#if __SIZEOF_POINTER__ != 4
     double dot = vaddvq_f64(vaddq_f64(acc_lo, acc_hi));
+#endif

     for (; i < n; ++i) {
         float x = float16_to_float32(a[i]);
@@ -635,8 +739,15 @@ float float16_distance_l1_neon (const void *v1, const void *v2, int n) {
     const uint16x4_t SIGN_MASK = vdup_n_u16(0x8000u);
     const uint16x4_t ZERO16 = vdup_n_u16(0);

+#if __SIZEOF_POINTER__ == 4
+    // 32-bit ARM: use scalar double accumulation
+    double sum = 0.0;
+    int i = 0;
+#else
+    // 64-bit ARM: use float64x2_t NEON intrinsics
     float64x2_t acc = vdupq_n_f64(0.0);
     int i = 0;
+#endif

     for (; i <= n - 4; i += 4) {
         uint16x4_t av16 = vld1_u16(a + i);
@@ -665,13 +776,25 @@ float float16_distance_l1_neon (const void *v1, const void *v2, int n) {
         uint32x4_t m = vceqq_f32(d, d); /* mask NaNs -> 0 */
         d = vbslq_f32(m, d, vdupq_n_f32(0.0f));

+#if __SIZEOF_POINTER__ == 4
+        // 32-bit ARM: accumulate in scalar double
+        float tmp[4];
+        vst1q_f32(tmp, d);
+        for (int j = 0; j < 4; j++) {
+            sum += (double)tmp[j];
+        }
+#else
+        // 64-bit ARM: use NEON f64 operations
         float64x2_t lo = vcvt_f64_f32(vget_low_f32(d));
         float64x2_t hi = vcvt_f64_f32(vget_high_f32(d));
         acc = vaddq_f64(acc, lo);
         acc = vaddq_f64(acc, hi);
+#endif
     }

+#if __SIZEOF_POINTER__ != 4
     double sum = vaddvq_f64(acc);
+#endif

     for (; i < n; ++i) {
         uint16_t ai=a[i], bi=b[i];
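Across all the kernels above, the 32-bit fallback is the same pattern: with no f64 vector registers on ARMv7, each float32x4_t partial result is spilled to a small array and accumulated in scalar double. A sketch of that pattern in isolation (a hypothetical helper; the patch inlines the same steps in each kernel, and this assumes only arm_neon.h and the C99 fma from math.h):

// Hypothetical illustration of the 32-bit accumulation pattern used above:
// NEON does the 4-wide work and NaN masking, then the partial results are
// spilled to a small array and squared/summed in scalar double.
#include <arm_neon.h>
#include <math.h>

static double l2_accumulate_f32x4(float32x4_t d, double sum) {
    uint32x4_t not_nan = vceqq_f32(d, d);         // only NaN lanes compare unequal to themselves
    d = vbslq_f32(not_nan, d, vdupq_n_f32(0.0f)); // NaN lanes -> 0, matching the patch's policy
    float tmp[4];
    vst1q_f32(tmp, d);                            // spill: no f64 vector registers on ARMv7
    for (int j = 0; j < 4; j++) {
        double dj = (double)tmp[j];
        sum = fma(dj, dj, sum);                   // sum += dj*dj with a single rounding
    }
    return sum;
}

The fma call mirrors the patched kernels: each squared term is added with one rounding instead of two, which keeps the scalar path close to the AArch64 vfmaq_f64 path in accuracy.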
From 2ed19c8cd0b28c78369fc970855309e5ce08ae37 Mon Sep 17 00:00:00 2001
From: Gioele Cantoni
Date: Wed, 5 Nov 2025 16:24:57 +0000
Subject: [PATCH 2/2] fix(distance-neon): update ARM pointer size checks to use _ARM32BIT_ macro for clarity

---
 src/distance-neon.c | 35 ++++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 15 deletions(-)

diff --git a/src/distance-neon.c b/src/distance-neon.c
index dd3b2f0..5900438 100644
--- a/src/distance-neon.c
+++ b/src/distance-neon.c
@@ -13,13 +13,18 @@
 #if defined(__ARM_NEON) || defined(__ARM_NEON__)

+
+#if __SIZEOF_POINTER__ == 4
+#define _ARM32BIT_ 1
+#endif
+
 #include <arm_neon.h>

 extern distance_function_t dispatch_distance_table[VECTOR_DISTANCE_MAX][VECTOR_TYPE_MAX];
 extern char *distance_backend_name;

 // Helper function for 32-bit ARM: vmaxv_u16 is not available in ARMv7 NEON
-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
 static inline uint16_t vmaxv_u16_compat(uint16x4_t v) {
     // Use pairwise max to reduce vector
     uint16x4_t m = vpmax_u16(v, v); // [max(v0,v1), max(v2,v3), max(v0,v1), max(v2,v3)]
     m = vpmax_u16(m, m);            // [max(all), max(all), max(all), max(all)]
@@ -169,7 +174,7 @@ float bfloat16_distance_l2_impl_neon (const void *v1, const void *v2, int n, boo
     const uint16_t *a = (const uint16_t *)v1;
     const uint16_t *b = (const uint16_t *)v2;

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
     // 32-bit ARM: use scalar double accumulation (no float64x2_t in NEON)
     double sum = 0.0;
     int i = 0;
@@ -446,7 +451,7 @@ float float16_distance_l2_impl_neon (const void *v1, const void *v2, int n, bool
     const uint16x4_t SIGN_MASK = vdup_n_u16(0x8000u);
     const uint16x4_t ZERO16 = vdup_n_u16(0);

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
     // 32-bit ARM: use scalar double accumulation
     double sum = 0.0;
     int i = 0;
@@ -487,7 +492,7 @@ float float16_distance_l2_impl_neon (const void *v1, const void *v2, int n, bool
         uint32x4_t m = vceqq_f32(d32, d32); /* true where not-NaN */
         d32 = vbslq_f32(m, d32, vdupq_n_f32(0.0f));

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
         // 32-bit ARM: accumulate in scalar double
         float tmp[4];
         vst1q_f32(tmp, d32);
@@ -509,7 +514,7 @@ float float16_distance_l2_impl_neon (const void *v1, const void *v2, int n, bool
 #endif
     }

-#if __SIZEOF_POINTER__ != 4
+#ifndef _ARM32BIT_
     double sum = vaddvq_f64(vaddq_f64(acc0, acc1));
 #endif

@@ -544,7 +549,7 @@ float float16_distance_cosine_neon (const void *v1, const void *v2, int n) {
     const uint16x4_t FRAC_MASK = vdup_n_u16(0x03FFu);
     const uint16x4_t ZERO16 = vdup_n_u16(0);

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
     // 32-bit ARM: use scalar double accumulation
     double dot = 0.0, normx = 0.0, normy = 0.0;
     int i = 0;
@@ -576,7 +581,7 @@ float float16_distance_cosine_neon (const void *v1, const void *v2, int n) {
         ax = vbslq_f32(mx, ax, vdupq_n_f32(0.0f));
         by = vbslq_f32(my, by, vdupq_n_f32(0.0f));

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
         // 32-bit ARM: accumulate in scalar double
         float ax_tmp[4], by_tmp[4];
         vst1q_f32(ax_tmp, ax);
@@ -611,7 +616,7 @@ float float16_distance_cosine_neon (const void *v1, const void *v2, int n) {
 #endif
     }

-#if __SIZEOF_POINTER__ != 4
+#ifndef _ARM32BIT_
     double dot = vaddvq_f64(vaddq_f64(acc_dot_lo, acc_dot_hi));
     double normx= vaddvq_f64(vaddq_f64(acc_a2_lo, acc_a2_hi));
     double normy= vaddvq_f64(vaddq_f64(acc_b2_lo, acc_b2_hi));
@@ -649,7 +654,7 @@ float float16_distance_dot_neon (const void *v1, const void *v2, int n) {
     const uint16x4_t FRAC_MASK = vdup_n_u16(0x03FFu);
     const uint16x4_t ZERO16 = vdup_n_u16(0);

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
     // 32-bit ARM: use scalar double accumulation
     double dot = 0.0;
     int i = 0;
@@ -675,7 +680,7 @@ float float16_distance_dot_neon (const void *v1, const void *v2, int n) {
                 if (isnan(x) || isnan(y)) continue;
                 double p = (double)x * (double)y;
                 if (isinf(p)) return (p>0)? -INFINITY : INFINITY;
-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
                 dot += p;
 #else
                 acc_lo = vsetq_lane_f64(vgetq_lane_f64(acc_lo,0)+p, acc_lo, 0); /* cheap add */
 #endif
@@ -700,7 +705,7 @@ float float16_distance_dot_neon (const void *v1, const void *v2, int n) {

         float32x4_t prod = vmulq_f32(ax, by);

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
         // 32-bit ARM: accumulate in scalar double
         float prod_tmp[4];
         vst1q_f32(prod_tmp, prod);
@@ -711,7 +716,7 @@ float float16_distance_dot_neon (const void *v1, const void *v2, int n) {
 #endif
     }

-#if __SIZEOF_POINTER__ != 4
+#ifndef _ARM32BIT_
     double dot = vaddvq_f64(vaddq_f64(acc_lo, acc_hi));
 #endif

@@ -739,7 +744,7 @@ float float16_distance_l1_neon (const void *v1, const void *v2, int n) {
     const uint16x4_t SIGN_MASK = vdup_n_u16(0x8000u);
     const uint16x4_t ZERO16 = vdup_n_u16(0);

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
     // 32-bit ARM: use scalar double accumulation
     double sum = 0.0;
     int i = 0;
@@ -776,7 +781,7 @@ float float16_distance_l1_neon (const void *v1, const void *v2, int n) {
         uint32x4_t m = vceqq_f32(d, d); /* mask NaNs -> 0 */
         d = vbslq_f32(m, d, vdupq_n_f32(0.0f));

-#if __SIZEOF_POINTER__ == 4
+#ifdef _ARM32BIT_
         // 32-bit ARM: accumulate in scalar double
         float tmp[4];
         vst1q_f32(tmp, d);
@@ -792,7 +797,7 @@ float float16_distance_l1_neon (const void *v1, const void *v2, int n) {
 #endif
     }

-#if __SIZEOF_POINTER__ != 4
+#ifndef _ARM32BIT_
     double sum = vaddvq_f64(acc);
 #endif
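Why both the 32-bit and 64-bit paths accumulate in double rather than float: bfloat16 shares float32's exponent range, so a single squared difference can already exceed FLT_MAX, which is what the "Accumulate in f64 to avoid overflow from huge bf16 values" comment refers to. A plain-C demonstration (hypothetical, no NEON required):

// Hypothetical demo of the overflow the patches guard against: the largest
// bfloat16 magnitudes are near FLT_MAX, so squaring a delta overflows
// float32 while double still has ample headroom.
#include <stdio.h>

int main(void) {
    float big = 3.0e38f;                 // representable in bfloat16's range
    float squared_f32 = big * big;       // overflows: prints inf
    double squared_f64 = (double)big * (double)big;
    printf("float32: %g\ndouble: %g\n", squared_f32, squared_f64);
    return 0;
}

With the Makefile change applied, the 32-bit library is built as `make PLATFORM=android ARCH=armeabi-v7a` (with ANDROID_NDK exported), which resolves CC to the armv7a-linux-androideabi26-clang driver, while arm64-v8a and x86_64 keep the plain android26 triples.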