From ca53d73f3d6b7160d6d8a7aae78d653f9a4579a9 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:43:40 -0500 Subject: [PATCH 1/4] Cleanup: remove FastBinarySearch --- CMakeLists.txt | 5 +- NOTICE.md | 4 +- README.md | 2 - csrc/common.cpp | 37 --- csrc/common.h | 23 +- csrc/cpu_ops.cpp | 4 +- csrc/cpu_ops.h | 2 +- csrc/mps_ops.h | 0 docs/source/index.mdx | 2 - include/AAlloc.h | 86 ----- include/Algo-Direct-Common.h | 341 -------------------- include/Algo-Direct2.h | 307 ------------------ include/AlgoXCodes.h | 23 -- include/BinAlgo.h | 77 ----- include/BinSearch.h | 11 - include/Portable.h | 205 ------------ include/SIMD.h | 589 ----------------------------------- include/Type.h | 221 ------------- 18 files changed, 6 insertions(+), 1933 deletions(-) delete mode 100644 csrc/common.cpp delete mode 100644 csrc/mps_ops.h delete mode 100644 include/AAlloc.h delete mode 100644 include/Algo-Direct-Common.h delete mode 100644 include/Algo-Direct2.h delete mode 100644 include/AlgoXCodes.h delete mode 100644 include/BinAlgo.h delete mode 100644 include/BinSearch.h delete mode 100644 include/Portable.h delete mode 100644 include/SIMD.h delete mode 100644 include/Type.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f88ac2b11..7d126d488 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ if(NOT CMAKE_BUILD_TYPE) endif() # Define included source files -set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp) +set(CPP_FILES csrc/cpu_ops.cpp csrc/pythonInterface.cpp) set(CUDA_FILES csrc/ops.cu csrc/kernels.cu) set(HIP_FILES csrc/ops.hip csrc/kernels.hip) set(MPS_FILES csrc/mps_ops.mm) @@ -158,7 +158,6 @@ if(BUILD_CUDA) string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") if(PTXAS_VERBOSE) - # Verbose? Outputs register usage information, and other things... string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v") endif() @@ -268,7 +267,7 @@ endif() set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) add_library(bitsandbytes SHARED ${SRC_FILES}) target_compile_features(bitsandbytes PUBLIC cxx_std_17) -target_include_directories(bitsandbytes PUBLIC csrc include) +target_include_directories(bitsandbytes PUBLIC csrc) if (BUILD_CPU) if (OpenMP_CXX_FOUND) diff --git a/NOTICE.md b/NOTICE.md index 660658b05..b5591965d 100644 --- a/NOTICE.md +++ b/NOTICE.md @@ -1,3 +1 @@ -The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license. - -We thank Fabio Cannizzo for this work on FastBinarySearch which is included in this project. +The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: PyTorch is licensed under the BSD license. diff --git a/README.md b/README.md index 4413ad185..deb19877c 100644 --- a/README.md +++ b/README.md @@ -180,8 +180,6 @@ The continued maintenance and development of `bitsandbytes` is made possible tha ## License `bitsandbytes` is MIT licensed. -We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. - ## How to cite us If you found this library useful, please consider citing our work: diff --git a/csrc/common.cpp b/csrc/common.cpp deleted file mode 100644 index 834453370..000000000 --- a/csrc/common.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include - -void quantize_block(const quantize_block_args& args) { - // 1. 
find absmax in block - // 2. divide input value by absmax to normalize into [-1.0, 1.0] - // 3. do binary search to find the closest value - // 4. check minimal distance - // 5. store index - - // 1. find absmax in block - float absmax_block = -FLT_MAX; - for (long long i = args.block_idx; i < args.block_end; i++) - absmax_block = fmax(absmax_block, fabs(args.A[i])); - - args.absmax[args.block_idx / args.blocksize] = absmax_block; - - for (long long i = args.block_idx; i < args.block_end; i++) { - // 2. divide input value by absmax to normalize into [-1.0, 1.0] - // 3. do binary search to find the closest value - float normed_value = args.A[i] / absmax_block; - long long idx = args.bin_searcher->scalar(normed_value); - - // 4. check minimal distance - // The binary search returns always the value to the left, which might not be the closest value - if (idx < 255) { - float dist_left = fabs(normed_value - (args.code[idx])); - float dist_right = fabs(normed_value - (args.code[idx + 1])); - if (dist_right < dist_left) { - idx += 1; - } - } - - // 5. store index - args.out[i] = (unsigned char)idx; - } -} diff --git a/csrc/common.h b/csrc/common.h index 76b5d6aee..1496c0bc3 100644 --- a/csrc/common.h +++ b/csrc/common.h @@ -1,28 +1,7 @@ -#include - -#ifndef common -#define common - -using namespace BinSearch; +#pragma once typedef enum DataType_t { General8bit = 0, FP4 = 1, NF4 = 2, } DataType_t; - -struct quantize_block_args { - BinAlgo* bin_searcher; - float* code; - float* A; - float* absmax; - unsigned char* out; - long long block_end; - long long block_idx; - long long threadidx; - long long blocksize; -}; - -void quantize_block(const quantize_block_args& args); - -#endif diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index eb7225d7f..ed071395f 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,10 +1,8 @@ -#include -#include #include - #include #include #include +#include "cpu_ops.h" #ifdef HAS_OPENMP #include diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h index 6803b29f9..bf538c3ee 100644 --- a/csrc/cpu_ops.h +++ b/csrc/cpu_ops.h @@ -3,11 +3,11 @@ #include #include -#include #include #include #include #include +#include "common.h" #if defined(_OPENMP) #include diff --git a/csrc/mps_ops.h b/csrc/mps_ops.h deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 064420cf7..8f69bb58f 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -9,5 +9,3 @@ bitsandbytes enables accessible large language models via k-bit quantization for # License bitsandbytes is MIT licensed. - -We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. diff --git a/include/AAlloc.h b/include/AAlloc.h deleted file mode 100644 index 6c2ae419f..000000000 --- a/include/AAlloc.h +++ /dev/null @@ -1,86 +0,0 @@ -#pragma once - -#include "Portable.h" - -namespace BinSearch { -namespace Details { - -template -bool isAligned(const T *p, size_t A) -{ - return (reinterpret_cast(p) % A) == 0; -} - -template -struct AlignedVec -{ - AlignedVec() - : m_storage(0) - , m_data(0) - , m_sz(0) - { - } - - static size_t nBytes(size_t sz) - { - return sz * sizeof(T) + A; - } - - static size_t shiftAmt(char *p) - { - return A>1? 
(A - (reinterpret_cast(p) % A)) % A: 0; - } - - void setPtr(char *p, size_t sz) - { - m_sz = sz; - m_data = reinterpret_cast(p + shiftAmt(p)); - } - - //void setPtr(T *p, size_t sz) - //{ - // m_sz = sz; - // if (A>1) - // myassert(((reinterpret_cast(p) % A) == 0), "bad alignment"); - // m_data = p; - //} - - // internal allocation - void resize(size_t sz) - { - m_storage = new char[nBytes(sz)]; - setPtr(m_storage, sz); - } - - // external allocation - void set(char *storage, size_t sz) - { - setPtr(storage, sz); - } - - ~AlignedVec() - { - if (m_storage) - delete [] m_storage; - } - - size_t size() const { return m_sz; } - T& operator[](size_t i) { return m_data[i]; } - const T& operator[](size_t i) const { return m_data[i]; } - T* begin() { return m_data; } - T* end() { return m_data+m_sz; } - const T* begin() const { return m_data; } - const T* end() const { return m_data+m_sz; } - T& front() { return m_data[0]; } - T& back() { return m_data[m_sz-1]; } - const T& front() const { return m_data[0]; } - const T& back() const { return m_data[m_sz - 1]; } - -private: - char *m_storage; - T *m_data; - size_t m_sz; -}; - -} // namespace Details -} // namespace BinSearch diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h deleted file mode 100644 index 7b40edea9..000000000 --- a/include/Algo-Direct-Common.h +++ /dev/null @@ -1,341 +0,0 @@ -#pragma once - -#include -#include -#include -#include "AAlloc.h" - -namespace BinSearch { -namespace Details { - -namespace DirectAux { - -#define SAFETY_MULTI_PASS true - -template -struct HResults -{ - HResults(T h, double ratio, size_t n) : H(h), hRatio(ratio), nInc(n) {} - T H; - double hRatio; - size_t nInc; -}; - - -#ifdef USE_FMA -template struct IsDirect { static const bool value = (A == Direct) || (A == DirectFMA); }; -template struct IsDirect2 { static const bool value = (A == Direct2) || (A == Direct2FMA); }; -template struct IsDirectCache { static const bool value = (A == DirectCache) || (A == DirectCacheFMA); }; -#else -template struct IsDirect { static const bool value = (A == Direct); }; -template struct IsDirect2 { static const bool value = (A == Direct2); }; -template struct IsDirectCache { static const bool value = (A == DirectCache); }; -#endif - -// general definition -template -struct BucketElem -{ - FORCE_INLINE void set( uint32 b, const T *) - { - m_b = b; - } - - FORCE_INLINE uint32 index() const { return m_b; } - -private: - uint32 m_b; -}; - -// specialization for DirectCache methods - -template struct MatchingIntType; -template <> struct MatchingIntType { typedef uint64 type; }; -template <> struct MatchingIntType { typedef uint32 type; }; - -template -struct BucketElem::value >::type > -{ - typedef typename MatchingIntType::type I; - - void set(uint32 b, const T *xi) - { - u.u.x = xi[b]; - u.u.b = b; - } - - FORCE_INLINE I index() const { return u.u.b; } - FORCE_INLINE T x() const { return u.u.x; } - -private: - union { - double dummy; - struct - { - T x; - I b; - } u; - } u; -}; - - -template -struct DirectTraits -{ - static void checkH(T scaler, T x0, T xN) - { - T Dn = xN - x0; - T ifmax = Dn * scaler; - myassert((ifmax < std::numeric_limits::max() - (Gap - 1)), - "Problem unfeasible: index size exceeds uint32 capacity:" - << " D[N] =" << Dn - << ", H =" << scaler - << ", H D[n] =" << ifmax << "\n" - ); - } - - FORCE_INLINE static uint32 f(T scaler, T x0, T z) - { - T tmp = scaler * (z - x0); -#ifdef USE_SSE2 - return ftoi(FVec1(tmp)); -#else - return static_cast(tmp); -#endif - } - - template - 
FORCE_INLINE static typename FTOITraits::vec_t f(const FVec& scaler, const FVec& x0, const FVec& z) - { - return ftoi(scaler*(z-x0)); - } - - static T cst0(T scaler, T x0) - { - return x0; - } -}; - -#ifdef USE_FMA -template -struct DirectTraits -{ - typedef FVec1 fVec1; - - static void checkH(T scaler, T H_Times_x0, T xN) - { - union { - typename FVec1::vec_t v; - T s; - } ifmax; - ifmax.v = mulSub(fVec1(scaler), fVec1(xN), fVec1(H_Times_x0)); - myassert((ifmax.s < std::numeric_limits::max() - (Gap - 1)), - "Problem unfeasible: index size exceeds uint32 capacity:" - << " H X[0] =" << H_Times_x0 - << ", H =" << scaler - << ", X[N] =" << xN - << ", H X[N] - H X[0] =" << ifmax.s << "\n" - ); - } - - FORCE_INLINE static uint32 f(T scaler, T Hx0, T xi) - { - return ftoi(mulSub(fVec1(scaler), fVec1(xi), fVec1(Hx0))); - } - - template - FORCE_INLINE static typename FTOITraits::vec_t f(const FVec& scaler, const FVec& H_Times_X0, const FVec& z) - { - return ftoi(mulSub(scaler, z, H_Times_X0)); - } - - static T cst0(T scaler, T x0) - { - return scaler*x0; - } -}; -#endif - -template -struct DirectInfo -{ - static const bool UseFMA = (A == DirectFMA) || (A == Direct2FMA) || (A == DirectCacheFMA); - typedef DirectTraits fun_t; - typedef BucketElem bucket_t; - typedef AlignedVec bucketvec_t; - - struct Data { - Data() : buckets(0), xi(0), scaler(0), cst0(0) {} - Data( const T *x // for Direct must persist if xws=NULL - , uint32 n - , T H - , bucket_t *bws // assumed to gave size nb, as computed below - , T *xws = NULL // assumed to have size (n+Gap-1). Optional for Direct, unused for DirectCache, required for DirectGap - ) - : buckets(bws) - , scaler(H) - , cst0(fun_t::cst0(H, x[0])) - { - myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned"); - - uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]); - - const uint32 npad = Gap-1; - const uint32 n_sz = n + npad; // size of padded vector - - if (xws) { - myassert(isAligned(xws,8), "x pointer not allocated or incorrectly aligned"); - std::fill_n(xws, npad, x[0]); // pad in front with x[0] - std::copy(x, x+n, xws + npad); - xi = xws; - } - else { - myassert((Gap==1), "if Gap>1 then X workspace must be provided"); - xi = x; - } - - populateIndex(bws, nb, xi, n_sz, scaler, cst0); - } - - const bucket_t *buckets; - const T *xi; - T scaler; - T cst0; // could be x0 or (scaler*x0), depending if we are using FMA or not - } data; - - static T growStep(T H) - { - T step; - T P = next(H); - while ((step = P - H) == 0) - P = next(P); - return step; - } - - static HResults computeH(const T *px, uint32 nx) - { - myassert((nx > Gap), "Array X too small"); - myassert(((Gap == 1) || (Gap == 2)), "Only tested for these values of Gap"); - - const T x0 = px[0]; - const T xN = px[nx-1]; - - const T range = xN - x0; - myassert((range < std::numeric_limits::max()), "range too large"); - - // check that D_i are strictly increasing and compute minimum value D_{i+Offset}-D_i - T deltaDMin = range; - for (uint32 i = Gap; i < nx; ++i) { - T Dnew = px[i] - x0; - T Dold = px[i - Gap] - x0; - myassert((Dnew > Dold), - "Problem unfeasible: D_i sequence not strictly increasing" - << " X[" << 0 << "]=" << x0 - << " X[" << i - Gap << "]=" << px[i - Gap] - << " X[" << i << "]=" << px[i] - << "\n" - ); - T deltaD = Dnew - Dold; - if (deltaD < deltaDMin) - deltaDMin = deltaD; - } - - // initial guess for H - const T H0 = T(1.0) / deltaDMin; - T H = H0; - - T cst0 = fun_t::cst0(H, x0); - fun_t::checkH(H, cst0, xN); - - // adjust H by trial and error 
until succeed - size_t nInc = 0; - bool modified = false; - size_t npasses = 0; - T step = growStep(H); - uint32 seg_already_checked_from = nx; - do { - myassert((npasses++ < 2), "verification failed\n"); - // if there has been an increase, then check only up to that point - uint32 last_seg_to_be_checked = seg_already_checked_from - 1; - modified = false; - uint32 inew = 0; - for (uint32 i = Gap; i <= last_seg_to_be_checked; ++i) { - uint32 iold = fun_t::f(H, cst0, px[i-Gap]); - uint32 inew = fun_t::f(H, cst0, px[i]); - while (inew == iold) { - seg_already_checked_from = i; - last_seg_to_be_checked = nx-1; // everything needs to be checked - modified = true; - H = H + step; - step *= 2; - // recalculate all constants and indices - cst0 = fun_t::cst0(H, x0); - fun_t::checkH(H, cst0, xN); - iold = fun_t::f(H, cst0, px[i - Gap]); - inew = fun_t::f(H, cst0, px[i]); - } - } - } while (SAFETY_MULTI_PASS && modified); - - return HResults(H, (((double)H) / H0) - 1.0, nInc); - } - - static void populateIndex(BucketElem *buckets, uint32 index_size, const T *px, uint32 x_size, T scaler, T cst0) - { - for (uint32 i = x_size-1, b = index_size-1, j=0; ; --i) { - uint32 idx = fun_t::f(scaler, cst0, px[i]); - while (b > idx) { // in the 1st iteration it is j=0 but this condition is always false - buckets[b].set( j, px ); - --b; - } - if (Gap==1 || b == idx) { // if Gap==1, which is known at compile time, the check b==idx is redundant - j = i - (Gap-1); // subtracting (Gap-1) points to the index of the first X-element to check - buckets[b].set(j, px); - if (b-- == 0) - break; - } - } - } - - DirectInfo(const Data& d) - : data(d) - { - } - - DirectInfo(const T* px, const uint32 n) - { - HResults res = computeH(px, n); - -#ifdef PAPER_TEST - nInc = res.nInc; - hRatio = res.hRatio; -#endif - const uint32 npad = Gap-1; - const uint32 n_sz = n + npad; // size of padded vector - - if (npad) - xi.resize(n_sz); - - T H = res.H; - T cst0 = fun_t::cst0(H, px[0]); - const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]); - buckets.resize(maxIndex + 1); - - data = Data(px, n, H, buckets.begin(), (npad? 
xi.begin(): NULL)); - } - -private: - bucketvec_t buckets; - AlignedVec xi; - -#ifdef PAPER_TEST -public: - double hRatio; - size_t nInc; -#endif -}; - - -} // namespace DirectAux -} // namespace Details -} // namespace BinSearch diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h deleted file mode 100644 index 547ca9955..000000000 --- a/include/Algo-Direct2.h +++ /dev/null @@ -1,307 +0,0 @@ -#pragma once - -#include "Algo-Direct-Common.h" - -namespace BinSearch { -namespace Details { - -template -struct AlgoScalarBase::value>::type> : DirectAux::DirectInfo<2, T, A> -{ -private: - typedef DirectAux::DirectInfo<2, T, A> base_t; - static const size_t Offset=2; - -public: - AlgoScalarBase(const T* x, const uint32 n) - : base_t(x, n) - { - } - - FORCE_INLINE uint32 scalar(T z) const - { - const T* px = base_t::data.xi; - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - uint32 bidx = base_t::fun_t::f(base_t::data.scaler, base_t::data.cst0, z); - uint32 iidx = buckets[bidx]; - px += iidx; - if (z < *px) - --iidx; - if (z < *(px+1)) - --iidx; - return iidx; - } -}; - - -template -struct AlgoVecBase::value>::type> : AlgoScalarBase -{ - static const uint32 nElem = sizeof(typename InstrFloatTraits::vec_t) / sizeof(T); - - typedef FVec fVec; - typedef IVec i128; - - struct Constants - { - fVec vscaler; - fVec vcst0; - IVec one; - }; - -private: - typedef AlgoScalarBase base_t; - -#ifdef USE_SSE2 - FORCE_INLINE - //NO_INLINE - void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const - { - union U { - __m128i vec; - uint32 ui32[4]; - } u; - - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - const float *xi = base_t::data.xi; - - // read indices t - const double *p3 = reinterpret_cast(&xi[(u.ui32[3] = buckets[bidx.get3()])]); - const double *p2 = reinterpret_cast(&xi[(u.ui32[2] = buckets[bidx.get2()])]); - const double *p1 = reinterpret_cast(&xi[(u.ui32[1] = buckets[bidx.get1()])]); - const double *p0 = reinterpret_cast(&xi[(u.ui32[0] = buckets[bidx.get0()])]); - -#if 0 - // read pairs ( X(t-1), X(t) ) - __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3)); - __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2)); - __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1)); - __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0)); - - // build: - // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } - // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } - __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6)); - __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6)); - __m128 u01 = _mm_unpacklo_ps(h02, h13); - __m128 u23 = _mm_unpackhi_ps(h02, h13); - __m128 vxm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6)); - __m128 vxp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6)); -#else - __m128 xp23 = _mm_castpd_ps(_mm_set_pd(*p3, *p2)); - __m128 xp01 = _mm_castpd_ps(_mm_set_pd(*p1, *p0)); - __m128 vxm = _mm_shuffle_ps(xp01, xp23, (0) + (2 << 2) + (0 << 4) + (2 << 6)); - __m128 vxp = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6)); -#endif - IVec i(u.vec); - IVec vlem = vz < vxm; - IVec vlep = vz < vxp; - i = i + vlem + vlep; - i.store(pr); - } - - FORCE_INLINE - //NO_INLINE - void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const - { - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - const double *xi = base_t::data.xi; - - uint32 b1 = buckets[bidx.get1()]; - uint32 b0 = buckets[bidx.get0()]; - - const double *p1 = &xi[b1]; - const double *p0 = &xi[b0]; - - // read pairs ( X(t-1), X(t) ) - __m128d vx1 = _mm_loadu_pd(p1); - __m128d vx0 
= _mm_loadu_pd(p0); - - // build: - // { X(t(0)-1), X(t(1)-1) } - // { X(t(0)), X(t(1)) } - __m128d vxm = _mm_shuffle_pd(vx0, vx1, 0); - __m128d vxp = _mm_shuffle_pd(vx0, vx1, 3); - - IVec i(b1, b0); - IVec vlem = (vz < vxm); - IVec vlep = (vz < vxp); - i = i + vlem + vlep; - - union { - __m128i vec; - uint32 ui32[4]; - } u; - u.vec = i; - pr[0] = u.ui32[0]; - pr[1] = u.ui32[2]; - } -#endif // USE_SSE2 - -#ifdef USE_AVX - - FORCE_INLINE - //NO_INLINE - void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const - { - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - const float *xi = base_t::data.xi; - -#if 0 // use gather instructions - - IVec idxm; - idxm.setidx(buckets, bidx); - __m256i z = _mm256_setzero_si256(); - IVec minusone = _mm256_cmpeq_epi32(z,z); - IVec idxp = idxm - minusone; - - FVec vxm = _mm256_i32gather_ps(xi, idxm, sizeof(float)); - FVec vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float)); - IVec ip = idxm; - -#else // do not use gather instructions - - union U { - __m256i vec; - uint32 ui32[8]; - } u; - - // read indices t - - const double *p7 = reinterpret_cast(&xi[(u.ui32[7] = buckets[bidx.get7()])]); - const double *p6 = reinterpret_cast(&xi[(u.ui32[6] = buckets[bidx.get6()])]); - const double *p5 = reinterpret_cast(&xi[(u.ui32[5] = buckets[bidx.get5()])]); - const double *p4 = reinterpret_cast(&xi[(u.ui32[4] = buckets[bidx.get4()])]); - const double *p3 = reinterpret_cast(&xi[(u.ui32[3] = buckets[bidx.get3()])]); - const double *p2 = reinterpret_cast(&xi[(u.ui32[2] = buckets[bidx.get2()])]); - const double *p1 = reinterpret_cast(&xi[(u.ui32[1] = buckets[bidx.get1()])]); - const double *p0 = reinterpret_cast(&xi[(u.ui32[0] = buckets[bidx.get0()])]); - -#if 0 // perform 8 loads in double precision - - // read pairs ( X(t-1), X(t) ) - __m128 xp7 = _mm_castpd_ps(_mm_load_sd(p7)); - __m128 xp6 = _mm_castpd_ps(_mm_load_sd(p6)); - __m128 xp5 = _mm_castpd_ps(_mm_load_sd(p5)); - __m128 xp4 = _mm_castpd_ps(_mm_load_sd(p4)); - __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3)); - __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2)); - __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1)); - __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0)); - - // build: - // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } - // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } - __m128 h57 = _mm_shuffle_ps(xp5, xp7, (1 << 2) + (1 << 6)); // F- F+ H- H+ - __m128 h46 = _mm_shuffle_ps(xp4, xp6, (1 << 2) + (1 << 6)); // E- E+ G- G+ - __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6)); // B- B+ D- D+ - __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6)); // A- A+ C- C+ - - __m128 u01 = _mm_unpacklo_ps(h02, h13); // A- B- A+ B+ - __m128 u23 = _mm_unpackhi_ps(h02, h13); // C- D- C+ D+ - __m128 u45 = _mm_unpacklo_ps(h46, h57); // E- F- E+ F+ - __m128 u67 = _mm_unpackhi_ps(h46, h57); // G- H- G+ H+ - - __m128 abcdm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6)); // A- B- C- D- - __m128 abcdp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6)); // A+ B+ C+ D+ - __m128 efghm = _mm_shuffle_ps(u45, u67, (0) + (1 << 2) + (0 << 4) + (1 << 6)); // E- F- G- H- - __m128 efghp = _mm_shuffle_ps(u45, u67, (2) + (3 << 2) + (2 << 4) + (3 << 6)); // E+ F+ G+ H+ - - FVec vxp = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdm), efghm, 1); - FVec vxm = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdp), efghp, 1); - - IVec ip(u.vec); - -#else // use __mm256_set_pd - - // read pairs ( X(t-1), X(t) ) - __m256 x0145 = _mm256_castpd_ps(_mm256_set_pd(*p5, *p4, *p1, *p0)); // { x0(t-1), x0(t), x1(t-1), 
x1(t), x4(t-1), x4(t), x5(t-1), x5(t) } - __m256 x2367 = _mm256_castpd_ps(_mm256_set_pd(*p7, *p6, *p3, *p2)); // { x2(t-1), x2(t), x3(t-1), x3(t), x6(t-1), x6(t), x7(t-1), x7(t) } - - // { x0(t-1), x1(t-1), x2(t-1), 3(t-1, x4(t-1), x5(t-1), x6(t-1), xt(t-1) } - FVec vxm = _mm256_shuffle_ps(x0145, x2367, 0 + (2 << 2) + (0 << 4) + (2 << 6) ); - // { x0(t), x1(t), x2(t), 3(t, x4(t), x5(t), x6(t), xt(t) } - FVec vxp = _mm256_shuffle_ps(x0145, x2367, 1 + (3 << 2) + (1 << 4) + (3 << 6) ); - - IVec ip(u.vec); - -#endif - -#endif - - IVec vlem = vz < vxm; - IVec vlep = vz < vxp; - ip = ip + vlem + vlep; - - ip.store(pr); - } - - - - FORCE_INLINE - //NO_INLINE - void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const - { - union { - __m256i vec; - uint64 ui64[4]; - } u; - - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - const double *xi = base_t::data.xi; - - // read indices t - const double *p3 = &xi[(u.ui64[3] = buckets[bidx.get3()])]; - const double *p2 = &xi[(u.ui64[2] = buckets[bidx.get2()])]; - const double *p1 = &xi[(u.ui64[1] = buckets[bidx.get1()])]; - const double *p0 = &xi[(u.ui64[0] = buckets[bidx.get0()])]; - - // read pairs ( X(t-1), X(t) ) - __m128d xp3 = _mm_loadu_pd(p3); - __m128d xp2 = _mm_loadu_pd(p2); - __m128d xp1 = _mm_loadu_pd(p1); - __m128d xp0 = _mm_loadu_pd(p0); - - // build: - // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } - // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } - __m256d x02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xp0), xp2, 1); - __m256d x13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xp1), xp3, 1); - FVec vxm = _mm256_unpacklo_pd(x02,x13); - FVec vxp = _mm256_unpackhi_pd(x02,x13); - - -// __m128d h01m = _mm_shuffle_pd(xp0, xp1, 0); -// __m128d h23m = _mm_shuffle_pd(xp2, xp3, 0); -// __m128d h01p = _mm_shuffle_pd(xp0, xp1, 3); -// __m128d h23p = _mm_shuffle_pd(xp2, xp3, 3); -// FVec vxm = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01m), h23m, 1); -// FVec vxp = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01p), h23p, 1); - - IVec i(u.vec); - IVec vlem = vz < vxm; - IVec vlep = vz < vxp; - i = i + vlem + vlep; - i.extractLo32s().store(pr); - } -#endif - -public: - - AlgoVecBase(const T* x, const uint32 n) : base_t(x, n) {} - - void initConstants(Constants& cst) const - { - cst.vscaler.setN(base_t::data.scaler); - cst.vcst0.setN(base_t::data.cst0); - cst.one.setN(uint32(1)); - } - - void vectorial(uint32 *pr, const T *pz, const Constants& cst) const - { - fVec vz(pz); - resolve(vz, base_t::fun_t::f(cst.vscaler, cst.vcst0, vz), pr); - } -}; -} // namespace Details -} // namespace BinSearch diff --git a/include/AlgoXCodes.h b/include/AlgoXCodes.h deleted file mode 100644 index bdc9b00b6..000000000 --- a/include/AlgoXCodes.h +++ /dev/null @@ -1,23 +0,0 @@ -ALGOENUM(DirectCacheFMA, 5) -ALGOENUM(DirectFMA, 15) -ALGOENUM(Direct2FMA, 25) -ALGOENUM(DirectCache, 10) -ALGOENUM(Direct, 20) -ALGOENUM(Direct2, 30) -ALGOENUM(Nonary, 40) -ALGOENUM(Pentary, 50) -ALGOENUM(Ternary, 60) -ALGOENUM(Eytzinger, 70) -ALGOENUM(BitSet, 80) -ALGOENUM(ClassicOffset, 90) -#ifdef PAPER_TEST -ALGOENUM(MorinOffset, 100) -ALGOENUM(BitSetNoPad, 110) -ALGOENUM(ClassicMod, 120) -ALGOENUM(MorinBranchy, 130) -ALGOENUM(Classic, 140) -ALGOENUM(LowerBound, 145) -#ifdef USE_MKL -ALGOENUM(MKL, 150) -#endif -#endif diff --git a/include/BinAlgo.h b/include/BinAlgo.h deleted file mode 100644 index aac67a0c7..000000000 --- a/include/BinAlgo.h +++ /dev/null @@ -1,77 +0,0 @@ -#pragma once - -#include "Type.h" -#include - -namespace BinSearch { - -template -struct BinAlgo : 
Details::BinAlgoBase -{ - typedef Details::BinAlgoBase base_t; - - BinAlgo(const T* px, const uint32 n) : base_t(px, n), x0(px[0]), xN(px[n-1]), N(n) {} - BinAlgo(const T* px, const uint32 n, const typename base_t::Data& d) : base_t(d), x0(px[0]), xN(px[n-1]), N(n) {} - - FORCE_INLINE - uint32 scalar(T z) const - { - if (!L || z >= x0) - if (!R || z < xN) - return base_t::scalar(z); - else - return N; - else - return std::numeric_limits::max(); - } - - - FORCE_INLINE - void vectorial(uint32 *pr, const T *pz, uint32 n) const - { - if (!L && !R) { - Details::Loop::loop(*this, pr, pz, n); - } - else { - const uint32 nElem = base_t::nElem; - const uint32 idealbufsize = 256; - const uint32 bufsize = nElem * (idealbufsize / nElem + ((idealbufsize % nElem) ? 1 : 0)); - T databuf[bufsize]; - uint32 resbuf[bufsize]; - uint32 indexbuf[bufsize]; - - uint32 *prend = pr + n; - while(pr != prend) { - uint32 cnt = 0; - uint32 niter = std::min(bufsize, (uint32)std::distance(pr,prend)); - for (uint32 j = 0; j < niter; ++j) { - T z = pz[j]; - // FIXME: use SSE2? - if (!L || z >= x0) - if (!R || z < xN) { - databuf[cnt] = z; - indexbuf[cnt] = j; - ++cnt; - } - else - pr[j] = N; - else - pr[j] = std::numeric_limits::max(); - } - // FIXME: merge these two loops - Details::Loop::loop(*this, resbuf, databuf, cnt); - for (uint32 j = 0; j < cnt; ++j) - pr[indexbuf[j]] = resbuf[j]; - pr += niter; - pz += niter; - } - } - } - - Details::CondData x0; - Details::CondData xN; - Details::CondData N; -}; - - -} // namespace BinSearch diff --git a/include/BinSearch.h b/include/BinSearch.h deleted file mode 100644 index 336f52963..000000000 --- a/include/BinSearch.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "AAlloc.h" -#include "BinAlgo.h" -#include "SIMD.h" - -#include -#include - - -#include "Algo-Direct2.h" diff --git a/include/Portable.h b/include/Portable.h deleted file mode 100644 index 59b6dc840..000000000 --- a/include/Portable.h +++ /dev/null @@ -1,205 +0,0 @@ -#pragma once -#include -#include -#include -#include - -#if defined(__aarch64__) -#ifdef __CUDACC__ -#undef USE_NEON // Doesn't work with nvcc, undefined symbols -#else -#include -#undef USE_NEON // Not yet implemented -#endif -#undef USE_AVX // x86_64 only -#undef USE_AVX2 // x86_64 only -#undef USE_SSE2 // x86_64 only -#undef USE_SSE41 // x86_64 only -#undef USE_SSE42 // x86_64 only -#undef USE_FMA // x86_64 only -#ifdef USE_NEON -typedef float32x4_t __m128; -typedef int32x4_t __m128i; -typedef float64x2_t __m128d; -#else -typedef struct {float a; float b; float c; float d;} __m128; -typedef struct {int a; int b; int c; int d;} __m128i; -typedef struct {double a; double b;} __m128d; -#endif -#elif defined(__powerpc64__) -#ifdef __CUDACC__ -#undef USE_VSX // Doesn't work with nvcc, undefined symbols -#else -#include -#undef USE_VSX // Not yet implemented -#endif -#undef USE_AVX // x86_64 only -#undef USE_AVX2 // x86_64 only -#undef USE_SSE2 // x86_64 only -#undef USE_SSE41 // x86_64 only -#undef USE_SSE42 // x86_64 only -#undef USE_FMA // x86_64 only -#ifdef USE_VSX -typedef vector float __m128; -typedef vector signed int __m128i; -typedef vector double __m128d; -#else -typedef struct {float a; float b; float c; float d;} __m128; -typedef struct {int a; int b; int c; int d;} __m128i; -typedef struct {double a; double b;} __m128d; -#endif -#else -#undef USE_NEON // ARM64 only -#undef USE_VSX // PPC only -#ifdef __FMA__ -#define USE_FMA -#endif -#if !defined(__SSE2__) && !defined(_MSC_VER) -#error Compiler must support SSE2 -#endif -#define 
USE_SSE2 - -#if defined(__aarch64__) -#else -#ifdef __AVX2__ -#define USE_AVX2 -#endif - -#ifdef __AVX__ -#define USE_AVX -#endif - - -#ifdef __SSE4_1__ -#define USE_SSE41 -#endif - -#ifdef __SSE4_2__ -#define USE_SSE42 -#endif -#endif -#endif - -#ifndef _MSC_VER -#include -#endif - -namespace BinSearch { - -#ifndef _MSC_VER -typedef int8_t int8; -typedef uint8_t uint8; -typedef int32_t int32; -typedef uint32_t uint32; -typedef int64_t int64; -typedef uint64_t uint64; -#else -typedef __int8 int8; -typedef unsigned __int8 uint8; -typedef __int32 int32; -typedef unsigned __int32 uint32; -typedef __int64 int64; -typedef unsigned __int64 uint64; -#endif - -namespace Details { - -#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } - -// log2 is not defined in VS2008 -#if defined(_MSC_VER) -inline uint32 log2 (uint32 val) { - if (val == 1) return 0; - uint32 ret = 0; - do { - ret++; - val >>= 1; - } while (val > 1); - return ret; -} -#endif - -#ifdef _DEBUG -#define DEBUG -#endif - -#ifdef _MSC_VER -# define FORCE_INLINE __forceinline -# define NO_INLINE __declspec(noinline) -#else -# define NO_INLINE __attribute__((noinline)) -# ifdef DEBUG -# define FORCE_INLINE NO_INLINE -# else -# define FORCE_INLINE __attribute__((always_inline)) inline -# endif -#endif - -#ifdef USE_AVX -#define COMISS "vcomiss" -#define COMISD "vcomisd" -#else -#define COMISS "comiss" -#define COMISD "comisd" -#endif - -// nextafter is not defined in VS2008 -#if defined(_MSC_VER) && (_MSC_VER <= 1500) -#include -inline float mynext(float x) -{ - return _nextafterf(x, std::numeric_limits::max()); -} - -inline double mynext(double x) -{ - return _nextafter(x, std::numeric_limits::max()); -} -inline float myprev(float x) -{ - return _nextafterf(x, -std::numeric_limits::max()); -} - -inline double myprev(double x) -{ - return _nextafter(x, -std::numeric_limits::max()); -} -#else -inline float mynext(float x) -{ - return std::nextafterf(x, std::numeric_limits::max()); -} - -inline double mynext(double x) -{ - return std::nextafter(x, std::numeric_limits::max()); -} -inline float myprev(float x) -{ - return std::nextafterf(x, -std::numeric_limits::max()); -} - -inline double myprev(double x) -{ - return std::nextafter(x, -std::numeric_limits::max()); -} -#endif - -template -inline T next(T x) -{ - for (int i = 0; i < 4; ++i) - x = mynext(x); - return x; -} - -template -inline T prev(T x) -{ - for (int i = 0; i < 4; ++i) - x = myprev(x); - return x; -} - -} // namespace Details -} // namespace BinSearch diff --git a/include/SIMD.h b/include/SIMD.h deleted file mode 100644 index 6222af04f..000000000 --- a/include/SIMD.h +++ /dev/null @@ -1,589 +0,0 @@ -#pragma once - -#include "Portable.h" - -#ifdef USE_SSE2 -#include -#if defined(USE_AVX) || defined(USE_AVX2) -#include -#else -#ifdef USE_SSE41 -#include -#endif -#endif -#endif - -namespace BinSearch { -namespace Details { - -template -struct FTOITraits{}; - -template -struct FVec; - -template -struct IVec; - -template -struct FVec1; - -template <> struct InstrFloatTraits -{ - typedef __m128 vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m128d vec_t; -}; - -} -} - -#if !defined(__aarch64__) && !defined(__powerpc64__) -#ifdef USE_SSE42 -#ifndef _MSC_VER -#include -#define popcnt32 _mm_popcnt_u32 -#else -#include -#define popcnt32 __popcnt -#endif -#else // USE_SSE42 -namespace BinSearch { -FORCE_INLINE int popcnt32(int x32) -{ - // strictly speaking 
this is not correct, as it ignores higher order bits - // however, this is only used on the resuot of movemask on a 128-bit register, which is 8 at most, so it is ok - // with 256-bit registers, SSE42 is defined, and we do not use this function - uint8 x = static_cast(x32); - x = (x & 0x55) + (x >> 1 & 0x55); - x = (x & 0x33) + (x >> 2 & 0x33); - x = (x & 0x0f) + (x >> 4 & 0x0f); - return x; -} -} // namespace -#endif - -#include "Type.h" - -namespace BinSearch { -namespace Details { - -template <> struct InstrIntTraits -{ - typedef __m128i vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m128 vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m128d vec_t; -}; - -template <> -struct FTOITraits -{ - typedef IVec vec_t; -}; - -#ifdef USE_AVX - -template <> -struct FTOITraits -{ - typedef IVec vec_t; -}; - -template <> struct InstrIntTraits -{ - typedef __m256i vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m256 vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m256d vec_t; -}; - -#endif - - -template -struct VecStorage -{ - typedef typename TR::vec_t vec_t; - - FORCE_INLINE operator vec_t&() { return vec; } - FORCE_INLINE operator const vec_t&() const { return vec; } - -protected: - FORCE_INLINE VecStorage() {} - FORCE_INLINE VecStorage(const vec_t& v) : vec( v ) {} - - vec_t vec; -}; - -template -struct IVecBase; - -template <> -struct IVecBase : VecStorage> -{ -protected: - FORCE_INLINE IVecBase() {} - FORCE_INLINE IVecBase( const vec_t& v) : VecStorage>( v ) {} -public: - FORCE_INLINE static vec_t zero() { return _mm_setzero_si128(); } - - FORCE_INLINE int32 get0() const { return _mm_cvtsi128_si32( vec ); } - - FORCE_INLINE void assignIf( const vec_t& val, const vec_t& mask ) - { -#ifdef USE_SSE41 - vec = _mm_blendv_epi8(vec, val, mask); -#else - vec = _mm_or_si128(_mm_andnot_si128(mask,vec), _mm_and_si128(mask,val)); -#endif - } - FORCE_INLINE void orIf(const vec_t& val, const vec_t& mask) - { - vec = _mm_or_si128(vec, _mm_and_si128(val,mask)); - } -}; - -template <> -struct IVec : IVecBase -{ - FORCE_INLINE IVec() {} - FORCE_INLINE IVec( int32 i ) : IVecBase( _mm_set1_epi32( i ) ) {} - FORCE_INLINE IVec( const vec_t& v) : IVecBase( v ) {} - FORCE_INLINE IVec( uint32 u3, uint32 u2, uint32 u1, uint32 u0) : IVecBase( _mm_set_epi32( u3, u2, u1, u0 ) ) {} - - void setN( int32 i ) { vec = _mm_set1_epi32( i ); } - -#ifdef USE_SSE41 - FORCE_INLINE int32 get1() const { return _mm_extract_epi32(vec, 1); } - FORCE_INLINE int32 get2() const { return _mm_extract_epi32(vec, 2); } - FORCE_INLINE int32 get3() const { return _mm_extract_epi32(vec, 3); } -#else - FORCE_INLINE int32 get1() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 1 ) ); } - FORCE_INLINE int32 get2() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 2 ) ); } - FORCE_INLINE int32 get3() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 3 ) ); } -#endif - - FORCE_INLINE void store( uint32 *pi ) const { _mm_storeu_si128( reinterpret_cast(pi), vec ); } - - FORCE_INLINE int countbit() - { - return popcnt32(_mm_movemask_ps(_mm_castsi128_ps(vec))); - } -}; - -template <> -struct IVec : IVecBase -{ - FORCE_INLINE IVec() {} - FORCE_INLINE IVec( int32 i ) : IVecBase( _mm_set1_epi64x( i ) ) {} - FORCE_INLINE IVec( const vec_t& v) : IVecBase( v ) {} - FORCE_INLINE IVec( uint64 u1, uint64 u0 ) : IVecBase( _mm_set_epi64x(u1, u0) ) {} - - void setN( int32 i ) { vec = _mm_set1_epi64x( i ); } - - FORCE_INLINE int32 get1() const - { -#ifdef USE_SSE41 - return 
_mm_extract_epi32(vec, 2); -#else - return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 2 ) ); -#endif - } - - // extract the 2 32 bits integers no. 0, 2 and store them in a __m128i - FORCE_INLINE IVec extractLo32s() const - { - return _mm_shuffle_epi32(vec, ((2 << 2) | 0)); - } - - FORCE_INLINE void store( uint32 *pi ) const - { - pi[0] = get0(); - pi[1] = get1(); - } - - FORCE_INLINE int countbit() - { -#if 1 - // takes 4 cycles - __m128i hi = _mm_shuffle_epi32(vec, 2); // 1 cycle - __m128i s = _mm_add_epi32(vec, hi); - int32 x = _mm_cvtsi128_si32(s); - return -x; -#else - // takes 6 cycles - return popcnt32(_mm_movemask_pd(_mm_castsi128_pd(vec))); -#endif - } -}; - -template -FORCE_INLINE IVec operator>> (const IVec& a, unsigned n) { return _mm_srli_epi32(a, n); } -template -FORCE_INLINE IVec operator<< (const IVec& a, unsigned n) { return _mm_slli_epi32(a, n); } -template -FORCE_INLINE IVec operator& (const IVec& a, const IVec& b ) { return _mm_and_si128( a, b ); } -template -FORCE_INLINE IVec operator| (const IVec& a, const IVec& b ) { return _mm_or_si128( a, b ); } -template -FORCE_INLINE IVec operator^ (const IVec& a, const IVec& b ) { return _mm_xor_si128( a, b ); } -template -FORCE_INLINE IVec operator+ (const IVec& a, const IVec& b ) { return _mm_add_epi32( a, b ); } -template -FORCE_INLINE IVec operator- (const IVec& a, const IVec& b ) { return _mm_sub_epi32( a, b ); } -#ifdef USE_SSE41 -template -FORCE_INLINE IVec min (const IVec& a, const IVec& b ) { return _mm_min_epi32( a, b ); } -#endif - -typedef VecStorage> FVec128Float; - -template <> -struct FVec1 : FVec128Float -{ - FORCE_INLINE FVec1() {} - FORCE_INLINE FVec1( float f ) : FVec128Float( _mm_load_ss( &f ) ) {} - FORCE_INLINE FVec1( const vec_t& v ): FVec128Float( v ) {} - - FORCE_INLINE float get0() const { return _mm_cvtss_f32( vec ); } -}; - -template <> -struct FVec : FVec128Float -{ - FORCE_INLINE FVec() {} - FORCE_INLINE FVec( float f ) : FVec128Float( _mm_set1_ps( f ) ) {} - FORCE_INLINE FVec( const float *v ) : FVec128Float( _mm_loadu_ps( v ) ) {} - FORCE_INLINE FVec( const vec_t& v) : FVec128Float(v) {} - FORCE_INLINE FVec( float f3, float f2, float f1, float f0 ) : FVec128Float( _mm_set_ps(f3, f2, f1, f0) ) {} - - void set0( float f ) { vec = _mm_load_ss( &f ); } - void setN( float f ) { vec = _mm_set1_ps( f ); } - - FORCE_INLINE void setidx( const float *xi, const IVec& idx ) - { - uint32 i0 = idx.get0(); - uint32 i1 = idx.get1(); - uint32 i2 = idx.get2(); - uint32 i3 = idx.get3(); - vec = _mm_set_ps( xi[i3], xi[i2], xi[i1], xi[i0] ); - } - - FORCE_INLINE float get0() const { return _mm_cvtss_f32( vec ); } - FORCE_INLINE float get1() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 1 ) ); } - FORCE_INLINE float get2() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 2 ) ); } - FORCE_INLINE float get3() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 3 ) ); } -}; - -FORCE_INLINE FVec1 operator+ (const FVec1& a, const FVec1& b) { return _mm_add_ss( a, b ); } -FORCE_INLINE FVec1 operator- (const FVec1& a, const FVec1& b) { return _mm_sub_ss( a, b ); } -FORCE_INLINE FVec1 operator* (const FVec1& a, const FVec1& b) { return _mm_mul_ss( a, b ); } -FORCE_INLINE FVec1 operator/ (const FVec1& a, const FVec1& b) { return _mm_div_ss( a, b ); } -FORCE_INLINE int ftoi (const FVec1& a) { return _mm_cvttss_si32(a); } -FORCE_INLINE IVec operator> (const FVec1& a, const FVec1& b) { return _mm_castps_si128( _mm_cmpgt_ss( a, b ) ); } -#ifdef USE_FMA -FORCE_INLINE FVec1 mulSub(const FVec1& a, const FVec1& b, 
const FVec1& c) { return _mm_fmsub_ss(a, b, c); } -#endif - -FORCE_INLINE FVec operator- (const FVec& a, const FVec& b) { return _mm_sub_ps( a, b ); } -FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } -FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } -FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } -#ifndef __clang__ // Conflicts with builtin operator -FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } -FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } -FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } -#endif -#ifdef USE_FMA -FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm_fmsub_ps(a, b, c); } -#endif - -typedef VecStorage> FVec128Double; - -template <> -struct FVec1 : FVec128Double -{ - FORCE_INLINE FVec1() {} - FORCE_INLINE FVec1( double f ) : FVec128Double( _mm_load_sd( &f ) ) {} - FORCE_INLINE FVec1( const vec_t& v ) : FVec128Double( v ) {} - - FORCE_INLINE double get0() const { return _mm_cvtsd_f64( vec ); } -}; - -template <> -struct FVec : FVec128Double -{ - FORCE_INLINE FVec() {} - FORCE_INLINE FVec( double d ) : FVec128Double( _mm_set1_pd( d ) ) {} - FORCE_INLINE FVec( const double *v ) : FVec128Double( _mm_loadu_pd( v ) ) {} - FORCE_INLINE FVec( const vec_t& v) : FVec128Double( v ) {} - FORCE_INLINE FVec( double f1, double f0 ) : FVec128Double( _mm_set_pd(f1, f0) ) {} - - void set0( double f ) { vec = _mm_load_sd( &f ); } - void setN( double f ) { vec = _mm_set1_pd( f ); } - - FORCE_INLINE void setidx( const double *xi, const IVec& idx ) - { - vec = _mm_set_pd( xi[idx.get1()], xi[idx.get0()] ); - } - - FORCE_INLINE double get0() const { return _mm_cvtsd_f64( vec ); } - FORCE_INLINE double get1() const { return _mm_cvtsd_f64( _mm_shuffle_pd( vec, vec, 1 ) ); }; -}; - -FORCE_INLINE FVec1 operator+ (const FVec1& a, const FVec1& b) { return _mm_add_sd( a, b ); } -FORCE_INLINE FVec1 operator- (const FVec1& a, const FVec1& b) { return _mm_sub_sd( a, b ); } -FORCE_INLINE FVec1 operator* (const FVec1& a, const FVec1& b) { return _mm_mul_sd( a, b ); } -FORCE_INLINE FVec1 operator/ (const FVec1& a, const FVec1& b) { return _mm_div_sd( a, b ); } -FORCE_INLINE int ftoi (const FVec1& a) { return _mm_cvttsd_si32(a); } -FORCE_INLINE IVec operator> (const FVec1& a, const FVec1& b) { return _mm_castpd_si128( _mm_cmpgt_sd( a, b ) ); } -#ifdef USE_FMA -FORCE_INLINE FVec1 mulSub(const FVec1& a, const FVec1& b, const FVec1& c) { return _mm_fmsub_sd(a, b, c); } -#endif - -FORCE_INLINE FVec operator- (const FVec& a, const FVec& b) { return _mm_sub_pd( a, b ); } -FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } -FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } -FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } -#ifndef __clang__ // Conflicts with builtin operator -FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } -FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } -FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } -#endif -#ifdef USE_FMA -FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& 
c ) { return _mm_fmsub_pd(a, b, c); } -#endif - -#ifdef USE_AVX - -template <> -struct IVecBase : VecStorage> -{ -protected: - FORCE_INLINE IVecBase() {} - FORCE_INLINE IVecBase( const vec_t& v) : VecStorage>( v ) {} -public: - FORCE_INLINE static vec_t zero() { return _mm256_setzero_si256(); } - - FORCE_INLINE int32 get0() const { return _mm_cvtsi128_si32(_mm256_castsi256_si128(vec)); } - - FORCE_INLINE void assignIf( const vec_t& val, const vec_t& mask ) { vec = _mm256_blendv_epi8(vec, val, mask); } - FORCE_INLINE void orIf(const vec_t& val, const vec_t& mask) - { - vec = _mm256_blendv_epi8(vec, val, mask); - //vec = _mm256_or_si256(vec, _mm256_and_si256(val,mask)); - } - - FORCE_INLINE __m128i lo128() const { return _mm256_castsi256_si128(vec); } - FORCE_INLINE __m128i hi128() const { return _mm256_extractf128_si256(vec, 1); } -}; - -template <> -struct IVec : IVecBase -{ - FORCE_INLINE IVec() {} - FORCE_INLINE IVec( int32 i ) : IVecBase( _mm256_set1_epi32( i ) ) {} - FORCE_INLINE IVec( const vec_t& v) : IVecBase( v ) {} - FORCE_INLINE IVec(uint32 u7, uint32 u6, uint32 u5, uint32 u4, uint32 u3, uint32 u2, uint32 u1, uint32 u0) : IVecBase(_mm256_set_epi32(u7, u6, u5, u4, u3, u2, u1, u0)) {} - - void setN( int32 i ) { vec = _mm256_set1_epi32( i ); } - - FORCE_INLINE int32 get1() const { return _mm256_extract_epi32(vec, 1); } - FORCE_INLINE int32 get2() const { return _mm256_extract_epi32(vec, 2); } - FORCE_INLINE int32 get3() const { return _mm256_extract_epi32(vec, 3); } - FORCE_INLINE int32 get4() const { return _mm256_extract_epi32(vec, 4); } - FORCE_INLINE int32 get5() const { return _mm256_extract_epi32(vec, 5); } - FORCE_INLINE int32 get6() const { return _mm256_extract_epi32(vec, 6); } - FORCE_INLINE int32 get7() const { return _mm256_extract_epi32(vec, 7); } - - FORCE_INLINE void setidx( const uint32 *bi, const IVec& idx ) - { - vec = _mm256_i32gather_epi32(reinterpret_cast(bi), idx, sizeof(uint32)); - } - - FORCE_INLINE void store( uint32 *pi ) const { _mm256_storeu_si256( reinterpret_cast(pi), vec ); } - - FORCE_INLINE int countbit() - { - return popcnt32(_mm256_movemask_ps(_mm256_castsi256_ps(vec))); - } -}; - -template <> -struct IVec : IVecBase -{ - FORCE_INLINE IVec() {} - FORCE_INLINE IVec( int32 i ) : IVecBase( _mm256_set1_epi64x( i ) ) {} - FORCE_INLINE IVec( const vec_t& v) : IVecBase( v ) {} - FORCE_INLINE IVec(uint64 u3, uint64 u2, uint64 u1, uint64 u0) : IVecBase(_mm256_set_epi64x(u3, u2, u1, u0)) {} - - void setN( int32 i ) { vec = _mm256_set1_epi64x( i ); } - - // extract the 4 32 bits integers no. 
0, 2, 4, 6 and store them in a __m128i - FORCE_INLINE IVec extractLo32s() const - { - union { - uint32 u32[4]; - __m128i u; - } mask = {0,2,4,6}; - //__m256 ps256 = _mm256_castsi256_ps(vec); - //__m128 lo128 = _mm256_castps256_ps128(ps256); - //__m128 hi128 = _mm256_extractf128_ps(ps256, 1); - //__m128 blend = _mm_shuffle_ps(lo128, hi128, 0 + (2<<2) + (0<<4) + (2<<6)); - __m256i blend = _mm256_permutevar8x32_epi32(vec, _mm256_castsi128_si256(mask.u)); - return _mm256_castsi256_si128(blend); - } - - //int32 get1() const { return _mm256_cvtsi256_si32( _mm256_shuffle_epi32( vec, 2 ) ); }; - FORCE_INLINE int32 get1() const { return _mm256_extract_epi32(vec, 2); } - - FORCE_INLINE void store( uint32 *pi ) const - { - extractLo32s().store(pi); - } - - FORCE_INLINE int countbit() - { - return popcnt32(_mm256_movemask_pd(_mm256_castsi256_pd(vec))); - } -}; - -template -FORCE_INLINE IVec operator>> (const IVec& a, unsigned n) { return _mm256_srli_epi32(a, n); } -template -FORCE_INLINE IVec operator<< (const IVec& a, unsigned n) { return _mm256_slli_epi32(a, n); } -template -FORCE_INLINE IVec operator& (const IVec& a, const IVec& b ) { return _mm256_and_si256( a, b ); } -template -FORCE_INLINE IVec operator| (const IVec& a, const IVec& b ) { return _mm256_or_si256( a, b ); } -template -FORCE_INLINE IVec operator^ (const IVec& a, const IVec& b ) { return _mm256_xor_si256( a, b ); } -template -FORCE_INLINE IVec min (const IVec& a, const IVec& b ) { return _mm256_min_epi32( a, b ); } - -FORCE_INLINE IVec operator+ (const IVec& a, const IVec& b ) { return _mm256_add_epi32( a, b ); } -FORCE_INLINE IVec operator- (const IVec& a, const IVec& b ) { return _mm256_sub_epi32( a, b ); } -FORCE_INLINE IVec operator+ (const IVec& a, const IVec& b ) { return _mm256_add_epi64( a, b ); } -FORCE_INLINE IVec operator- (const IVec& a, const IVec& b ) { return _mm256_sub_epi64( a, b ); } - - -typedef VecStorage> FVec256Float; - -template <> -struct FVec : FVec256Float -{ - FORCE_INLINE FVec() {} - FORCE_INLINE FVec( float f ) : FVec256Float( _mm256_set1_ps( f ) ) {} - FORCE_INLINE FVec( const float *v ) : FVec256Float( _mm256_loadu_ps( v ) ) {} - FORCE_INLINE FVec( const vec_t& v) : FVec256Float(v) {} - FORCE_INLINE FVec(float f7, float f6, float f5, float f4, float f3, float f2, float f1, float f0) : FVec256Float(_mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0)) {} - - //void set0( float f ) { vec = _mm256_load_ss( &f ); } - void setN( float f ) { vec = _mm256_set1_ps( f ); } - - FORCE_INLINE void setidx( const float *xi, const IVec& idx ) - { -#if 1 // use gather primitives - vec = _mm256_i32gather_ps (xi, idx, 4); -#elif 0 - uint32 i0 = idx.get0(); - uint32 i1 = idx.get1(); - uint32 i2 = idx.get2(); - uint32 i3 = idx.get3(); - uint32 i4 = idx.get4(); - uint32 i5 = idx.get5(); - uint32 i6 = idx.get6(); - uint32 i7 = idx.get7(); - vec = _mm256_set_ps( xi[i7], xi[i6], xi[i5], xi[i4], xi[i3], xi[i2], xi[i1], xi[i0] ); -#else - union { - __m256i vec; - uint32 ui32[8]; - } i; - i.vec = static_cast(idx); - vec = _mm256_set_ps(xi[i.ui32[7]], xi[i.ui32[6]], xi[i.ui32[5]], xi[i.ui32[4]], xi[i.ui32[3]], xi[i.ui32[2]], xi[i.ui32[1]], xi[i.ui32[0]]); -#endif - } - - FORCE_INLINE FVec lo128() const { return _mm256_castps256_ps128(vec); } - FORCE_INLINE FVec hi128() const { return _mm256_extractf128_ps(vec, 1); } - - //FORCE_INLINE float get0() const { return _mm256_cvtss_f32( vec ); } - //FORCE_INLINE float get1() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 1 ) ); } - //FORCE_INLINE float get2() const { return 
_mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 2 ) ); } - //FORCE_INLINE float get3() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 3 ) ); } -}; - -FORCE_INLINE FVec operator- (const FVec& a, const FVec& b) { return _mm256_sub_ps( a, b ); } -FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm256_mul_ps( a, b ); } -FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm256_div_ps( a, b ); } -FORCE_INLINE IVec ftoi (const FVec& a) { return _mm256_cvttps_epi32(a); } -FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm256_castps_si256( _mm256_cmp_ps( a, b, _CMP_LE_OS) ); } -FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm256_castps_si256( _mm256_cmp_ps( a, b, _CMP_GE_OS ) ); } -FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS )); } -#ifdef USE_FMA -FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm256_fmsub_ps(a, b, c); } -#endif - -typedef VecStorage> FVec256Double; - -template <> -struct FVec : FVec256Double -{ - FORCE_INLINE FVec() {} - FORCE_INLINE FVec( double d ) : FVec256Double( _mm256_set1_pd( d ) ) {} - FORCE_INLINE FVec( const double *v ) : FVec256Double( _mm256_loadu_pd( v ) ) {} - FORCE_INLINE FVec( const vec_t& v) : FVec256Double( v ) {} - FORCE_INLINE FVec(double d3, double d2, double d1, double d0) : FVec256Double(_mm256_set_pd(d3, d2, d1, d0)) {} - - //void set0( double f ) { vec = _mm256_load_sd( &f ); } - void setN( double f ) { vec = _mm256_set1_pd( f ); } - - FORCE_INLINE void setidx( const double *xi, const IVec& idx ) - { - vec = _mm256_i32gather_pd(xi, idx, 8); - } - - FORCE_INLINE void setidx( const double *xi, const IVec& idx ) - { - vec = _mm256_i64gather_pd(xi, idx, 8); - } - -// FORCE_INLINE double get0() const { return _mm256_cvtsd_f64( vec ); } -// FORCE_INLINE double get1() const { return _mm256_cvtsd_f64( _mm256_shuffle_pd( vec, vec, 1 ) ); }; -}; - -FORCE_INLINE FVec operator- (const FVec& a, const FVec& b) { return _mm256_sub_pd( a, b ); } -FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm256_mul_pd( a, b ); } -FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm256_div_pd( a, b ); } -FORCE_INLINE IVec ftoi (const FVec& a) { return _mm256_cvttpd_epi32(a); } -FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm256_castpd_si256(_mm256_cmp_pd( a, b, _CMP_LE_OS ) ); } -FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS)); } -FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm256_castpd_si256(_mm256_cmp_pd( a, b, _CMP_GE_OS ) ); } -#ifdef USE_FMA -FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm256_fmsub_pd(a, b, c); } -#endif - -#endif - -} // namespace Details -} // namespace BinSearch -#endif // !defined(__aarch64__) diff --git a/include/Type.h b/include/Type.h deleted file mode 100644 index 2c0ab6ed6..000000000 --- a/include/Type.h +++ /dev/null @@ -1,221 +0,0 @@ - #pragma once - -#include -#include -#include - -#include "Portable.h" - -using std::size_t; - -namespace BinSearch { - -enum InstrSet { Scalar, SSE, AVX, Neon, VSX }; - -#define ALGOENUM(x, b) x, -enum Algos - { -#include "AlgoXCodes.h" - }; -#undef ALGOENUM - -namespace Details { - - template - struct InstrIntTraits; - - template - struct InstrFloatTraits; - - // base class for algorithm supporting the method: - // uint32 
scalar(T z) const - template - struct AlgoScalarBase; - - // base class for algorithm supporting the following methods, constants and definitions: - // static const uint32 nElem - // struct Constants; - // void initConstants(Constants& cst) const - // void vectorial(uint32 *pr, const T *pz, const Constants& cst) const - // The function vectorial processes nElem items - template - struct AlgoVecBase; - - template struct IntTraits; - - template <> struct IntTraits - { - typedef uint32 itype; - }; - template <> struct IntTraits - { - typedef uint64 itype; - }; - - template - struct Body - { - template - FORCE_INLINE static void iteration(const Expr& e, uint32 *ri, const T* zi, const typename Expr::Constants& cst) - { - e.vectorial(ri, zi, cst); - Body::template iteration(e, ri + D, zi + D, cst); - } - - }; - - template <> - struct Body<0> - { - template - FORCE_INLINE static void iteration(const Expr& e, uint32 *ri, const T* zi, const H&) - { - } - }; - - template - struct Loop - { - typedef Algo algo_type; - static const uint32 M = 4; - static const uint32 D = algo_type::nElem; - - FORCE_INLINE static void loop(const algo_type& e, uint32 *ri, const T* zi, uint32 n) - { - typename algo_type::Constants cst; - e.initConstants(cst); - - uint32 j = 0; - while (j + (D*M) <= n) { - Details::Body::template iteration(e, ri + j, zi + j, cst); - j += (D*M); - } - while (j + D <= n) { - e.vectorial(ri + j, zi + j, cst); - j += D; - } - while (D > 1 && j < n) { - ri[j] = e.scalar(zi[j]); - j += 1; - } - } - }; - - template - struct _Pipeliner - { - template - FORCE_INLINE static void go(const Expr& e, Data* d) - { - e.template run(d); - _Pipeliner::go(e, d); - } - }; - - template - struct _Pipeliner - { - template - FORCE_INLINE static void go(const Expr& e, Data* d) - { - } - }; - - template - struct Pipeliner - { - template - FORCE_INLINE static void go(const Expr& e, Data* d) - { - _Pipeliner::go(e, d); - } - }; - - -#if 1 - template - char is_complete_impl(char (*)[sizeof(T)]); - - template - long is_complete_impl(...); - - template - struct IsComplete - { - static const bool value = sizeof(is_complete_impl(0)) == sizeof(char); - }; -#else - template - std::true_type is_complete_impl(T *); - - std::false_type is_complete_impl(...); - - template - struct IsComplete : decltype(is_complete_impl(std::declval())) {}; -#endif - -template -struct AlgoScalarToVec : AlgoScalarBase -{ - typedef AlgoScalarBase base_t; - - AlgoScalarToVec(const typename base_t::Data& d) : base_t(d) {} - AlgoScalarToVec(const T* px, const uint32 n) : base_t(px, n) {} - - static const uint32 nElem = 1; - - struct Constants - { - }; - - void initConstants(Constants& cst) const - { - } - - FORCE_INLINE - void vectorial(uint32 *pr, const T *pz, const Constants& cst) const - { - *pr = base_t::scalar(*pz); - } -}; - -template -struct conditional { typedef T type; }; - -template -struct conditional { typedef F type; }; - -template -struct CondData -{ - FORCE_INLINE CondData(T x) : v(x) {} - FORCE_INLINE operator const T&() const { return v;} -private: - T v; -}; - -template -struct CondData -{ - FORCE_INLINE CondData(T) {} - FORCE_INLINE operator const T() const { return 0;} -}; - -template -struct BinAlgoBase : Details::conditional< Details::IsComplete>::value - , Details::AlgoVecBase - , Details::AlgoScalarToVec - >::type -{ - typedef typename Details::conditional< Details::IsComplete>::value - , Details::AlgoVecBase - , Details::AlgoScalarToVec - >::type base_t; - - BinAlgoBase(const T* px, const uint32 n) : base_t(px, n) {} - 
BinAlgoBase(const typename base_t::Data& d) : base_t(d) {} -}; - -} // namespace Details - -} // namespace BinSearch From 3f932b40cf675eccad4e41e52a084f7633c912a5 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:50:55 -0500 Subject: [PATCH 2/4] Fix --- csrc/cpu_ops.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index ed071395f..7fd1b1722 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "cpu_ops.h" From f90467d5a77edbd1555757047ccb29a2c77a31a2 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:52:53 -0500 Subject: [PATCH 3/4] Fix lint --- csrc/cpu_ops.cpp | 4 ++-- csrc/cpu_ops.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 7fd1b1722..836624f27 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,9 +1,9 @@ -#include +#include "cpu_ops.h" #include #include #include +#include #include -#include "cpu_ops.h" #ifdef HAS_OPENMP #include diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h index bf538c3ee..3ecdad99b 100644 --- a/csrc/cpu_ops.h +++ b/csrc/cpu_ops.h @@ -1,13 +1,13 @@ #ifndef BITSANDBYTES_CPU_OPS_H #define BITSANDBYTES_CPU_OPS_H +#include "common.h" #include #include #include #include #include #include -#include "common.h" #if defined(_OPENMP) #include From dcbfcd3ebed239927c5874b9cefefa98313c538f Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:57:33 -0500 Subject: [PATCH 4/4] Fix --- csrc/ops.cu | 2 -- csrc/ops.hip | 2 -- 2 files changed, 4 deletions(-) diff --git a/csrc/ops.cu b/csrc/ops.cu index 6b9fa87bf..7b0c60cca 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -3,7 +3,6 @@ // This source code is licensed under the MIT license found in the // LICENSE file in the root directory of this source tree. -#include #include #include #include @@ -12,7 +11,6 @@ #define ERR_NOT_IMPLEMENTED 100 -using namespace BinSearch; using std::cout; using std::endl; diff --git a/csrc/ops.hip b/csrc/ops.hip index 794445bc7..26a2362e3 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -15,13 +15,11 @@ #include #endif #include -#include #include #include #define ERR_NOT_IMPLEMENTED 100 -using namespace BinSearch; using std::cout; using std::endl;
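
---

For context on what the removed code did: the deleted csrc/common.cpp quantize_block() normalized each value by the block absmax and then used BinSearch's bin_searcher->scalar() to find the code-table entry to the left of the normalized value, followed by a nearest-neighbour check. The sketch below is an illustrative, standard-library-only version of that closest-index lookup using std::upper_bound; the helper name find_closest_code is made up for illustration and is not claimed to match the replacement logic now living in csrc/cpu_ops.cpp.

    #include <algorithm>
    #include <cmath>

    // Return the index of the entry in `code` closest to `normed_value`.
    // `code` must be sorted ascending and hold `n` entries (256 for an 8-bit code table).
    static unsigned char find_closest_code(const float* code, int n, float normed_value) {
        // First entry strictly greater than normed_value; the one to its left is the floor.
        const float* it = std::upper_bound(code, code + n, normed_value);
        long long idx = (it - code) - 1;
        if (idx < 0)
            return 0; // normed_value is below the smallest code entry
        if (idx < n - 1) {
            // The floor entry is not necessarily the closest; compare with its right neighbour.
            float dist_left = std::fabs(normed_value - code[idx]);
            float dist_right = std::fabs(normed_value - code[idx + 1]);
            if (dist_right < dist_left)
                ++idx;
        }
        return static_cast<unsigned char>(idx);
    }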