From ca53d73f3d6b7160d6d8a7aae78d653f9a4579a9 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:43:40 -0500 Subject: [PATCH 1/4] Cleanup: remove FastBinarySearch --- CMakeLists.txt | 5 +- NOTICE.md | 4 +- README.md | 2 - csrc/common.cpp | 37 --- csrc/common.h | 23 +- csrc/cpu_ops.cpp | 4 +- csrc/cpu_ops.h | 2 +- csrc/mps_ops.h | 0 docs/source/index.mdx | 2 - include/AAlloc.h | 86 ----- include/Algo-Direct-Common.h | 341 -------------------- include/Algo-Direct2.h | 307 ------------------ include/AlgoXCodes.h | 23 -- include/BinAlgo.h | 77 ----- include/BinSearch.h | 11 - include/Portable.h | 205 ------------ include/SIMD.h | 589 ----------------------------------- include/Type.h | 221 ------------- 18 files changed, 6 insertions(+), 1933 deletions(-) delete mode 100644 csrc/common.cpp delete mode 100644 csrc/mps_ops.h delete mode 100644 include/AAlloc.h delete mode 100644 include/Algo-Direct-Common.h delete mode 100644 include/Algo-Direct2.h delete mode 100644 include/AlgoXCodes.h delete mode 100644 include/BinAlgo.h delete mode 100644 include/BinSearch.h delete mode 100644 include/Portable.h delete mode 100644 include/SIMD.h delete mode 100644 include/Type.h diff --git a/CMakeLists.txt b/CMakeLists.txt index f88ac2b11..7d126d488 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,7 +23,7 @@ if(NOT CMAKE_BUILD_TYPE) endif() # Define included source files -set(CPP_FILES csrc/common.cpp csrc/cpu_ops.cpp csrc/pythonInterface.cpp) +set(CPP_FILES csrc/cpu_ops.cpp csrc/pythonInterface.cpp) set(CUDA_FILES csrc/ops.cu csrc/kernels.cu) set(HIP_FILES csrc/ops.hip csrc/kernels.hip) set(MPS_FILES csrc/mps_ops.mm) @@ -158,7 +158,6 @@ if(BUILD_CUDA) string(APPEND CMAKE_CUDA_FLAGS " --use_fast_math") if(PTXAS_VERBOSE) - # Verbose? Outputs register usage information, and other things... string(APPEND CMAKE_CUDA_FLAGS " -Xptxas=-v") endif() @@ -268,7 +267,7 @@ endif() set_source_files_properties(${CPP_FILES} PROPERTIES LANGUAGE CXX) add_library(bitsandbytes SHARED ${SRC_FILES}) target_compile_features(bitsandbytes PUBLIC cxx_std_17) -target_include_directories(bitsandbytes PUBLIC csrc include) +target_include_directories(bitsandbytes PUBLIC csrc) if (BUILD_CPU) if (OpenMP_CXX_FOUND) diff --git a/NOTICE.md b/NOTICE.md index 660658b05..b5591965d 100644 --- a/NOTICE.md +++ b/NOTICE.md @@ -1,3 +1 @@ -The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: Pytorch is licensed under the BSD license. - -We thank Fabio Cannizzo for this work on FastBinarySearch which is included in this project. +The majority of bitsandbytes is licensed under MIT, however portions of the project are available under separate license terms: PyTorch is licensed under the BSD license. diff --git a/README.md b/README.md index 4413ad185..deb19877c 100644 --- a/README.md +++ b/README.md @@ -180,8 +180,6 @@ The continued maintenance and development of `bitsandbytes` is made possible tha ## License `bitsandbytes` is MIT licensed. -We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. - ## How to cite us If you found this library useful, please consider citing our work: diff --git a/csrc/common.cpp b/csrc/common.cpp deleted file mode 100644 index 834453370..000000000 --- a/csrc/common.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include - -void quantize_block(const quantize_block_args& args) { - // 1. 
find absmax in block - // 2. divide input value by absmax to normalize into [-1.0, 1.0] - // 3. do binary search to find the closest value - // 4. check minimal distance - // 5. store index - - // 1. find absmax in block - float absmax_block = -FLT_MAX; - for (long long i = args.block_idx; i < args.block_end; i++) - absmax_block = fmax(absmax_block, fabs(args.A[i])); - - args.absmax[args.block_idx / args.blocksize] = absmax_block; - - for (long long i = args.block_idx; i < args.block_end; i++) { - // 2. divide input value by absmax to normalize into [-1.0, 1.0] - // 3. do binary search to find the closest value - float normed_value = args.A[i] / absmax_block; - long long idx = args.bin_searcher->scalar(normed_value); - - // 4. check minimal distance - // The binary search returns always the value to the left, which might not be the closest value - if (idx < 255) { - float dist_left = fabs(normed_value - (args.code[idx])); - float dist_right = fabs(normed_value - (args.code[idx + 1])); - if (dist_right < dist_left) { - idx += 1; - } - } - - // 5. store index - args.out[i] = (unsigned char)idx; - } -} diff --git a/csrc/common.h b/csrc/common.h index 76b5d6aee..1496c0bc3 100644 --- a/csrc/common.h +++ b/csrc/common.h @@ -1,28 +1,7 @@ -#include - -#ifndef common -#define common - -using namespace BinSearch; +#pragma once typedef enum DataType_t { General8bit = 0, FP4 = 1, NF4 = 2, } DataType_t; - -struct quantize_block_args { - BinAlgo* bin_searcher; - float* code; - float* A; - float* absmax; - unsigned char* out; - long long block_end; - long long block_idx; - long long threadidx; - long long blocksize; -}; - -void quantize_block(const quantize_block_args& args); - -#endif diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index eb7225d7f..ed071395f 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,10 +1,8 @@ -#include -#include #include - #include #include #include +#include "cpu_ops.h" #ifdef HAS_OPENMP #include diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h index 6803b29f9..bf538c3ee 100644 --- a/csrc/cpu_ops.h +++ b/csrc/cpu_ops.h @@ -3,11 +3,11 @@ #include #include -#include #include #include #include #include +#include "common.h" #if defined(_OPENMP) #include diff --git a/csrc/mps_ops.h b/csrc/mps_ops.h deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 064420cf7..8f69bb58f 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -9,5 +9,3 @@ bitsandbytes enables accessible large language models via k-bit quantization for # License bitsandbytes is MIT licensed. - -We thank Fabio Cannizzo for his work on [FastBinarySearch](https://github.com/fabiocannizzo/FastBinarySearch) which we use for CPU quantization. diff --git a/include/AAlloc.h b/include/AAlloc.h deleted file mode 100644 index 6c2ae419f..000000000 --- a/include/AAlloc.h +++ /dev/null @@ -1,86 +0,0 @@ -#pragma once - -#include "Portable.h" - -namespace BinSearch { -namespace Details { - -template -bool isAligned(const T *p, size_t A) -{ - return (reinterpret_cast(p) % A) == 0; -} - -template -struct AlignedVec -{ - AlignedVec() - : m_storage(0) - , m_data(0) - , m_sz(0) - { - } - - static size_t nBytes(size_t sz) - { - return sz * sizeof(T) + A; - } - - static size_t shiftAmt(char *p) - { - return A>1? 
(A - (reinterpret_cast(p) % A)) % A: 0; - } - - void setPtr(char *p, size_t sz) - { - m_sz = sz; - m_data = reinterpret_cast(p + shiftAmt(p)); - } - - //void setPtr(T *p, size_t sz) - //{ - // m_sz = sz; - // if (A>1) - // myassert(((reinterpret_cast(p) % A) == 0), "bad alignment"); - // m_data = p; - //} - - // internal allocation - void resize(size_t sz) - { - m_storage = new char[nBytes(sz)]; - setPtr(m_storage, sz); - } - - // external allocation - void set(char *storage, size_t sz) - { - setPtr(storage, sz); - } - - ~AlignedVec() - { - if (m_storage) - delete [] m_storage; - } - - size_t size() const { return m_sz; } - T& operator[](size_t i) { return m_data[i]; } - const T& operator[](size_t i) const { return m_data[i]; } - T* begin() { return m_data; } - T* end() { return m_data+m_sz; } - const T* begin() const { return m_data; } - const T* end() const { return m_data+m_sz; } - T& front() { return m_data[0]; } - T& back() { return m_data[m_sz-1]; } - const T& front() const { return m_data[0]; } - const T& back() const { return m_data[m_sz - 1]; } - -private: - char *m_storage; - T *m_data; - size_t m_sz; -}; - -} // namespace Details -} // namespace BinSearch diff --git a/include/Algo-Direct-Common.h b/include/Algo-Direct-Common.h deleted file mode 100644 index 7b40edea9..000000000 --- a/include/Algo-Direct-Common.h +++ /dev/null @@ -1,341 +0,0 @@ -#pragma once - -#include -#include -#include -#include "AAlloc.h" - -namespace BinSearch { -namespace Details { - -namespace DirectAux { - -#define SAFETY_MULTI_PASS true - -template -struct HResults -{ - HResults(T h, double ratio, size_t n) : H(h), hRatio(ratio), nInc(n) {} - T H; - double hRatio; - size_t nInc; -}; - - -#ifdef USE_FMA -template struct IsDirect { static const bool value = (A == Direct) || (A == DirectFMA); }; -template struct IsDirect2 { static const bool value = (A == Direct2) || (A == Direct2FMA); }; -template struct IsDirectCache { static const bool value = (A == DirectCache) || (A == DirectCacheFMA); }; -#else -template struct IsDirect { static const bool value = (A == Direct); }; -template struct IsDirect2 { static const bool value = (A == Direct2); }; -template struct IsDirectCache { static const bool value = (A == DirectCache); }; -#endif - -// general definition -template -struct BucketElem -{ - FORCE_INLINE void set( uint32 b, const T *) - { - m_b = b; - } - - FORCE_INLINE uint32 index() const { return m_b; } - -private: - uint32 m_b; -}; - -// specialization for DirectCache methods - -template struct MatchingIntType; -template <> struct MatchingIntType { typedef uint64 type; }; -template <> struct MatchingIntType { typedef uint32 type; }; - -template -struct BucketElem::value >::type > -{ - typedef typename MatchingIntType::type I; - - void set(uint32 b, const T *xi) - { - u.u.x = xi[b]; - u.u.b = b; - } - - FORCE_INLINE I index() const { return u.u.b; } - FORCE_INLINE T x() const { return u.u.x; } - -private: - union { - double dummy; - struct - { - T x; - I b; - } u; - } u; -}; - - -template -struct DirectTraits -{ - static void checkH(T scaler, T x0, T xN) - { - T Dn = xN - x0; - T ifmax = Dn * scaler; - myassert((ifmax < std::numeric_limits::max() - (Gap - 1)), - "Problem unfeasible: index size exceeds uint32 capacity:" - << " D[N] =" << Dn - << ", H =" << scaler - << ", H D[n] =" << ifmax << "\n" - ); - } - - FORCE_INLINE static uint32 f(T scaler, T x0, T z) - { - T tmp = scaler * (z - x0); -#ifdef USE_SSE2 - return ftoi(FVec1(tmp)); -#else - return static_cast(tmp); -#endif - } - - template - 
FORCE_INLINE static typename FTOITraits::vec_t f(const FVec& scaler, const FVec& x0, const FVec& z) - { - return ftoi(scaler*(z-x0)); - } - - static T cst0(T scaler, T x0) - { - return x0; - } -}; - -#ifdef USE_FMA -template -struct DirectTraits -{ - typedef FVec1 fVec1; - - static void checkH(T scaler, T H_Times_x0, T xN) - { - union { - typename FVec1::vec_t v; - T s; - } ifmax; - ifmax.v = mulSub(fVec1(scaler), fVec1(xN), fVec1(H_Times_x0)); - myassert((ifmax.s < std::numeric_limits::max() - (Gap - 1)), - "Problem unfeasible: index size exceeds uint32 capacity:" - << " H X[0] =" << H_Times_x0 - << ", H =" << scaler - << ", X[N] =" << xN - << ", H X[N] - H X[0] =" << ifmax.s << "\n" - ); - } - - FORCE_INLINE static uint32 f(T scaler, T Hx0, T xi) - { - return ftoi(mulSub(fVec1(scaler), fVec1(xi), fVec1(Hx0))); - } - - template - FORCE_INLINE static typename FTOITraits::vec_t f(const FVec& scaler, const FVec& H_Times_X0, const FVec& z) - { - return ftoi(mulSub(scaler, z, H_Times_X0)); - } - - static T cst0(T scaler, T x0) - { - return scaler*x0; - } -}; -#endif - -template -struct DirectInfo -{ - static const bool UseFMA = (A == DirectFMA) || (A == Direct2FMA) || (A == DirectCacheFMA); - typedef DirectTraits fun_t; - typedef BucketElem bucket_t; - typedef AlignedVec bucketvec_t; - - struct Data { - Data() : buckets(0), xi(0), scaler(0), cst0(0) {} - Data( const T *x // for Direct must persist if xws=NULL - , uint32 n - , T H - , bucket_t *bws // assumed to gave size nb, as computed below - , T *xws = NULL // assumed to have size (n+Gap-1). Optional for Direct, unused for DirectCache, required for DirectGap - ) - : buckets(bws) - , scaler(H) - , cst0(fun_t::cst0(H, x[0])) - { - myassert(((bws != NULL) && (isAligned(bws,64))), "bucket pointer not allocated or incorrectly aligned"); - - uint32 nb = 1 + fun_t::f(H, cst0, x[n-1]); - - const uint32 npad = Gap-1; - const uint32 n_sz = n + npad; // size of padded vector - - if (xws) { - myassert(isAligned(xws,8), "x pointer not allocated or incorrectly aligned"); - std::fill_n(xws, npad, x[0]); // pad in front with x[0] - std::copy(x, x+n, xws + npad); - xi = xws; - } - else { - myassert((Gap==1), "if Gap>1 then X workspace must be provided"); - xi = x; - } - - populateIndex(bws, nb, xi, n_sz, scaler, cst0); - } - - const bucket_t *buckets; - const T *xi; - T scaler; - T cst0; // could be x0 or (scaler*x0), depending if we are using FMA or not - } data; - - static T growStep(T H) - { - T step; - T P = next(H); - while ((step = P - H) == 0) - P = next(P); - return step; - } - - static HResults computeH(const T *px, uint32 nx) - { - myassert((nx > Gap), "Array X too small"); - myassert(((Gap == 1) || (Gap == 2)), "Only tested for these values of Gap"); - - const T x0 = px[0]; - const T xN = px[nx-1]; - - const T range = xN - x0; - myassert((range < std::numeric_limits::max()), "range too large"); - - // check that D_i are strictly increasing and compute minimum value D_{i+Offset}-D_i - T deltaDMin = range; - for (uint32 i = Gap; i < nx; ++i) { - T Dnew = px[i] - x0; - T Dold = px[i - Gap] - x0; - myassert((Dnew > Dold), - "Problem unfeasible: D_i sequence not strictly increasing" - << " X[" << 0 << "]=" << x0 - << " X[" << i - Gap << "]=" << px[i - Gap] - << " X[" << i << "]=" << px[i] - << "\n" - ); - T deltaD = Dnew - Dold; - if (deltaD < deltaDMin) - deltaDMin = deltaD; - } - - // initial guess for H - const T H0 = T(1.0) / deltaDMin; - T H = H0; - - T cst0 = fun_t::cst0(H, x0); - fun_t::checkH(H, cst0, xN); - - // adjust H by trial and error 
until succeed - size_t nInc = 0; - bool modified = false; - size_t npasses = 0; - T step = growStep(H); - uint32 seg_already_checked_from = nx; - do { - myassert((npasses++ < 2), "verification failed\n"); - // if there has been an increase, then check only up to that point - uint32 last_seg_to_be_checked = seg_already_checked_from - 1; - modified = false; - uint32 inew = 0; - for (uint32 i = Gap; i <= last_seg_to_be_checked; ++i) { - uint32 iold = fun_t::f(H, cst0, px[i-Gap]); - uint32 inew = fun_t::f(H, cst0, px[i]); - while (inew == iold) { - seg_already_checked_from = i; - last_seg_to_be_checked = nx-1; // everything needs to be checked - modified = true; - H = H + step; - step *= 2; - // recalculate all constants and indices - cst0 = fun_t::cst0(H, x0); - fun_t::checkH(H, cst0, xN); - iold = fun_t::f(H, cst0, px[i - Gap]); - inew = fun_t::f(H, cst0, px[i]); - } - } - } while (SAFETY_MULTI_PASS && modified); - - return HResults(H, (((double)H) / H0) - 1.0, nInc); - } - - static void populateIndex(BucketElem *buckets, uint32 index_size, const T *px, uint32 x_size, T scaler, T cst0) - { - for (uint32 i = x_size-1, b = index_size-1, j=0; ; --i) { - uint32 idx = fun_t::f(scaler, cst0, px[i]); - while (b > idx) { // in the 1st iteration it is j=0 but this condition is always false - buckets[b].set( j, px ); - --b; - } - if (Gap==1 || b == idx) { // if Gap==1, which is known at compile time, the check b==idx is redundant - j = i - (Gap-1); // subtracting (Gap-1) points to the index of the first X-element to check - buckets[b].set(j, px); - if (b-- == 0) - break; - } - } - } - - DirectInfo(const Data& d) - : data(d) - { - } - - DirectInfo(const T* px, const uint32 n) - { - HResults res = computeH(px, n); - -#ifdef PAPER_TEST - nInc = res.nInc; - hRatio = res.hRatio; -#endif - const uint32 npad = Gap-1; - const uint32 n_sz = n + npad; // size of padded vector - - if (npad) - xi.resize(n_sz); - - T H = res.H; - T cst0 = fun_t::cst0(H, px[0]); - const uint32 maxIndex = fun_t::f(H, cst0, px[n-1]); - buckets.resize(maxIndex + 1); - - data = Data(px, n, H, buckets.begin(), (npad? 
xi.begin(): NULL)); - } - -private: - bucketvec_t buckets; - AlignedVec xi; - -#ifdef PAPER_TEST -public: - double hRatio; - size_t nInc; -#endif -}; - - -} // namespace DirectAux -} // namespace Details -} // namespace BinSearch diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h deleted file mode 100644 index 547ca9955..000000000 --- a/include/Algo-Direct2.h +++ /dev/null @@ -1,307 +0,0 @@ -#pragma once - -#include "Algo-Direct-Common.h" - -namespace BinSearch { -namespace Details { - -template -struct AlgoScalarBase::value>::type> : DirectAux::DirectInfo<2, T, A> -{ -private: - typedef DirectAux::DirectInfo<2, T, A> base_t; - static const size_t Offset=2; - -public: - AlgoScalarBase(const T* x, const uint32 n) - : base_t(x, n) - { - } - - FORCE_INLINE uint32 scalar(T z) const - { - const T* px = base_t::data.xi; - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - uint32 bidx = base_t::fun_t::f(base_t::data.scaler, base_t::data.cst0, z); - uint32 iidx = buckets[bidx]; - px += iidx; - if (z < *px) - --iidx; - if (z < *(px+1)) - --iidx; - return iidx; - } -}; - - -template -struct AlgoVecBase::value>::type> : AlgoScalarBase -{ - static const uint32 nElem = sizeof(typename InstrFloatTraits::vec_t) / sizeof(T); - - typedef FVec fVec; - typedef IVec i128; - - struct Constants - { - fVec vscaler; - fVec vcst0; - IVec one; - }; - -private: - typedef AlgoScalarBase base_t; - -#ifdef USE_SSE2 - FORCE_INLINE - //NO_INLINE - void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const - { - union U { - __m128i vec; - uint32 ui32[4]; - } u; - - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - const float *xi = base_t::data.xi; - - // read indices t - const double *p3 = reinterpret_cast(&xi[(u.ui32[3] = buckets[bidx.get3()])]); - const double *p2 = reinterpret_cast(&xi[(u.ui32[2] = buckets[bidx.get2()])]); - const double *p1 = reinterpret_cast(&xi[(u.ui32[1] = buckets[bidx.get1()])]); - const double *p0 = reinterpret_cast(&xi[(u.ui32[0] = buckets[bidx.get0()])]); - -#if 0 - // read pairs ( X(t-1), X(t) ) - __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3)); - __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2)); - __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1)); - __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0)); - - // build: - // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } - // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } - __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6)); - __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6)); - __m128 u01 = _mm_unpacklo_ps(h02, h13); - __m128 u23 = _mm_unpackhi_ps(h02, h13); - __m128 vxm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6)); - __m128 vxp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6)); -#else - __m128 xp23 = _mm_castpd_ps(_mm_set_pd(*p3, *p2)); - __m128 xp01 = _mm_castpd_ps(_mm_set_pd(*p1, *p0)); - __m128 vxm = _mm_shuffle_ps(xp01, xp23, (0) + (2 << 2) + (0 << 4) + (2 << 6)); - __m128 vxp = _mm_shuffle_ps(xp01, xp23, (1) + (3 << 2) + (1 << 4) + (3 << 6)); -#endif - IVec i(u.vec); - IVec vlem = vz < vxm; - IVec vlep = vz < vxp; - i = i + vlem + vlep; - i.store(pr); - } - - FORCE_INLINE - //NO_INLINE - void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const - { - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - const double *xi = base_t::data.xi; - - uint32 b1 = buckets[bidx.get1()]; - uint32 b0 = buckets[bidx.get0()]; - - const double *p1 = &xi[b1]; - const double *p0 = &xi[b0]; - - // read pairs ( X(t-1), X(t) ) - __m128d vx1 = _mm_loadu_pd(p1); - __m128d vx0 
= _mm_loadu_pd(p0); - - // build: - // { X(t(0)-1), X(t(1)-1) } - // { X(t(0)), X(t(1)) } - __m128d vxm = _mm_shuffle_pd(vx0, vx1, 0); - __m128d vxp = _mm_shuffle_pd(vx0, vx1, 3); - - IVec i(b1, b0); - IVec vlem = (vz < vxm); - IVec vlep = (vz < vxp); - i = i + vlem + vlep; - - union { - __m128i vec; - uint32 ui32[4]; - } u; - u.vec = i; - pr[0] = u.ui32[0]; - pr[1] = u.ui32[2]; - } -#endif // USE_SSE2 - -#ifdef USE_AVX - - FORCE_INLINE - //NO_INLINE - void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const - { - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - const float *xi = base_t::data.xi; - -#if 0 // use gather instructions - - IVec idxm; - idxm.setidx(buckets, bidx); - __m256i z = _mm256_setzero_si256(); - IVec minusone = _mm256_cmpeq_epi32(z,z); - IVec idxp = idxm - minusone; - - FVec vxm = _mm256_i32gather_ps(xi, idxm, sizeof(float)); - FVec vxp = _mm256_i32gather_ps(xi, idxp, sizeof(float)); - IVec ip = idxm; - -#else // do not use gather instructions - - union U { - __m256i vec; - uint32 ui32[8]; - } u; - - // read indices t - - const double *p7 = reinterpret_cast(&xi[(u.ui32[7] = buckets[bidx.get7()])]); - const double *p6 = reinterpret_cast(&xi[(u.ui32[6] = buckets[bidx.get6()])]); - const double *p5 = reinterpret_cast(&xi[(u.ui32[5] = buckets[bidx.get5()])]); - const double *p4 = reinterpret_cast(&xi[(u.ui32[4] = buckets[bidx.get4()])]); - const double *p3 = reinterpret_cast(&xi[(u.ui32[3] = buckets[bidx.get3()])]); - const double *p2 = reinterpret_cast(&xi[(u.ui32[2] = buckets[bidx.get2()])]); - const double *p1 = reinterpret_cast(&xi[(u.ui32[1] = buckets[bidx.get1()])]); - const double *p0 = reinterpret_cast(&xi[(u.ui32[0] = buckets[bidx.get0()])]); - -#if 0 // perform 8 loads in double precision - - // read pairs ( X(t-1), X(t) ) - __m128 xp7 = _mm_castpd_ps(_mm_load_sd(p7)); - __m128 xp6 = _mm_castpd_ps(_mm_load_sd(p6)); - __m128 xp5 = _mm_castpd_ps(_mm_load_sd(p5)); - __m128 xp4 = _mm_castpd_ps(_mm_load_sd(p4)); - __m128 xp3 = _mm_castpd_ps(_mm_load_sd(p3)); - __m128 xp2 = _mm_castpd_ps(_mm_load_sd(p2)); - __m128 xp1 = _mm_castpd_ps(_mm_load_sd(p1)); - __m128 xp0 = _mm_castpd_ps(_mm_load_sd(p0)); - - // build: - // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } - // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } - __m128 h57 = _mm_shuffle_ps(xp5, xp7, (1 << 2) + (1 << 6)); // F- F+ H- H+ - __m128 h46 = _mm_shuffle_ps(xp4, xp6, (1 << 2) + (1 << 6)); // E- E+ G- G+ - __m128 h13 = _mm_shuffle_ps(xp1, xp3, (1 << 2) + (1 << 6)); // B- B+ D- D+ - __m128 h02 = _mm_shuffle_ps(xp0, xp2, (1 << 2) + (1 << 6)); // A- A+ C- C+ - - __m128 u01 = _mm_unpacklo_ps(h02, h13); // A- B- A+ B+ - __m128 u23 = _mm_unpackhi_ps(h02, h13); // C- D- C+ D+ - __m128 u45 = _mm_unpacklo_ps(h46, h57); // E- F- E+ F+ - __m128 u67 = _mm_unpackhi_ps(h46, h57); // G- H- G+ H+ - - __m128 abcdm = _mm_shuffle_ps(u01, u23, (0) + (1 << 2) + (0 << 4) + (1 << 6)); // A- B- C- D- - __m128 abcdp = _mm_shuffle_ps(u01, u23, (2) + (3 << 2) + (2 << 4) + (3 << 6)); // A+ B+ C+ D+ - __m128 efghm = _mm_shuffle_ps(u45, u67, (0) + (1 << 2) + (0 << 4) + (1 << 6)); // E- F- G- H- - __m128 efghp = _mm_shuffle_ps(u45, u67, (2) + (3 << 2) + (2 << 4) + (3 << 6)); // E+ F+ G+ H+ - - FVec vxp = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdm), efghm, 1); - FVec vxm = _mm256_insertf128_ps(_mm256_castps128_ps256(abcdp), efghp, 1); - - IVec ip(u.vec); - -#else // use __mm256_set_pd - - // read pairs ( X(t-1), X(t) ) - __m256 x0145 = _mm256_castpd_ps(_mm256_set_pd(*p5, *p4, *p1, *p0)); // { x0(t-1), x0(t), x1(t-1), 
x1(t), x4(t-1), x4(t), x5(t-1), x5(t) } - __m256 x2367 = _mm256_castpd_ps(_mm256_set_pd(*p7, *p6, *p3, *p2)); // { x2(t-1), x2(t), x3(t-1), x3(t), x6(t-1), x6(t), x7(t-1), x7(t) } - - // { x0(t-1), x1(t-1), x2(t-1), 3(t-1, x4(t-1), x5(t-1), x6(t-1), xt(t-1) } - FVec vxm = _mm256_shuffle_ps(x0145, x2367, 0 + (2 << 2) + (0 << 4) + (2 << 6) ); - // { x0(t), x1(t), x2(t), 3(t, x4(t), x5(t), x6(t), xt(t) } - FVec vxp = _mm256_shuffle_ps(x0145, x2367, 1 + (3 << 2) + (1 << 4) + (3 << 6) ); - - IVec ip(u.vec); - -#endif - -#endif - - IVec vlem = vz < vxm; - IVec vlep = vz < vxp; - ip = ip + vlem + vlep; - - ip.store(pr); - } - - - - FORCE_INLINE - //NO_INLINE - void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const - { - union { - __m256i vec; - uint64 ui64[4]; - } u; - - const uint32* buckets = reinterpret_cast(base_t::data.buckets); - const double *xi = base_t::data.xi; - - // read indices t - const double *p3 = &xi[(u.ui64[3] = buckets[bidx.get3()])]; - const double *p2 = &xi[(u.ui64[2] = buckets[bidx.get2()])]; - const double *p1 = &xi[(u.ui64[1] = buckets[bidx.get1()])]; - const double *p0 = &xi[(u.ui64[0] = buckets[bidx.get0()])]; - - // read pairs ( X(t-1), X(t) ) - __m128d xp3 = _mm_loadu_pd(p3); - __m128d xp2 = _mm_loadu_pd(p2); - __m128d xp1 = _mm_loadu_pd(p1); - __m128d xp0 = _mm_loadu_pd(p0); - - // build: - // { X(t(0)-1), X(t(1)-1), X(t(2)-1), X(t(3)-1) } - // { X(t(0)), X(t(1)), X(t(2)), X(t(3)) } - __m256d x02 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xp0), xp2, 1); - __m256d x13 = _mm256_insertf128_pd(_mm256_castpd128_pd256(xp1), xp3, 1); - FVec vxm = _mm256_unpacklo_pd(x02,x13); - FVec vxp = _mm256_unpackhi_pd(x02,x13); - - -// __m128d h01m = _mm_shuffle_pd(xp0, xp1, 0); -// __m128d h23m = _mm_shuffle_pd(xp2, xp3, 0); -// __m128d h01p = _mm_shuffle_pd(xp0, xp1, 3); -// __m128d h23p = _mm_shuffle_pd(xp2, xp3, 3); -// FVec vxm = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01m), h23m, 1); -// FVec vxp = _mm256_insertf128_pd(_mm256_castpd128_pd256(h01p), h23p, 1); - - IVec i(u.vec); - IVec vlem = vz < vxm; - IVec vlep = vz < vxp; - i = i + vlem + vlep; - i.extractLo32s().store(pr); - } -#endif - -public: - - AlgoVecBase(const T* x, const uint32 n) : base_t(x, n) {} - - void initConstants(Constants& cst) const - { - cst.vscaler.setN(base_t::data.scaler); - cst.vcst0.setN(base_t::data.cst0); - cst.one.setN(uint32(1)); - } - - void vectorial(uint32 *pr, const T *pz, const Constants& cst) const - { - fVec vz(pz); - resolve(vz, base_t::fun_t::f(cst.vscaler, cst.vcst0, vz), pr); - } -}; -} // namespace Details -} // namespace BinSearch diff --git a/include/AlgoXCodes.h b/include/AlgoXCodes.h deleted file mode 100644 index bdc9b00b6..000000000 --- a/include/AlgoXCodes.h +++ /dev/null @@ -1,23 +0,0 @@ -ALGOENUM(DirectCacheFMA, 5) -ALGOENUM(DirectFMA, 15) -ALGOENUM(Direct2FMA, 25) -ALGOENUM(DirectCache, 10) -ALGOENUM(Direct, 20) -ALGOENUM(Direct2, 30) -ALGOENUM(Nonary, 40) -ALGOENUM(Pentary, 50) -ALGOENUM(Ternary, 60) -ALGOENUM(Eytzinger, 70) -ALGOENUM(BitSet, 80) -ALGOENUM(ClassicOffset, 90) -#ifdef PAPER_TEST -ALGOENUM(MorinOffset, 100) -ALGOENUM(BitSetNoPad, 110) -ALGOENUM(ClassicMod, 120) -ALGOENUM(MorinBranchy, 130) -ALGOENUM(Classic, 140) -ALGOENUM(LowerBound, 145) -#ifdef USE_MKL -ALGOENUM(MKL, 150) -#endif -#endif diff --git a/include/BinAlgo.h b/include/BinAlgo.h deleted file mode 100644 index aac67a0c7..000000000 --- a/include/BinAlgo.h +++ /dev/null @@ -1,77 +0,0 @@ -#pragma once - -#include "Type.h" -#include - -namespace BinSearch { - -template -struct BinAlgo : 
Details::BinAlgoBase -{ - typedef Details::BinAlgoBase base_t; - - BinAlgo(const T* px, const uint32 n) : base_t(px, n), x0(px[0]), xN(px[n-1]), N(n) {} - BinAlgo(const T* px, const uint32 n, const typename base_t::Data& d) : base_t(d), x0(px[0]), xN(px[n-1]), N(n) {} - - FORCE_INLINE - uint32 scalar(T z) const - { - if (!L || z >= x0) - if (!R || z < xN) - return base_t::scalar(z); - else - return N; - else - return std::numeric_limits::max(); - } - - - FORCE_INLINE - void vectorial(uint32 *pr, const T *pz, uint32 n) const - { - if (!L && !R) { - Details::Loop::loop(*this, pr, pz, n); - } - else { - const uint32 nElem = base_t::nElem; - const uint32 idealbufsize = 256; - const uint32 bufsize = nElem * (idealbufsize / nElem + ((idealbufsize % nElem) ? 1 : 0)); - T databuf[bufsize]; - uint32 resbuf[bufsize]; - uint32 indexbuf[bufsize]; - - uint32 *prend = pr + n; - while(pr != prend) { - uint32 cnt = 0; - uint32 niter = std::min(bufsize, (uint32)std::distance(pr,prend)); - for (uint32 j = 0; j < niter; ++j) { - T z = pz[j]; - // FIXME: use SSE2? - if (!L || z >= x0) - if (!R || z < xN) { - databuf[cnt] = z; - indexbuf[cnt] = j; - ++cnt; - } - else - pr[j] = N; - else - pr[j] = std::numeric_limits::max(); - } - // FIXME: merge these two loops - Details::Loop::loop(*this, resbuf, databuf, cnt); - for (uint32 j = 0; j < cnt; ++j) - pr[indexbuf[j]] = resbuf[j]; - pr += niter; - pz += niter; - } - } - } - - Details::CondData x0; - Details::CondData xN; - Details::CondData N; -}; - - -} // namespace BinSearch diff --git a/include/BinSearch.h b/include/BinSearch.h deleted file mode 100644 index 336f52963..000000000 --- a/include/BinSearch.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "AAlloc.h" -#include "BinAlgo.h" -#include "SIMD.h" - -#include -#include - - -#include "Algo-Direct2.h" diff --git a/include/Portable.h b/include/Portable.h deleted file mode 100644 index 59b6dc840..000000000 --- a/include/Portable.h +++ /dev/null @@ -1,205 +0,0 @@ -#pragma once -#include -#include -#include -#include - -#if defined(__aarch64__) -#ifdef __CUDACC__ -#undef USE_NEON // Doesn't work with nvcc, undefined symbols -#else -#include -#undef USE_NEON // Not yet implemented -#endif -#undef USE_AVX // x86_64 only -#undef USE_AVX2 // x86_64 only -#undef USE_SSE2 // x86_64 only -#undef USE_SSE41 // x86_64 only -#undef USE_SSE42 // x86_64 only -#undef USE_FMA // x86_64 only -#ifdef USE_NEON -typedef float32x4_t __m128; -typedef int32x4_t __m128i; -typedef float64x2_t __m128d; -#else -typedef struct {float a; float b; float c; float d;} __m128; -typedef struct {int a; int b; int c; int d;} __m128i; -typedef struct {double a; double b;} __m128d; -#endif -#elif defined(__powerpc64__) -#ifdef __CUDACC__ -#undef USE_VSX // Doesn't work with nvcc, undefined symbols -#else -#include -#undef USE_VSX // Not yet implemented -#endif -#undef USE_AVX // x86_64 only -#undef USE_AVX2 // x86_64 only -#undef USE_SSE2 // x86_64 only -#undef USE_SSE41 // x86_64 only -#undef USE_SSE42 // x86_64 only -#undef USE_FMA // x86_64 only -#ifdef USE_VSX -typedef vector float __m128; -typedef vector signed int __m128i; -typedef vector double __m128d; -#else -typedef struct {float a; float b; float c; float d;} __m128; -typedef struct {int a; int b; int c; int d;} __m128i; -typedef struct {double a; double b;} __m128d; -#endif -#else -#undef USE_NEON // ARM64 only -#undef USE_VSX // PPC only -#ifdef __FMA__ -#define USE_FMA -#endif -#if !defined(__SSE2__) && !defined(_MSC_VER) -#error Compiler must support SSE2 -#endif -#define 
USE_SSE2 - -#if defined(__aarch64__) -#else -#ifdef __AVX2__ -#define USE_AVX2 -#endif - -#ifdef __AVX__ -#define USE_AVX -#endif - - -#ifdef __SSE4_1__ -#define USE_SSE41 -#endif - -#ifdef __SSE4_2__ -#define USE_SSE42 -#endif -#endif -#endif - -#ifndef _MSC_VER -#include -#endif - -namespace BinSearch { - -#ifndef _MSC_VER -typedef int8_t int8; -typedef uint8_t uint8; -typedef int32_t int32; -typedef uint32_t uint32; -typedef int64_t int64; -typedef uint64_t uint64; -#else -typedef __int8 int8; -typedef unsigned __int8 uint8; -typedef __int32 int32; -typedef unsigned __int32 uint32; -typedef __int64 int64; -typedef unsigned __int64 uint64; -#endif - -namespace Details { - -#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } - -// log2 is not defined in VS2008 -#if defined(_MSC_VER) -inline uint32 log2 (uint32 val) { - if (val == 1) return 0; - uint32 ret = 0; - do { - ret++; - val >>= 1; - } while (val > 1); - return ret; -} -#endif - -#ifdef _DEBUG -#define DEBUG -#endif - -#ifdef _MSC_VER -# define FORCE_INLINE __forceinline -# define NO_INLINE __declspec(noinline) -#else -# define NO_INLINE __attribute__((noinline)) -# ifdef DEBUG -# define FORCE_INLINE NO_INLINE -# else -# define FORCE_INLINE __attribute__((always_inline)) inline -# endif -#endif - -#ifdef USE_AVX -#define COMISS "vcomiss" -#define COMISD "vcomisd" -#else -#define COMISS "comiss" -#define COMISD "comisd" -#endif - -// nextafter is not defined in VS2008 -#if defined(_MSC_VER) && (_MSC_VER <= 1500) -#include -inline float mynext(float x) -{ - return _nextafterf(x, std::numeric_limits::max()); -} - -inline double mynext(double x) -{ - return _nextafter(x, std::numeric_limits::max()); -} -inline float myprev(float x) -{ - return _nextafterf(x, -std::numeric_limits::max()); -} - -inline double myprev(double x) -{ - return _nextafter(x, -std::numeric_limits::max()); -} -#else -inline float mynext(float x) -{ - return std::nextafterf(x, std::numeric_limits::max()); -} - -inline double mynext(double x) -{ - return std::nextafter(x, std::numeric_limits::max()); -} -inline float myprev(float x) -{ - return std::nextafterf(x, -std::numeric_limits::max()); -} - -inline double myprev(double x) -{ - return std::nextafter(x, -std::numeric_limits::max()); -} -#endif - -template -inline T next(T x) -{ - for (int i = 0; i < 4; ++i) - x = mynext(x); - return x; -} - -template -inline T prev(T x) -{ - for (int i = 0; i < 4; ++i) - x = myprev(x); - return x; -} - -} // namespace Details -} // namespace BinSearch diff --git a/include/SIMD.h b/include/SIMD.h deleted file mode 100644 index 6222af04f..000000000 --- a/include/SIMD.h +++ /dev/null @@ -1,589 +0,0 @@ -#pragma once - -#include "Portable.h" - -#ifdef USE_SSE2 -#include -#if defined(USE_AVX) || defined(USE_AVX2) -#include -#else -#ifdef USE_SSE41 -#include -#endif -#endif -#endif - -namespace BinSearch { -namespace Details { - -template -struct FTOITraits{}; - -template -struct FVec; - -template -struct IVec; - -template -struct FVec1; - -template <> struct InstrFloatTraits -{ - typedef __m128 vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m128d vec_t; -}; - -} -} - -#if !defined(__aarch64__) && !defined(__powerpc64__) -#ifdef USE_SSE42 -#ifndef _MSC_VER -#include -#define popcnt32 _mm_popcnt_u32 -#else -#include -#define popcnt32 __popcnt -#endif -#else // USE_SSE42 -namespace BinSearch { -FORCE_INLINE int popcnt32(int x32) -{ - // strictly speaking 
this is not correct, as it ignores higher order bits - // however, this is only used on the resuot of movemask on a 128-bit register, which is 8 at most, so it is ok - // with 256-bit registers, SSE42 is defined, and we do not use this function - uint8 x = static_cast(x32); - x = (x & 0x55) + (x >> 1 & 0x55); - x = (x & 0x33) + (x >> 2 & 0x33); - x = (x & 0x0f) + (x >> 4 & 0x0f); - return x; -} -} // namespace -#endif - -#include "Type.h" - -namespace BinSearch { -namespace Details { - -template <> struct InstrIntTraits -{ - typedef __m128i vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m128 vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m128d vec_t; -}; - -template <> -struct FTOITraits -{ - typedef IVec vec_t; -}; - -#ifdef USE_AVX - -template <> -struct FTOITraits -{ - typedef IVec vec_t; -}; - -template <> struct InstrIntTraits -{ - typedef __m256i vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m256 vec_t; -}; - -template <> struct InstrFloatTraits -{ - typedef __m256d vec_t; -}; - -#endif - - -template -struct VecStorage -{ - typedef typename TR::vec_t vec_t; - - FORCE_INLINE operator vec_t&() { return vec; } - FORCE_INLINE operator const vec_t&() const { return vec; } - -protected: - FORCE_INLINE VecStorage() {} - FORCE_INLINE VecStorage(const vec_t& v) : vec( v ) {} - - vec_t vec; -}; - -template -struct IVecBase; - -template <> -struct IVecBase : VecStorage> -{ -protected: - FORCE_INLINE IVecBase() {} - FORCE_INLINE IVecBase( const vec_t& v) : VecStorage>( v ) {} -public: - FORCE_INLINE static vec_t zero() { return _mm_setzero_si128(); } - - FORCE_INLINE int32 get0() const { return _mm_cvtsi128_si32( vec ); } - - FORCE_INLINE void assignIf( const vec_t& val, const vec_t& mask ) - { -#ifdef USE_SSE41 - vec = _mm_blendv_epi8(vec, val, mask); -#else - vec = _mm_or_si128(_mm_andnot_si128(mask,vec), _mm_and_si128(mask,val)); -#endif - } - FORCE_INLINE void orIf(const vec_t& val, const vec_t& mask) - { - vec = _mm_or_si128(vec, _mm_and_si128(val,mask)); - } -}; - -template <> -struct IVec : IVecBase -{ - FORCE_INLINE IVec() {} - FORCE_INLINE IVec( int32 i ) : IVecBase( _mm_set1_epi32( i ) ) {} - FORCE_INLINE IVec( const vec_t& v) : IVecBase( v ) {} - FORCE_INLINE IVec( uint32 u3, uint32 u2, uint32 u1, uint32 u0) : IVecBase( _mm_set_epi32( u3, u2, u1, u0 ) ) {} - - void setN( int32 i ) { vec = _mm_set1_epi32( i ); } - -#ifdef USE_SSE41 - FORCE_INLINE int32 get1() const { return _mm_extract_epi32(vec, 1); } - FORCE_INLINE int32 get2() const { return _mm_extract_epi32(vec, 2); } - FORCE_INLINE int32 get3() const { return _mm_extract_epi32(vec, 3); } -#else - FORCE_INLINE int32 get1() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 1 ) ); } - FORCE_INLINE int32 get2() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 2 ) ); } - FORCE_INLINE int32 get3() const { return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 3 ) ); } -#endif - - FORCE_INLINE void store( uint32 *pi ) const { _mm_storeu_si128( reinterpret_cast(pi), vec ); } - - FORCE_INLINE int countbit() - { - return popcnt32(_mm_movemask_ps(_mm_castsi128_ps(vec))); - } -}; - -template <> -struct IVec : IVecBase -{ - FORCE_INLINE IVec() {} - FORCE_INLINE IVec( int32 i ) : IVecBase( _mm_set1_epi64x( i ) ) {} - FORCE_INLINE IVec( const vec_t& v) : IVecBase( v ) {} - FORCE_INLINE IVec( uint64 u1, uint64 u0 ) : IVecBase( _mm_set_epi64x(u1, u0) ) {} - - void setN( int32 i ) { vec = _mm_set1_epi64x( i ); } - - FORCE_INLINE int32 get1() const - { -#ifdef USE_SSE41 - return 
_mm_extract_epi32(vec, 2); -#else - return _mm_cvtsi128_si32( _mm_shuffle_epi32( vec, 2 ) ); -#endif - } - - // extract the 2 32 bits integers no. 0, 2 and store them in a __m128i - FORCE_INLINE IVec extractLo32s() const - { - return _mm_shuffle_epi32(vec, ((2 << 2) | 0)); - } - - FORCE_INLINE void store( uint32 *pi ) const - { - pi[0] = get0(); - pi[1] = get1(); - } - - FORCE_INLINE int countbit() - { -#if 1 - // takes 4 cycles - __m128i hi = _mm_shuffle_epi32(vec, 2); // 1 cycle - __m128i s = _mm_add_epi32(vec, hi); - int32 x = _mm_cvtsi128_si32(s); - return -x; -#else - // takes 6 cycles - return popcnt32(_mm_movemask_pd(_mm_castsi128_pd(vec))); -#endif - } -}; - -template -FORCE_INLINE IVec operator>> (const IVec& a, unsigned n) { return _mm_srli_epi32(a, n); } -template -FORCE_INLINE IVec operator<< (const IVec& a, unsigned n) { return _mm_slli_epi32(a, n); } -template -FORCE_INLINE IVec operator& (const IVec& a, const IVec& b ) { return _mm_and_si128( a, b ); } -template -FORCE_INLINE IVec operator| (const IVec& a, const IVec& b ) { return _mm_or_si128( a, b ); } -template -FORCE_INLINE IVec operator^ (const IVec& a, const IVec& b ) { return _mm_xor_si128( a, b ); } -template -FORCE_INLINE IVec operator+ (const IVec& a, const IVec& b ) { return _mm_add_epi32( a, b ); } -template -FORCE_INLINE IVec operator- (const IVec& a, const IVec& b ) { return _mm_sub_epi32( a, b ); } -#ifdef USE_SSE41 -template -FORCE_INLINE IVec min (const IVec& a, const IVec& b ) { return _mm_min_epi32( a, b ); } -#endif - -typedef VecStorage> FVec128Float; - -template <> -struct FVec1 : FVec128Float -{ - FORCE_INLINE FVec1() {} - FORCE_INLINE FVec1( float f ) : FVec128Float( _mm_load_ss( &f ) ) {} - FORCE_INLINE FVec1( const vec_t& v ): FVec128Float( v ) {} - - FORCE_INLINE float get0() const { return _mm_cvtss_f32( vec ); } -}; - -template <> -struct FVec : FVec128Float -{ - FORCE_INLINE FVec() {} - FORCE_INLINE FVec( float f ) : FVec128Float( _mm_set1_ps( f ) ) {} - FORCE_INLINE FVec( const float *v ) : FVec128Float( _mm_loadu_ps( v ) ) {} - FORCE_INLINE FVec( const vec_t& v) : FVec128Float(v) {} - FORCE_INLINE FVec( float f3, float f2, float f1, float f0 ) : FVec128Float( _mm_set_ps(f3, f2, f1, f0) ) {} - - void set0( float f ) { vec = _mm_load_ss( &f ); } - void setN( float f ) { vec = _mm_set1_ps( f ); } - - FORCE_INLINE void setidx( const float *xi, const IVec& idx ) - { - uint32 i0 = idx.get0(); - uint32 i1 = idx.get1(); - uint32 i2 = idx.get2(); - uint32 i3 = idx.get3(); - vec = _mm_set_ps( xi[i3], xi[i2], xi[i1], xi[i0] ); - } - - FORCE_INLINE float get0() const { return _mm_cvtss_f32( vec ); } - FORCE_INLINE float get1() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 1 ) ); } - FORCE_INLINE float get2() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 2 ) ); } - FORCE_INLINE float get3() const { return _mm_cvtss_f32( _mm_shuffle_ps( vec, vec, 3 ) ); } -}; - -FORCE_INLINE FVec1 operator+ (const FVec1& a, const FVec1& b) { return _mm_add_ss( a, b ); } -FORCE_INLINE FVec1 operator- (const FVec1& a, const FVec1& b) { return _mm_sub_ss( a, b ); } -FORCE_INLINE FVec1 operator* (const FVec1& a, const FVec1& b) { return _mm_mul_ss( a, b ); } -FORCE_INLINE FVec1 operator/ (const FVec1& a, const FVec1& b) { return _mm_div_ss( a, b ); } -FORCE_INLINE int ftoi (const FVec1& a) { return _mm_cvttss_si32(a); } -FORCE_INLINE IVec operator> (const FVec1& a, const FVec1& b) { return _mm_castps_si128( _mm_cmpgt_ss( a, b ) ); } -#ifdef USE_FMA -FORCE_INLINE FVec1 mulSub(const FVec1& a, const FVec1& b, 
const FVec1& c) { return _mm_fmsub_ss(a, b, c); } -#endif - -FORCE_INLINE FVec operator- (const FVec& a, const FVec& b) { return _mm_sub_ps( a, b ); } -FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } -FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } -FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } -#ifndef __clang__ // Conflicts with builtin operator -FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } -FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } -FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } -#endif -#ifdef USE_FMA -FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm_fmsub_ps(a, b, c); } -#endif - -typedef VecStorage> FVec128Double; - -template <> -struct FVec1 : FVec128Double -{ - FORCE_INLINE FVec1() {} - FORCE_INLINE FVec1( double f ) : FVec128Double( _mm_load_sd( &f ) ) {} - FORCE_INLINE FVec1( const vec_t& v ) : FVec128Double( v ) {} - - FORCE_INLINE double get0() const { return _mm_cvtsd_f64( vec ); } -}; - -template <> -struct FVec : FVec128Double -{ - FORCE_INLINE FVec() {} - FORCE_INLINE FVec( double d ) : FVec128Double( _mm_set1_pd( d ) ) {} - FORCE_INLINE FVec( const double *v ) : FVec128Double( _mm_loadu_pd( v ) ) {} - FORCE_INLINE FVec( const vec_t& v) : FVec128Double( v ) {} - FORCE_INLINE FVec( double f1, double f0 ) : FVec128Double( _mm_set_pd(f1, f0) ) {} - - void set0( double f ) { vec = _mm_load_sd( &f ); } - void setN( double f ) { vec = _mm_set1_pd( f ); } - - FORCE_INLINE void setidx( const double *xi, const IVec& idx ) - { - vec = _mm_set_pd( xi[idx.get1()], xi[idx.get0()] ); - } - - FORCE_INLINE double get0() const { return _mm_cvtsd_f64( vec ); } - FORCE_INLINE double get1() const { return _mm_cvtsd_f64( _mm_shuffle_pd( vec, vec, 1 ) ); }; -}; - -FORCE_INLINE FVec1 operator+ (const FVec1& a, const FVec1& b) { return _mm_add_sd( a, b ); } -FORCE_INLINE FVec1 operator- (const FVec1& a, const FVec1& b) { return _mm_sub_sd( a, b ); } -FORCE_INLINE FVec1 operator* (const FVec1& a, const FVec1& b) { return _mm_mul_sd( a, b ); } -FORCE_INLINE FVec1 operator/ (const FVec1& a, const FVec1& b) { return _mm_div_sd( a, b ); } -FORCE_INLINE int ftoi (const FVec1& a) { return _mm_cvttsd_si32(a); } -FORCE_INLINE IVec operator> (const FVec1& a, const FVec1& b) { return _mm_castpd_si128( _mm_cmpgt_sd( a, b ) ); } -#ifdef USE_FMA -FORCE_INLINE FVec1 mulSub(const FVec1& a, const FVec1& b, const FVec1& c) { return _mm_fmsub_sd(a, b, c); } -#endif - -FORCE_INLINE FVec operator- (const FVec& a, const FVec& b) { return _mm_sub_pd( a, b ); } -FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } -FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } -FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } -#ifndef __clang__ // Conflicts with builtin operator -FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } -FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } -FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } -#endif -#ifdef USE_FMA -FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& 
c ) { return _mm_fmsub_pd(a, b, c); } -#endif - -#ifdef USE_AVX - -template <> -struct IVecBase : VecStorage> -{ -protected: - FORCE_INLINE IVecBase() {} - FORCE_INLINE IVecBase( const vec_t& v) : VecStorage>( v ) {} -public: - FORCE_INLINE static vec_t zero() { return _mm256_setzero_si256(); } - - FORCE_INLINE int32 get0() const { return _mm_cvtsi128_si32(_mm256_castsi256_si128(vec)); } - - FORCE_INLINE void assignIf( const vec_t& val, const vec_t& mask ) { vec = _mm256_blendv_epi8(vec, val, mask); } - FORCE_INLINE void orIf(const vec_t& val, const vec_t& mask) - { - vec = _mm256_blendv_epi8(vec, val, mask); - //vec = _mm256_or_si256(vec, _mm256_and_si256(val,mask)); - } - - FORCE_INLINE __m128i lo128() const { return _mm256_castsi256_si128(vec); } - FORCE_INLINE __m128i hi128() const { return _mm256_extractf128_si256(vec, 1); } -}; - -template <> -struct IVec : IVecBase -{ - FORCE_INLINE IVec() {} - FORCE_INLINE IVec( int32 i ) : IVecBase( _mm256_set1_epi32( i ) ) {} - FORCE_INLINE IVec( const vec_t& v) : IVecBase( v ) {} - FORCE_INLINE IVec(uint32 u7, uint32 u6, uint32 u5, uint32 u4, uint32 u3, uint32 u2, uint32 u1, uint32 u0) : IVecBase(_mm256_set_epi32(u7, u6, u5, u4, u3, u2, u1, u0)) {} - - void setN( int32 i ) { vec = _mm256_set1_epi32( i ); } - - FORCE_INLINE int32 get1() const { return _mm256_extract_epi32(vec, 1); } - FORCE_INLINE int32 get2() const { return _mm256_extract_epi32(vec, 2); } - FORCE_INLINE int32 get3() const { return _mm256_extract_epi32(vec, 3); } - FORCE_INLINE int32 get4() const { return _mm256_extract_epi32(vec, 4); } - FORCE_INLINE int32 get5() const { return _mm256_extract_epi32(vec, 5); } - FORCE_INLINE int32 get6() const { return _mm256_extract_epi32(vec, 6); } - FORCE_INLINE int32 get7() const { return _mm256_extract_epi32(vec, 7); } - - FORCE_INLINE void setidx( const uint32 *bi, const IVec& idx ) - { - vec = _mm256_i32gather_epi32(reinterpret_cast(bi), idx, sizeof(uint32)); - } - - FORCE_INLINE void store( uint32 *pi ) const { _mm256_storeu_si256( reinterpret_cast(pi), vec ); } - - FORCE_INLINE int countbit() - { - return popcnt32(_mm256_movemask_ps(_mm256_castsi256_ps(vec))); - } -}; - -template <> -struct IVec : IVecBase -{ - FORCE_INLINE IVec() {} - FORCE_INLINE IVec( int32 i ) : IVecBase( _mm256_set1_epi64x( i ) ) {} - FORCE_INLINE IVec( const vec_t& v) : IVecBase( v ) {} - FORCE_INLINE IVec(uint64 u3, uint64 u2, uint64 u1, uint64 u0) : IVecBase(_mm256_set_epi64x(u3, u2, u1, u0)) {} - - void setN( int32 i ) { vec = _mm256_set1_epi64x( i ); } - - // extract the 4 32 bits integers no. 
0, 2, 4, 6 and store them in a __m128i - FORCE_INLINE IVec extractLo32s() const - { - union { - uint32 u32[4]; - __m128i u; - } mask = {0,2,4,6}; - //__m256 ps256 = _mm256_castsi256_ps(vec); - //__m128 lo128 = _mm256_castps256_ps128(ps256); - //__m128 hi128 = _mm256_extractf128_ps(ps256, 1); - //__m128 blend = _mm_shuffle_ps(lo128, hi128, 0 + (2<<2) + (0<<4) + (2<<6)); - __m256i blend = _mm256_permutevar8x32_epi32(vec, _mm256_castsi128_si256(mask.u)); - return _mm256_castsi256_si128(blend); - } - - //int32 get1() const { return _mm256_cvtsi256_si32( _mm256_shuffle_epi32( vec, 2 ) ); }; - FORCE_INLINE int32 get1() const { return _mm256_extract_epi32(vec, 2); } - - FORCE_INLINE void store( uint32 *pi ) const - { - extractLo32s().store(pi); - } - - FORCE_INLINE int countbit() - { - return popcnt32(_mm256_movemask_pd(_mm256_castsi256_pd(vec))); - } -}; - -template -FORCE_INLINE IVec operator>> (const IVec& a, unsigned n) { return _mm256_srli_epi32(a, n); } -template -FORCE_INLINE IVec operator<< (const IVec& a, unsigned n) { return _mm256_slli_epi32(a, n); } -template -FORCE_INLINE IVec operator& (const IVec& a, const IVec& b ) { return _mm256_and_si256( a, b ); } -template -FORCE_INLINE IVec operator| (const IVec& a, const IVec& b ) { return _mm256_or_si256( a, b ); } -template -FORCE_INLINE IVec operator^ (const IVec& a, const IVec& b ) { return _mm256_xor_si256( a, b ); } -template -FORCE_INLINE IVec min (const IVec& a, const IVec& b ) { return _mm256_min_epi32( a, b ); } - -FORCE_INLINE IVec operator+ (const IVec& a, const IVec& b ) { return _mm256_add_epi32( a, b ); } -FORCE_INLINE IVec operator- (const IVec& a, const IVec& b ) { return _mm256_sub_epi32( a, b ); } -FORCE_INLINE IVec operator+ (const IVec& a, const IVec& b ) { return _mm256_add_epi64( a, b ); } -FORCE_INLINE IVec operator- (const IVec& a, const IVec& b ) { return _mm256_sub_epi64( a, b ); } - - -typedef VecStorage> FVec256Float; - -template <> -struct FVec : FVec256Float -{ - FORCE_INLINE FVec() {} - FORCE_INLINE FVec( float f ) : FVec256Float( _mm256_set1_ps( f ) ) {} - FORCE_INLINE FVec( const float *v ) : FVec256Float( _mm256_loadu_ps( v ) ) {} - FORCE_INLINE FVec( const vec_t& v) : FVec256Float(v) {} - FORCE_INLINE FVec(float f7, float f6, float f5, float f4, float f3, float f2, float f1, float f0) : FVec256Float(_mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0)) {} - - //void set0( float f ) { vec = _mm256_load_ss( &f ); } - void setN( float f ) { vec = _mm256_set1_ps( f ); } - - FORCE_INLINE void setidx( const float *xi, const IVec& idx ) - { -#if 1 // use gather primitives - vec = _mm256_i32gather_ps (xi, idx, 4); -#elif 0 - uint32 i0 = idx.get0(); - uint32 i1 = idx.get1(); - uint32 i2 = idx.get2(); - uint32 i3 = idx.get3(); - uint32 i4 = idx.get4(); - uint32 i5 = idx.get5(); - uint32 i6 = idx.get6(); - uint32 i7 = idx.get7(); - vec = _mm256_set_ps( xi[i7], xi[i6], xi[i5], xi[i4], xi[i3], xi[i2], xi[i1], xi[i0] ); -#else - union { - __m256i vec; - uint32 ui32[8]; - } i; - i.vec = static_cast(idx); - vec = _mm256_set_ps(xi[i.ui32[7]], xi[i.ui32[6]], xi[i.ui32[5]], xi[i.ui32[4]], xi[i.ui32[3]], xi[i.ui32[2]], xi[i.ui32[1]], xi[i.ui32[0]]); -#endif - } - - FORCE_INLINE FVec lo128() const { return _mm256_castps256_ps128(vec); } - FORCE_INLINE FVec hi128() const { return _mm256_extractf128_ps(vec, 1); } - - //FORCE_INLINE float get0() const { return _mm256_cvtss_f32( vec ); } - //FORCE_INLINE float get1() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 1 ) ); } - //FORCE_INLINE float get2() const { return 
_mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 2 ) ); } - //FORCE_INLINE float get3() const { return _mm256_cvtss_f32( _mm256_shuffle_ps( vec, vec, 3 ) ); } -}; - -FORCE_INLINE FVec operator- (const FVec& a, const FVec& b) { return _mm256_sub_ps( a, b ); } -FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm256_mul_ps( a, b ); } -FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm256_div_ps( a, b ); } -FORCE_INLINE IVec ftoi (const FVec& a) { return _mm256_cvttps_epi32(a); } -FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm256_castps_si256( _mm256_cmp_ps( a, b, _CMP_LE_OS) ); } -FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm256_castps_si256( _mm256_cmp_ps( a, b, _CMP_GE_OS ) ); } -FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm256_castps_si256(_mm256_cmp_ps(a, b, _CMP_LT_OS )); } -#ifdef USE_FMA -FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm256_fmsub_ps(a, b, c); } -#endif - -typedef VecStorage> FVec256Double; - -template <> -struct FVec : FVec256Double -{ - FORCE_INLINE FVec() {} - FORCE_INLINE FVec( double d ) : FVec256Double( _mm256_set1_pd( d ) ) {} - FORCE_INLINE FVec( const double *v ) : FVec256Double( _mm256_loadu_pd( v ) ) {} - FORCE_INLINE FVec( const vec_t& v) : FVec256Double( v ) {} - FORCE_INLINE FVec(double d3, double d2, double d1, double d0) : FVec256Double(_mm256_set_pd(d3, d2, d1, d0)) {} - - //void set0( double f ) { vec = _mm256_load_sd( &f ); } - void setN( double f ) { vec = _mm256_set1_pd( f ); } - - FORCE_INLINE void setidx( const double *xi, const IVec& idx ) - { - vec = _mm256_i32gather_pd(xi, idx, 8); - } - - FORCE_INLINE void setidx( const double *xi, const IVec& idx ) - { - vec = _mm256_i64gather_pd(xi, idx, 8); - } - -// FORCE_INLINE double get0() const { return _mm256_cvtsd_f64( vec ); } -// FORCE_INLINE double get1() const { return _mm256_cvtsd_f64( _mm256_shuffle_pd( vec, vec, 1 ) ); }; -}; - -FORCE_INLINE FVec operator- (const FVec& a, const FVec& b) { return _mm256_sub_pd( a, b ); } -FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm256_mul_pd( a, b ); } -FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm256_div_pd( a, b ); } -FORCE_INLINE IVec ftoi (const FVec& a) { return _mm256_cvttpd_epi32(a); } -FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm256_castpd_si256(_mm256_cmp_pd( a, b, _CMP_LE_OS ) ); } -FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm256_castpd_si256(_mm256_cmp_pd(a, b, _CMP_LT_OS)); } -FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm256_castpd_si256(_mm256_cmp_pd( a, b, _CMP_GE_OS ) ); } -#ifdef USE_FMA -FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm256_fmsub_pd(a, b, c); } -#endif - -#endif - -} // namespace Details -} // namespace BinSearch -#endif // !defined(__aarch64__) diff --git a/include/Type.h b/include/Type.h deleted file mode 100644 index 2c0ab6ed6..000000000 --- a/include/Type.h +++ /dev/null @@ -1,221 +0,0 @@ - #pragma once - -#include -#include -#include - -#include "Portable.h" - -using std::size_t; - -namespace BinSearch { - -enum InstrSet { Scalar, SSE, AVX, Neon, VSX }; - -#define ALGOENUM(x, b) x, -enum Algos - { -#include "AlgoXCodes.h" - }; -#undef ALGOENUM - -namespace Details { - - template - struct InstrIntTraits; - - template - struct InstrFloatTraits; - - // base class for algorithm supporting the method: - // uint32 
scalar(T z) const - template - struct AlgoScalarBase; - - // base class for algorithm supporting the following methods, constants and definitions: - // static const uint32 nElem - // struct Constants; - // void initConstants(Constants& cst) const - // void vectorial(uint32 *pr, const T *pz, const Constants& cst) const - // The function vectorial processes nElem items - template - struct AlgoVecBase; - - template struct IntTraits; - - template <> struct IntTraits - { - typedef uint32 itype; - }; - template <> struct IntTraits - { - typedef uint64 itype; - }; - - template - struct Body - { - template - FORCE_INLINE static void iteration(const Expr& e, uint32 *ri, const T* zi, const typename Expr::Constants& cst) - { - e.vectorial(ri, zi, cst); - Body::template iteration(e, ri + D, zi + D, cst); - } - - }; - - template <> - struct Body<0> - { - template - FORCE_INLINE static void iteration(const Expr& e, uint32 *ri, const T* zi, const H&) - { - } - }; - - template - struct Loop - { - typedef Algo algo_type; - static const uint32 M = 4; - static const uint32 D = algo_type::nElem; - - FORCE_INLINE static void loop(const algo_type& e, uint32 *ri, const T* zi, uint32 n) - { - typename algo_type::Constants cst; - e.initConstants(cst); - - uint32 j = 0; - while (j + (D*M) <= n) { - Details::Body::template iteration(e, ri + j, zi + j, cst); - j += (D*M); - } - while (j + D <= n) { - e.vectorial(ri + j, zi + j, cst); - j += D; - } - while (D > 1 && j < n) { - ri[j] = e.scalar(zi[j]); - j += 1; - } - } - }; - - template - struct _Pipeliner - { - template - FORCE_INLINE static void go(const Expr& e, Data* d) - { - e.template run(d); - _Pipeliner::go(e, d); - } - }; - - template - struct _Pipeliner - { - template - FORCE_INLINE static void go(const Expr& e, Data* d) - { - } - }; - - template - struct Pipeliner - { - template - FORCE_INLINE static void go(const Expr& e, Data* d) - { - _Pipeliner::go(e, d); - } - }; - - -#if 1 - template - char is_complete_impl(char (*)[sizeof(T)]); - - template - long is_complete_impl(...); - - template - struct IsComplete - { - static const bool value = sizeof(is_complete_impl(0)) == sizeof(char); - }; -#else - template - std::true_type is_complete_impl(T *); - - std::false_type is_complete_impl(...); - - template - struct IsComplete : decltype(is_complete_impl(std::declval())) {}; -#endif - -template -struct AlgoScalarToVec : AlgoScalarBase -{ - typedef AlgoScalarBase base_t; - - AlgoScalarToVec(const typename base_t::Data& d) : base_t(d) {} - AlgoScalarToVec(const T* px, const uint32 n) : base_t(px, n) {} - - static const uint32 nElem = 1; - - struct Constants - { - }; - - void initConstants(Constants& cst) const - { - } - - FORCE_INLINE - void vectorial(uint32 *pr, const T *pz, const Constants& cst) const - { - *pr = base_t::scalar(*pz); - } -}; - -template -struct conditional { typedef T type; }; - -template -struct conditional { typedef F type; }; - -template -struct CondData -{ - FORCE_INLINE CondData(T x) : v(x) {} - FORCE_INLINE operator const T&() const { return v;} -private: - T v; -}; - -template -struct CondData -{ - FORCE_INLINE CondData(T) {} - FORCE_INLINE operator const T() const { return 0;} -}; - -template -struct BinAlgoBase : Details::conditional< Details::IsComplete>::value - , Details::AlgoVecBase - , Details::AlgoScalarToVec - >::type -{ - typedef typename Details::conditional< Details::IsComplete>::value - , Details::AlgoVecBase - , Details::AlgoScalarToVec - >::type base_t; - - BinAlgoBase(const T* px, const uint32 n) : base_t(px, n) {} - 
BinAlgoBase(const typename base_t::Data& d) : base_t(d) {} -}; - -} // namespace Details - -} // namespace BinSearch From 3f932b40cf675eccad4e41e52a084f7633c912a5 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:50:55 -0500 Subject: [PATCH 2/4] Fix --- csrc/cpu_ops.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index ed071395f..7fd1b1722 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include "cpu_ops.h" From f90467d5a77edbd1555757047ccb29a2c77a31a2 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:52:53 -0500 Subject: [PATCH 3/4] Fix lint --- csrc/cpu_ops.cpp | 4 ++-- csrc/cpu_ops.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index 7fd1b1722..836624f27 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,9 +1,9 @@ -#include +#include "cpu_ops.h" #include #include #include +#include #include -#include "cpu_ops.h" #ifdef HAS_OPENMP #include diff --git a/csrc/cpu_ops.h b/csrc/cpu_ops.h index bf538c3ee..3ecdad99b 100644 --- a/csrc/cpu_ops.h +++ b/csrc/cpu_ops.h @@ -1,13 +1,13 @@ #ifndef BITSANDBYTES_CPU_OPS_H #define BITSANDBYTES_CPU_OPS_H +#include "common.h" #include #include #include #include #include #include -#include "common.h" #if defined(_OPENMP) #include From dcbfcd3ebed239927c5874b9cefefa98313c538f Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Tue, 2 Dec 2025 12:57:33 -0500 Subject: [PATCH 4/4] Fix --- csrc/ops.cu | 2 -- csrc/ops.hip | 2 -- 2 files changed, 4 deletions(-) diff --git a/csrc/ops.cu b/csrc/ops.cu index 6b9fa87bf..7b0c60cca 100644 --- a/csrc/ops.cu +++ b/csrc/ops.cu @@ -3,7 +3,6 @@ // This source code is licensed under the MIT license found in the // LICENSE file in the root directory of this source tree. -#include #include #include #include @@ -12,7 +11,6 @@ #define ERR_NOT_IMPLEMENTED 100 -using namespace BinSearch; using std::cout; using std::endl; diff --git a/csrc/ops.hip b/csrc/ops.hip index 794445bc7..26a2362e3 100644 --- a/csrc/ops.hip +++ b/csrc/ops.hip @@ -15,13 +15,11 @@ #include #endif #include -#include #include #include #define ERR_NOT_IMPLEMENTED 100 -using namespace BinSearch; using std::cout; using std::endl;
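
---

For context on what the removed code did: the deleted csrc/common.cpp quantize_block() normalized each value by the block absmax and then used BinSearch's bin_searcher->scalar() to find the code-table entry to the left of the normalized value, followed by a nearest-neighbour check. The sketch below is an illustrative, standard-library-only version of that closest-index lookup using std::upper_bound; the helper name find_closest_code is made up for illustration and is not claimed to match the replacement logic now living in csrc/cpu_ops.cpp.

    #include <algorithm>
    #include <cmath>

    // Return the index of the entry in `code` closest to `normed_value`.
    // `code` must be sorted ascending and hold `n` entries (256 for an 8-bit code table).
    static unsigned char find_closest_code(const float* code, int n, float normed_value) {
        // First entry strictly greater than normed_value; the one to its left is the floor.
        const float* it = std::upper_bound(code, code + n, normed_value);
        long long idx = (it - code) - 1;
        if (idx < 0)
            return 0; // normed_value is below the smallest code entry
        if (idx < n - 1) {
            // The floor entry is not necessarily the closest; compare with its right neighbour.
            float dist_left = std::fabs(normed_value - code[idx]);
            float dist_right = std::fabs(normed_value - code[idx + 1]);
            if (dist_right < dist_left)
                ++idx;
        }
        return static_cast<unsigned char>(idx);
    }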