
Commit 809ebb6

dsharlet and jiawen authored
Add z order traversal helpers (#97)
* Make split results and index iterators look more like containers * Let apply work with anything tuple-like * Add more split tests * Add z order helpers * These need to be inlined * Add z order version of matrix multiply * Hide for_each_index_z_order in internal * Dead code * Fix comment * It's better to do ko outermost and only z-order io, jo * Rename to match for_each_index_in_order * Pass z by value to avoid save/restore, makes calls tail recursive(?) * Tweak tiling and comments * No need to pass functions by value here * Add profiler output to gitignore * Fix special test case * Apply suggestions from code review Co-authored-by: Jiawen (Kevin) Chen <jiawen@users.noreply.github.com> * Make index_iterator a fully implemented random_access_iterator * Add blas comparison * Don't depend on ::size() for range * Update comment * Add prefetching * Add performance data * Test edge cases --------- Co-authored-by: Jiawen (Kevin) Chen <jiawen@users.noreply.github.com>
1 parent ba14d8a commit 809ebb6

File tree: 10 files changed (+442, -40 lines)


.gitignore

Lines changed: 5 additions & 1 deletion
@@ -35,4 +35,8 @@ docs/*
 *~
 
 # Visual Studio folder status
-.vs
+.vs
+
+# perf files
+perf.data
+perf.data.old

Makefile

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ CFLAGS := $(CFLAGS) -O2 -ffast-math -fstrict-aliasing -fPIE
 CXXFLAGS := $(CXXFLAGS) -std=c++14 -Wall
 LDFLAGS := $(LDFLAGS)
 
-DEPS := include/array/array.h include/array/ein_reduce.h include/array/image.h include/array/matrix.h
+DEPS := include/array/array.h include/array/ein_reduce.h include/array/image.h include/array/matrix.h include/array/z_order.h
 
 TEST_SRC := $(filter-out test/errors.cpp, $(wildcard test/*.cpp))
 TEST_OBJ := $(TEST_SRC:%.cpp=obj/%.o)

examples/linear_algebra/Makefile

Lines changed: 7 additions & 2 deletions
@@ -2,11 +2,16 @@ CFLAGS := $(CFLAGS) -O2 -march=native -ffast-math -fstrict-aliasing -fno-excepti
 CXXFLAGS := $(CXXFLAGS) -std=c++14 -Wall
 LDFLAGS := $(LDFLAGS)
 
-DEPS := ../../include/array/array.h ../../include/array/matrix.h ../benchmark.h ../../include/array/ein_reduce.h
+ifneq ($(BLAS), )
+CFLAGS += -DBLAS
+LDFLAGS += -lblas
+endif
+
+DEPS := ../../include/array/array.h ../../include/array/matrix.h ../benchmark.h ../../include/array/ein_reduce.h ../../include/array/z_order.h
 
 bin/%: %.cpp $(DEPS)
         mkdir -p $(@D)
-        $(CXX) -I../../include -I../ -o $@ $< $(CFLAGS) $(CXXFLAGS) -lstdc++ -lm
+        $(CXX) -I../../include -I../ -o $@ $< $(CFLAGS) $(CXXFLAGS) -lstdc++ -lm $(LDFLAGS)
 
 .PHONY: all clean test
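Note: with the conditional above, the cblas comparison is opt-in. Invoking make with a non-empty BLAS variable (for example, make BLAS=1) adds -DBLAS to CFLAGS and links -lblas, assuming a system BLAS library is available.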

examples/linear_algebra/matrix.cpp

Lines changed: 93 additions & 9 deletions
@@ -14,12 +14,17 @@
 
 #include "array/matrix.h"
 #include "array/ein_reduce.h"
+#include "array/z_order.h"
 #include "benchmark.h"
 
 #include <functional>
 #include <iostream>
 #include <random>
 
+#ifdef BLAS
+#include "cblas.h"
+#endif
+
 using namespace nda;
 
 // Make it easier to read the generated assembly for these functions.
@@ -209,9 +214,7 @@ NOINLINE void multiply_reduce_tiles(const_matrix_ref<T> A, const_matrix_ref<T> B
   }
 }
 
-// With clang -O2, this generates (almost) the same fast inner loop as the above!!
-// It only spills one accumulator register, and produces statistically identical
-// performance.
+// With clang -O2, this generates exactly the same fast inner loop as the above!!
 template <typename T>
 NOINLINE void multiply_ein_reduce_tiles(
     const_matrix_ref<T> A, const_matrix_ref<T> B, matrix_ref<T> C) {
@@ -259,13 +262,90 @@ NOINLINE void multiply_ein_reduce_tiles(
   }
 }
 
+// This is similar to the above, but:
+// - It additionally splits the reduction dimension k,
+// - It traverses the io, jo loops in z order, to improve locality,
+// - It prefetches in the inner loop.
+// This version achieves ~90% of the theoretical peak performance of my AMD Ryzen 5800X.
+template <typename T>
+NOINLINE void multiply_reduce_tiles_z_order(const_matrix_ref<T> A, const_matrix_ref<T> B, matrix_ref<T> C) {
+  // Adjust this depending on the target architecture. For AVX2,
+  // vectors are 256-bit.
+  constexpr index_t vector_size = 32 / sizeof(T);
+  constexpr index_t cache_line_size = 64 / sizeof(T);
+
+  // We want the tiles to be as big as possible without spilling any
+  // of the accumulator registers to the stack.
+  constexpr index_t tile_rows = 4;
+  constexpr index_t tile_cols = vector_size * 3;
+  constexpr index_t tile_k = 256;
+
+  // TODO: It seems like z-ordering all of io, jo, ko should be best...
+  // But this seems better, even without the added convenience for initializing
+  // the output.
+  for (auto ko : split(A.j(), tile_k)) {
+    auto split_i = split<tile_rows>(C.i());
+    auto split_j = split<tile_cols>(C.j());
+    for_all_in_z_order(std::make_tuple(split_i, split_j), [&](auto io, auto jo) {
+      // Make a reference to this tile of the output.
+      auto C_ijo = C(io, jo);
+
+      // Define an accumulator buffer.
+      T buffer[tile_rows * tile_cols] = {0};
+      auto accumulator = make_array_ref(buffer, make_compact(C_ijo.shape()));
+
+      // Perform the matrix multiplication for this tile.
+      for (index_t k : ko) {
+        for (index_t i = 0; i < io.extent(); i += cache_line_size) {
+          _mm_prefetch(&A(io.min() + i, k + 8), _MM_HINT_T0);
+        }
+        for (index_t j = 0; j < jo.extent(); j += cache_line_size) {
+          _mm_prefetch(&B(k + 4, jo.min() + j), _MM_HINT_T0);
+        }
+        for (index_t i : io) {
+          for (index_t j : jo) {
+            accumulator(i, j) += A(i, k) * B(k, j);
+          }
+        }
+      }
+
+      // Add the accumulators for this iteration of ko to the output.
+      // Because we split the K dimension, we are doing this more than once per
+      // tile of output. To avoid adding to overlapping regions more than once
+      // (when `split<>` is applied to a dimension not divided by the split factor),
+      // we need to only initialize the result for the first iteration of ko.
+      if (ko.min() == A.j().min()) {
+        for (index_t i : io) {
+          for (index_t j : jo) {
+            C_ijo(i, j) = accumulator(i, j);
+          }
+        }
+      } else {
+        for (index_t i : io) {
+          for (index_t j : jo) {
+            C_ijo(i, j) += accumulator(i, j);
+          }
+        }
+      }
+    });
+  }
+}
+
+#ifdef BLAS
+void multiply_blas(const_matrix_ref<float> A, const_matrix_ref<float> B, matrix_ref<float> C) {
+  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, C.i().extent(), C.j().extent(),
+      A.j().extent(), 1.0, A.base(), A.i().stride(), B.base(), B.i().stride(), 0.0, C.base(),
+      C.i().stride());
+}
+#endif
+
 float relative_error(float A, float B) { return std::abs(A - B) / std::max(A, B); }
 
 int main(int, const char**) {
   // Define two input matrices.
-  constexpr index_t M = 32;
-  constexpr index_t K = 10000;
-  constexpr index_t N = 64;
+  constexpr index_t M = 384;
+  constexpr index_t K = 1536;
+  constexpr index_t N = 384;
   matrix<float> A({M, K});
   matrix<float> B({K, N});
 
@@ -278,8 +358,7 @@ int main(int, const char**) {
   generate(B, [&]() { return uniform(rng); });
 
   matrix<float> c_ref({M, N});
-  double ref_time = benchmark([&]() { multiply_ref(A.data(), B.data(), c_ref.data(), M, K, N); });
-  std::cout << "reference time: " << ref_time * 1e3 << " ms" << std::endl;
+  multiply_ref(A.data(), B.data(), c_ref.data(), M, K, N);
 
   struct version {
     const char* name;
@@ -294,12 +373,17 @@ int main(int, const char**) {
       {"ein_reduce_matrix", multiply_ein_reduce_matrix<float>},
       {"reduce_tiles", multiply_reduce_tiles<float>},
       {"ein_reduce_tiles", multiply_ein_reduce_tiles<float>},
+      {"reduce_tiles_z_order", multiply_reduce_tiles_z_order<float>},
+#ifdef BLAS
+      {"blas", multiply_blas},
+#endif
   };
   for (auto i : versions) {
     // Compute the result using all matrix multiply methods.
    matrix<float> C({M, N});
    double time = benchmark([&]() { i.fn(A.cref(), B.cref(), C.ref()); });
-    std::cout << i.name << " time: " << time * 1e3 << " ms" << std::endl;
+    double flops = M * N * K * 2 / time;
+    std::cout << i.name << " time: " << time * 1e3 << " ms, " << flops / 1e9 << " GFLOP/s" << std::endl;
 
     // Verify the results from all methods are equal.
     const float tolerance = 1e-4f;
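For readers who have not seen the new header, here is a minimal sketch of how the traversal helper is driven. The name for_all_in_z_order and its call shape (a tuple of split results plus a callable taking one interval per dimension) are taken from the call in matrix.cpp above; the interval<> (min, extent) constructor is assumed from its uses elsewhere in this commit. Treat it as illustrative rather than canonical.

// Minimal usage sketch for the z-order traversal helper (illustrative only).
#include <iostream>
#include <tuple>

#include "array/array.h"
#include "array/z_order.h"

using namespace nda;

int main() {
  interval<> rows(0, 16);  // indices [0, 16)
  interval<> cols(0, 16);  // indices [0, 16)

  // Split each dimension into tiles of 4 indices.
  auto split_i = split<4>(rows);
  auto split_j = split<4>(cols);

  // Visit the (io, jo) tiles in z (Morton) order rather than row-major order.
  for_all_in_z_order(std::make_tuple(split_i, split_j), [&](auto io, auto jo) {
    std::cout << "tile rows [" << io.min() << ", " << io.max() << "], "
              << "cols [" << jo.min() << ", " << jo.max() << "]\n";
  });
  return 0;
}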

include/array/array.h

Lines changed: 72 additions & 23 deletions
@@ -198,10 +198,33 @@ class index_iterator {
   }
 
   NDARRAY_INLINE NDARRAY_HOST_DEVICE index_iterator operator++(int) { return index_iterator(i_++); }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_iterator operator--(int) { return index_iterator(i_--); }
   NDARRAY_INLINE NDARRAY_HOST_DEVICE index_iterator& operator++() {
     ++i_;
     return *this;
   }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_iterator& operator--() {
+    --i_;
+    return *this;
+  }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_iterator& operator+=(index_t r) {
+    i_ += r;
+    return *this;
+  }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_iterator& operator-=(index_t r) {
+    i_ -= r;
+    return *this;
+  }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_iterator operator+(index_t r) {
+    return index_iterator(i_ + r);
+  }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_iterator operator-(index_t r) {
+    return index_iterator(i_ - r);
+  }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_t operator-(const index_iterator& r) {
+    return i_ - r.i_;
+  }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_t operator[](index_t n) const { return i_ + n; }
 };
 
 template <index_t Min, index_t Extent, index_t Stride>
@@ -271,6 +294,7 @@ class interval {
   NDARRAY_INLINE NDARRAY_HOST_DEVICE void set_min(index_t min) { min_ = min; }
   /** Get or set the number of indices in this interval. */
   NDARRAY_INLINE NDARRAY_HOST_DEVICE index_t extent() const { return extent_; }
+  NDARRAY_INLINE NDARRAY_HOST_DEVICE index_t size() const { return extent_; }
   NDARRAY_INLINE NDARRAY_HOST_DEVICE void set_extent(index_t extent) { extent_ = extent; }
 
   /** Get or set the last index in this interval. */
@@ -433,6 +457,7 @@ class dim : protected interval<Min_, Extent_> {
   using base_range::begin;
   using base_range::end;
   using base_range::extent;
+  using base_range::size;
   using base_range::is_in_range;
   using base_range::max;
   using base_range::min;
@@ -490,6 +515,8 @@ using broadcast_dim = dim<Min, Extent, 0>;
 namespace internal {
 
 // An iterator for a range of intervals.
+// This is like a random access iterator in that it can move forward in constant time,
+// but unlike a random access iterator, it cannot be moved in reverse.
 template <index_t InnerExtent = dynamic>
 class split_iterator {
   fixed_interval<InnerExtent> i;
@@ -507,47 +534,69 @@ class split_iterator {
   }
 
   NDARRAY_HOST_DEVICE fixed_interval<InnerExtent> operator*() const { return i; }
+  NDARRAY_HOST_DEVICE const fixed_interval<InnerExtent>* operator->() const { return &i; }
 
-  NDARRAY_HOST_DEVICE split_iterator& operator++() {
+  NDARRAY_HOST_DEVICE split_iterator& operator+=(index_t n) {
+    assert(n >= 0);
     if (is_static(InnerExtent)) {
       // When the extent of the inner split is a compile-time constant,
       // we can't shrink the out of bounds interval. Instead, shift the min,
       // assuming the outer dimension is bigger than the inner extent.
-      i.set_min(i.min() + InnerExtent);
+      i.set_min(i.min() + InnerExtent * n);
       // Only shift the min when this straddles the end of the buffer,
       // so the iterator can advance to the end (one past the max).
       if (i.min() <= outer_max && i.max() > outer_max) { i.set_min(outer_max - InnerExtent + 1); }
     } else {
       // When the extent of the inner split is not a compile-time constant,
       // we can just modify the extent.
-      i.set_min(i.min() + i.extent());
+      i.set_min(i.min() + i.extent() * n);
       index_t max = min(i.max(), outer_max);
       i.set_extent(max - i.min() + 1);
     }
     return *this;
   }
+  NDARRAY_HOST_DEVICE split_iterator operator+(index_t n) const {
+    split_iterator<InnerExtent> result(*this);
+    return result += n;
+  }
+  NDARRAY_HOST_DEVICE split_iterator& operator++() {
+    return *this += 1;
+  }
   NDARRAY_HOST_DEVICE split_iterator operator++(int) {
     split_iterator<InnerExtent> result(*this);
-    ++*this;
+    *this += 1;
     return result;
   }
+
+  NDARRAY_HOST_DEVICE index_t operator-(const split_iterator& r) const {
+    return r.i.extent() > 0 ? (i.max() - r.i.min() + r.i.extent() - i.extent()) / r.i.extent() : 0;
+  }
+
+  NDARRAY_HOST_DEVICE fixed_interval<InnerExtent> operator[](index_t n) const {
+    split_iterator result(*this);
+    result += n;
+    return *result;
+  }
 };
 
-// TODO: Remove this when std::iterator_range is standard.
-template <class T>
-class iterator_range {
-  T begin_;
-  T end_;
+template <index_t InnerExtent = dynamic>
+class split_result {
+public:
+  using iterator = split_iterator<InnerExtent>;
+
+private:
+  iterator begin_;
+  iterator end_;
 
 public:
-  NDARRAY_HOST_DEVICE iterator_range(T begin, T end) : begin_(begin), end_(end) {}
+  NDARRAY_HOST_DEVICE split_result(iterator begin, iterator end) : begin_(begin), end_(end) {}
 
-  NDARRAY_HOST_DEVICE T begin() const { return begin_; }
-  NDARRAY_HOST_DEVICE T end() const { return end_; }
-};
+  NDARRAY_HOST_DEVICE iterator begin() const { return begin_; }
+  NDARRAY_HOST_DEVICE iterator end() const { return end_; }
 
-template <index_t InnerExtent = dynamic>
-using split_iterator_range = iterator_range<split_iterator<InnerExtent>>;
+  NDARRAY_HOST_DEVICE index_t size() const { return end_ - begin_; }
+  NDARRAY_HOST_DEVICE iterator operator[](index_t i) const { return begin_ + i; }
+};
 
 } // namespace internal
 
@@ -562,14 +611,14 @@ using split_iterator_range = iterator_range<split_iterator<InnerExtent>>;
  * - `split<5>(interval<>(0, 12))` produces the intervals `[0, 5)`,
  * `[5, 10)`, `[7, 12)`. Note the last two intervals overlap. */
 template <index_t InnerExtent, index_t Min, index_t Extent>
-NDARRAY_HOST_DEVICE internal::split_iterator_range<InnerExtent> split(
+NDARRAY_HOST_DEVICE internal::split_result<InnerExtent> split(
     const interval<Min, Extent>& v) {
   assert(v.extent() >= InnerExtent);
   return {{fixed_interval<InnerExtent>(v.min()), v.max()},
       {fixed_interval<InnerExtent>(v.max() + 1), v.max()}};
 }
 template <index_t InnerExtent, index_t Min, index_t Extent, index_t Stride>
-NDARRAY_HOST_DEVICE internal::split_iterator_range<InnerExtent> split(
+NDARRAY_HOST_DEVICE internal::split_result<InnerExtent> split(
    const dim<Min, Extent, Stride>& v) {
   return split<InnerExtent>(interval<Min, Extent>(v.min(), v.extent()));
 }
@@ -585,13 +634,13 @@ NDARRAY_HOST_DEVICE internal::split_iterator_range<InnerExtent> split(
 // avoid some conversion messes. dim<Min, Extent> probably can't implicitly
 // convert to interval<>.
 template <index_t Min, index_t Extent>
-NDARRAY_HOST_DEVICE internal::split_iterator_range<> split(
+NDARRAY_HOST_DEVICE internal::split_result<> split(
     const interval<Min, Extent>& v, index_t inner_extent) {
   return {{interval<>(v.min(), internal::min(inner_extent, v.extent())), v.max()},
       {interval<>(v.max() + 1, 0), v.max()}};
 }
 template <index_t Min, index_t Extent, index_t Stride>
-NDARRAY_HOST_DEVICE internal::split_iterator_range<> split(
+NDARRAY_HOST_DEVICE internal::split_result<> split(
     const dim<Min, Extent, Stride>& v, index_t inner_extent) {
   return split(interval<Min, Extent>(v.min(), v.extent()), inner_extent);
 }
@@ -608,10 +657,10 @@ NDARRAY_INLINE NDARRAY_HOST_DEVICE auto apply(Fn&& fn, const Args& args, index_s
     -> decltype(fn(std::get<Is>(args)...)) {
   return fn(std::get<Is>(args)...);
 }
-template <class Fn, class... Args>
-NDARRAY_INLINE NDARRAY_HOST_DEVICE auto apply(Fn&& fn, const std::tuple<Args...>& args)
-    -> decltype(internal::apply(fn, args, make_index_sequence<sizeof...(Args)>())) {
-  return internal::apply(fn, args, make_index_sequence<sizeof...(Args)>());
+template <class Fn, class Args>
+NDARRAY_INLINE NDARRAY_HOST_DEVICE auto apply(Fn&& fn, const Args& args)
+    -> decltype(internal::apply(fn, args, make_index_sequence<std::tuple_size<Args>::value>())) {
+  return internal::apply(fn, args, make_index_sequence<std::tuple_size<Args>::value>());
 }
 
 template <class Fn, class... Args>
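Taken together, these hunks make split results and index iterators behave like containers and random access iterators. The sketch below exercises only members shown in the hunks above (interval::size, split_result::size and operator[], and the new index_iterator operators); the interval<> (min, extent) constructor is assumed from its uses elsewhere in this commit, so treat it as illustrative rather than canonical.

// Illustrative only: exercises the container-like members added above.
#include <cassert>

#include "array/array.h"

using namespace nda;

int main() {
  interval<> x(0, 12);  // 12 indices: 0..11 (min/extent constructor assumed).
  assert(x.size() == x.extent());  // size() is now a synonym for extent().

  // Per the doc comment above, split<5> of 12 indices yields [0, 5), [5, 10), [7, 12).
  auto tiles = split<5>(x);
  assert(tiles.size() == 3);  // Split results now know their size...
  auto last = *tiles[2];      // ...and can be indexed like a container.
  assert(last.min() == 7 && last.extent() == 5);

  // index_iterator is now random access: +=, difference, and dereference.
  auto it = x.begin();
  it += 4;
  assert(*it == 4);
  assert(x.end() - x.begin() == x.size());
  return 0;
}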
