diff --git a/.gitignore b/.gitignore index a08b8e8dd7f3..888235a389d8 100644 --- a/.gitignore +++ b/.gitignore @@ -240,6 +240,9 @@ xcuserdata # NeoVim + clangd .cache +# CCLS +.ccls-cache + # Emacs tags TAGS diff --git a/Makefile b/Makefile index 54c61a622ae8..a928cd9b81bb 100644 --- a/Makefile +++ b/Makefile @@ -535,6 +535,7 @@ SOURCE_FILES = \ IRVisitor.cpp \ JITModule.cpp \ Lambda.cpp \ + LegalizeVectors.cpp \ Lerp.cpp \ LICM.cpp \ LLVM_Output.cpp \ @@ -737,6 +738,7 @@ HEADER_FILES = \ WasmExecutor.h \ JITModule.h \ Lambda.h \ + LegalizeVectors.h \ Lerp.h \ LICM.h \ LLVM_Output.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index af419323b24e..cadfd608236a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -62,12 +62,14 @@ target_sources( Associativity.h AsyncProducers.h AutoScheduleUtils.h + BoundConstantExtentLoops.h + BoundSmallAllocations.h BoundaryConditions.h Bounds.h BoundsInference.h - BoundConstantExtentLoops.h - BoundSmallAllocations.h Buffer.h + CPlusPlusMangle.h + CSE.h Callable.h CanonicalizeGPUVars.h ClampUnsafeAccesses.h @@ -79,18 +81,16 @@ target_sources( CodeGen_LLVM.h CodeGen_Metal_Dev.h CodeGen_OpenCL_Dev.h - CodeGen_Posix.h CodeGen_PTX_Dev.h + CodeGen_Posix.h CodeGen_PyTorch.h CodeGen_Targets.h CodeGen_Vulkan_Dev.h CodeGen_WebGPU_Dev.h CompilerLogger.h ConciseCasts.h - CPlusPlusMangle.h ConstantBounds.h ConstantInterval.h - CSE.h Debug.h DebugArguments.h DebugToFile.h @@ -127,6 +127,13 @@ target_sources( Generator.h HexagonOffload.h HexagonOptimize.h + IR.h + IREquality.h + IRMatch.h + IRMutator.h + IROperator.h + IRPrinter.h + IRVisitor.h ImageParam.h InferArguments.h InjectHostDevBufferCopies.h @@ -135,19 +142,13 @@ target_sources( IntegerDivisionTable.h Interval.h IntrusivePtr.h - IR.h - IREquality.h - IRMatch.h - IRMutator.h - IROperator.h - IRPrinter.h - IRVisitor.h JITModule.h - Lambda.h - Lerp.h LICM.h LLVM_Output.h LLVM_Runtime_Linker.h + Lambda.h + LegalizeVectors.h + Lerp.h LoopCarry.h LoopPartitioningDirective.h Lower.h @@ -173,8 +174,8 @@ target_sources( PurifyIndexMath.h PythonExtensionGen.h Qualify.h - Random.h RDom.h + Random.h Realization.h RealizationOrder.h RebaseLoopsToZero.h @@ -320,6 +321,7 @@ target_sources( IRVisitor.cpp JITModule.cpp Lambda.cpp + LegalizeVectors.cpp Lerp.cpp LICM.cpp LLVM_Output.cpp diff --git a/src/CSE.cpp b/src/CSE.cpp index c2a46d93bc4d..6051e5e9cf62 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -33,6 +33,11 @@ bool should_extract(const Expr &e, bool lift_all) { return false; } + if (const Call *c = e.as<Call>()) { // Calls with side effects should not be moved.
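+ // (Calls to other Halide Funcs are pure by construction, so they are safe to lift as well.)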
+ return c->is_pure() || c->call_type == Call::Halide; + } + if (lift_all) { return true; } diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 05b68447b6a4..852721e0077d 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1186,15 +1186,16 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, create_bitcast(a_call->getArgOperand(1), native_ty), create_bitcast(a_call->getArgOperand(0), native_ty), indices); } else if (ShuffleVectorInst *a_shuffle = dyn_cast<ShuffleVectorInst>(a)) { - bool is_identity = true; - for (int i = 0; i < a_elements; i++) { - int mask_i = a_shuffle->getMaskValue(i); - is_identity = is_identity && (mask_i == i || mask_i == -1); - } - if (is_identity) { - return shuffle_vectors(a_shuffle->getOperand(0), - a_shuffle->getOperand(1), indices); + std::vector<int> new_indices(indices.size()); + for (size_t i = 0; i < indices.size(); i++) { + if (indices[i] != -1) { + new_indices[i] = a_shuffle->getMaskValue(indices[i]); + } else { + new_indices[i] = -1; + } } + return shuffle_vectors(a_shuffle->getOperand(0), + a_shuffle->getOperand(1), new_indices); } } @@ -1516,7 +1517,11 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) { vector<int> i8_indices(indices.size() * replicate); for (size_t i = 0; i < indices.size(); i++) { for (int j = 0; j < replicate; j++) { - i8_indices[i * replicate + j] = indices[i] * replicate + j; + if (indices[i] == -1) { + i8_indices[i * replicate + j] = -1; // Replicate the don't-care. + } else { + i8_indices[i * replicate + j] = indices[i] * replicate + j; + } } } Value *result = vdelta(i8_lut, i8_indices); @@ -1556,6 +1561,7 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) { Value *ret = nullptr; for (int i = 0; i < lut_elements; i += native_elements) { Value *lut_i = slice_vector(lut, i, native_elements); + internal_assert(get_vector_num_elements(lut_i->getType()) == native_elements); vector<int> indices_i(native_elements); vector<Constant *> mask(native_elements); bool all_used = true; diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 32984f3f2e6f..aa58a94ec4ca 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5007,10 +5007,11 @@ Value *CodeGen_LLVM::shuffle_vectors(Value *a, Value *b, } // Check for type identity *after* normalizing to fixed vectors internal_assert(a->getType() == b->getType()); + int elements_a = get_vector_num_elements(a->getType()); vector<Constant *> llvm_indices(indices.size()); for (size_t i = 0; i < llvm_indices.size(); i++) { if (indices[i] >= 0) { - internal_assert(indices[i] < get_vector_num_elements(a->getType()) * 2); + internal_assert(indices[i] < elements_a * 2) << indices[i] << " " << elements_a * 2; llvm_indices[i] = ConstantInt::get(i32_t, indices[i]); } else { // Only let -1 be undef.
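// Aside: a minimal sketch of the mask composition used by the CodeGen_Hexagon
// change above (compose_masks is a hypothetical standalone helper, not part of
// this patch). The outer index selects a lane of the inner shuffle's result,
// so the composed mask simply looks that lane up in the inner mask,
// propagating -1 ("don't care") lanes:
//
//     std::vector<int> compose_masks(const std::vector<int> &outer,
//                                    const llvm::ShuffleVectorInst *inner) {
//         std::vector<int> composed(outer.size());
//         for (size_t i = 0; i < outer.size(); i++) {
//             composed[i] = (outer[i] == -1) ? -1 : inner->getMaskValue(outer[i]);
//         }
//         return composed;
//     }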
diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 671f923ec183..19ba6c348ff9 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2054,31 +2054,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { debug(3) << "\n"; if (arg_ids.size() == 1) { - // 1 argument, just do a simple assignment via a cast SpvId result_id = cast_type(op->type, op->vectors[0].type(), arg_ids[0]); builder.update_id(result_id); } else if (arg_ids.size() == 2) { - - // 2 arguments, use a composite insert to update even and odd indices - uint32_t even_idx = 0; - uint32_t odd_idx = 1; - SpvFactory::Indices even_indices; - SpvFactory::Indices odd_indices; - for (int i = 0; i < op_lanes; ++i) { - even_indices.push_back(even_idx); - odd_indices.push_back(odd_idx); - even_idx += 2; - odd_idx += 2; + // 2 arguments, use vector-shuffle with logical indices indexing into (vec1[0], vec1[1], ..., vec2[0], vec2[1], ...) + SpvFactory::Indices logical_indices; + for (int i = 0; i < arg_lanes; ++i) { + logical_indices.push_back(uint32_t(i)); + logical_indices.push_back(uint32_t(i + arg_lanes)); } SpvId type_id = builder.declare_type(op->type); - SpvId value_id = builder.declare_null_constant(op->type); - SpvId partial_id = builder.reserve_id(SpvResultId); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_insert(type_id, partial_id, arg_ids[0], value_id, even_indices)); - builder.append(SpvFactory::composite_insert(type_id, result_id, arg_ids[1], partial_id, odd_indices)); + builder.append(SpvFactory::vector_shuffle(type_id, result_id, arg_ids[0], arg_ids[1], logical_indices)); builder.update_id(result_id); } else { @@ -2108,7 +2098,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { } else if (op->is_extract_element()) { int idx = op->indices[0]; internal_assert(idx >= 0); - internal_assert(idx <= op->vectors[0].type().lanes()); + internal_assert(idx < op->vectors[0].type().lanes()); if (op->vectors[0].type().is_vector()) { SpvFactory::Indices indices = {(uint32_t)idx}; SpvId type_id = builder.declare_type(op->type); diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index f7a5b5f49aa8..243760e9d050 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -299,6 +299,10 @@ class Deinterleaver : public IRGraphMutator { } else { Type t = op->type.with_lanes(new_lanes); + internal_assert((op->type.lanes() - starting_lane + lane_stride - 1) / lane_stride == new_lanes) + << "Deinterleaving with lane stride " << lane_stride << " and starting lane " << starting_lane + << " for var of Type " << op->type << " to " << t << " drops lanes unexpectedly."
+ << " Deinterleaver probably recursed too deep into types of different lane count."; if (external_lets.contains(op->name) && starting_lane == 0 && lane_stride == 2) { @@ -393,8 +397,12 @@ class Deinterleaver : public IRGraphMutator { int index = indices.front(); for (const auto &i : op->vectors) { if (index < i.type().lanes()) { - ScopedValue lane(starting_lane, index); - return mutate(i); + if (i.type().lanes() == op->type.lanes()) { + ScopedValue scoped_starting_lane(starting_lane, index); + return mutate(i); + } else { + return Shuffle::make(op->vectors, indices); + } } index -= i.type().lanes(); } @@ -406,10 +414,18 @@ class Deinterleaver : public IRGraphMutator { }; Expr deinterleave(Expr e, int starting_lane, int lane_stride, int new_lanes, const Scope<> &lets) { + debug(3) << "Deinterleave " + << "(start:" << starting_lane << ", stide:" << lane_stride << ", new_lanes:" << new_lanes << "): " + << e << " of Type: " << e.type() << "\n"; + Type original_type = e.type(); e = substitute_in_all_lets(e); Deinterleaver d(starting_lane, lane_stride, new_lanes, lets); e = d.mutate(e); e = common_subexpression_elimination(e); + Type final_type = e.type(); + int expected_lanes = (original_type.lanes() + lane_stride - starting_lane - 1) / lane_stride; + internal_assert(original_type.code() == final_type.code()) << "Underlying types not identical after interleaving."; + internal_assert(expected_lanes == final_type.lanes()) << "Number of lanes incorrect after interleaving: " << final_type.lanes() << "while expected was " << expected_lanes << "."; return simplify(e); } @@ -420,12 +436,12 @@ Expr extract_odd_lanes(const Expr &e, const Scope<> &lets) { Expr extract_even_lanes(const Expr &e, const Scope<> &lets) { internal_assert(e.type().lanes() % 2 == 0); - return deinterleave(e, 0, 2, (e.type().lanes() + 1) / 2, lets); + return deinterleave(e, 0, 2, e.type().lanes() / 2, lets); } Expr extract_mod3_lanes(const Expr &e, int lane, const Scope<> &lets) { internal_assert(e.type().lanes() % 3 == 0); - return deinterleave(e, lane, 3, (e.type().lanes() + 2) / 3, lets); + return deinterleave(e, lane, 3, e.type().lanes() / 3, lets); } } // namespace diff --git a/src/IR.cpp b/src/IR.cpp index c844c672656a..006da1b87e80 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -12,7 +12,7 @@ namespace Internal { Expr Cast::make(Type t, Expr v) { internal_assert(v.defined()) << "Cast of undefined\n"; - internal_assert(t.lanes() == v.type().lanes()) << "Cast may not change vector widths\n"; + internal_assert(t.lanes() == v.type().lanes()) << "Cast may not change vector widths: " << v << " of type " << v.type() << " cannot be cast to " << t << "\n"; Cast *node = new Cast; node->type = t; @@ -281,7 +281,7 @@ Expr Ramp::make(Expr base, Expr stride, int lanes) { Expr Broadcast::make(Expr value, int lanes) { internal_assert(value.defined()) << "Broadcast of undefined\n"; - internal_assert(lanes != 1) << "Broadcast of lanes 1\n"; + internal_assert(lanes != 1) << "Broadcast over 1 lane is not a broadcast\n"; Broadcast *node = new Broadcast; node->type = value.type().with_lanes(lanes * value.type().lanes()); diff --git a/src/IROperator.h b/src/IROperator.h index d6d33a1cf82e..527015770093 100644 --- a/src/IROperator.h +++ b/src/IROperator.h @@ -1278,7 +1278,8 @@ Expr random_int(Expr seed = Expr()); /** Create an Expr that prints out its value whenever it is * evaluated. It also prints out everything else in the arguments - * list, separated by spaces. This can include string literals. */ + * list, separated by spaces. 
This can include string literals. + * Evaluates to the first argument passed. */ //@{ Expr print(const std::vector<Expr> &values); diff --git a/src/LegalizeVectors.cpp b/src/LegalizeVectors.cpp new file mode 100644 index 000000000000..07be6d438354 --- /dev/null +++ b/src/LegalizeVectors.cpp @@ -0,0 +1,592 @@ +#include "LegalizeVectors.h" +#include "CSE.h" +#include "Deinterleave.h" +#include "DeviceInterface.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" +#include "Util.h" + +#include <cstdlib> + +namespace Halide { namespace Internal { namespace { using namespace std; const char *legalization_error_guide = "\n(This issue can most likely be resolved by reducing lane count for vectorize() calls in the schedule, or disabling it.)"; int max_lanes_for_device(DeviceAPI api, int parent_max_lanes) { std::string envvar = Halide::Internal::get_env_variable("HL_FORCE_VECTOR_LEGALIZATION"); if (!envvar.empty()) { return std::atoi(envvar.c_str()); } switch (api) { case DeviceAPI::Metal: case DeviceAPI::WebGPU: case DeviceAPI::Vulkan: case DeviceAPI::D3D12Compute: return 4; case DeviceAPI::OpenCL: return 16; case DeviceAPI::CUDA: case DeviceAPI::Hexagon: case DeviceAPI::HexagonDma: case DeviceAPI::Host: return 0; // No max: LLVM based legalization case DeviceAPI::None: return parent_max_lanes; case DeviceAPI::Default_GPU: internal_error << "No GPU API was selected."; return 0; } internal_error << "Unknown Device API"; return 0; } std::string vec_name(const string &name, int lane_start, int lane_count) { return name + ".lanes_" + std::to_string(lane_start) + "_" + std::to_string(lane_start + lane_count - 1); } class LiftLetToLetStmt : public IRMutator { using IRMutator::visit; vector<const Let *> lets; Expr visit(const Let *op) override { for (const Let *existing : lets) { internal_assert(existing->name != op->name) << "Let " << op->name << " = ...
cannot be lifted to LetStmt because the name is not unique."; } lets.push_back(op); return mutate(op->body); } public: Stmt mutate(const Stmt &s) override { ScopedValue scoped_lets(lets, {}); Stmt mutated = IRMutator::mutate(s); for (const Let *let : reverse_view(lets)) { mutated = LetStmt::make(let->name, let->value, mutated); } return mutated; } Expr mutate(const Expr &e) override { return IRMutator::mutate(e); } }; class ExtractLanes : public IRMutator { using IRMutator::visit; int lane_start; int lane_count; Expr extract_lanes_from_make_struct(const Call *op) { internal_assert(op); internal_assert(op->is_intrinsic(Call::make_struct)); vector<Expr> args(op->args.size()); for (int i = 0; i < int(op->args.size()); ++i) { args[i] = mutate(op->args[i]); } return Call::make(op->type, Call::make_struct, args, Call::Intrinsic); } Expr extract_lanes_trace(const Call *op) { auto event = as_const_int(op->args[6]); internal_assert(event); if (*event == halide_trace_load || *event == halide_trace_store) { debug(3) << "Extracting Trace Lanes: " << Expr(op) << "\n"; const Expr &func = op->args[0]; Expr values = extract_lanes_from_make_struct(op->args[1].as<Call>()); Expr coords = extract_lanes_from_make_struct(op->args[2].as<Call>()); const Expr &type_code = op->args[3]; const Expr &type_bits = op->args[4]; int type_lanes = *as_const_int(op->args[5]); const Expr &event = op->args[6]; const Expr &parent_id = op->args[7]; const Expr &idx = op->args[8]; int size = *as_const_int(op->args[9]); const Expr &tag = op->args[10]; int num_vecs = op->args[2].as<Call>()->args.size(); internal_assert(size == type_lanes * num_vecs) << Expr(op); vector<Expr> args = { func, values, coords, type_code, type_bits, Expr(lane_count), event, parent_id, idx, Expr(lane_count * num_vecs), tag}; Expr result = Call::make(Int(32), Call::trace, args, Call::Extern); debug(4) << " => " << result << "\n"; return result; } internal_error << "Unhandled trace call in LegalizeVectors' ExtractLanes: " << *event << "\n" << "Please report this error on GitHub."
<< legalization_error_guide; return Expr(0); } Expr visit(const Shuffle *op) override { vector<int> new_indices; new_indices.reserve(lane_count); for (int i = 0; i < lane_count; ++i) { new_indices.push_back(op->indices[lane_start + i]); } return simplify(Shuffle::make(op->vectors, new_indices)); } Expr visit(const Ramp *op) override { if (lane_count == 1) { return simplify(op->base + op->stride * lane_start); } return simplify(Ramp::make(op->base + op->stride * lane_start, op->stride, lane_count)); } Expr visit(const Broadcast *op) override { Expr value = op->value; if (const Call *call = op->value.as<Call>()) { if (call->name == Call::trace) { value = extract_lanes_trace(call); } } if (lane_count == 1) { return value; } else { return Broadcast::make(value, lane_count); } } Expr visit(const Variable *op) override { return Variable::make(op->type.with_lanes(lane_count), vec_name(op->name, lane_start, lane_count)); } Expr visit(const Load *op) override { return Load::make(op->type.with_lanes(lane_count), op->name, mutate(op->index), op->image, op->param, mutate(op->predicate), op->alignment + lane_start); } Expr visit(const Call *op) override { internal_assert(op->type.lanes() >= lane_start + lane_count); Expr mutated = op; std::vector<Expr> args; args.reserve(op->args.size()); for (int i = 0; i < int(op->args.size()); ++i) { const Expr &arg = op->args[i]; internal_assert(arg.type().lanes() == op->type.lanes()) << "Call argument " << arg << " lane count of " << arg.type().lanes() << " does not match op lane count of " << op->type.lanes(); Expr mutated_arg = mutate(arg); internal_assert(!mutated_arg.same_as(arg)); args.push_back(mutated_arg); } mutated = Call::make(op->type.with_lanes(lane_count), op->name, args, op->call_type); return mutated; } Expr visit(const Cast *op) override { return Cast::make(op->type.with_lanes(lane_count), mutate(op->value)); } Expr visit(const Reinterpret *op) override { Type result_type = op->type.with_lanes(lane_count); int result_scalar_bits = op->type.element_of().bits(); int input_scalar_bits = op->value.type().element_of().bits(); Expr value = op->value; // If the bit widths of the scalar elements are the same, it's easy. if (result_scalar_bits == input_scalar_bits) { value = mutate(value); } else { // Otherwise, there can be two limiting aspects: the input lane count and the resulting lane count. // In order to construct a correct Reinterpret from a small type to a wider type, we // will need to produce multiple Reinterprets, all able to hold the lane count of the input, // and concatenate the results together. // Even worse, reinterpreting uint8x8 to uint64 would require intermediate reinterprets // if the maximum legal vector length is 4.
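+ // (One possible lowering, assuming a maximum of 4 lanes: reinterpret each uint8x4 half to a uint32 scalar, concatenate the two results into a uint32x2, and reinterpret that to the final uint64.)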
+ // + // TODO implement this for all scenarios internal_error << "Vector legalization for Reinterpret to different bit size per element is " << "not supported yet: reinterpret<" << op->type << ">(" << value.type() << ")" << legalization_error_guide; // int input_lane_start = lane_start * result_scalar_bits / input_scalar_bits; // int input_lane_count = lane_count * result_scalar_bits / input_scalar_bits; } Expr result = Reinterpret::make(result_type, value); debug(3) << "Legalized " << Expr(op) << " to " << result << "\n"; return result; } Expr visit(const VectorReduce *op) override { internal_assert(op->type.lanes() >= lane_start + lane_count); int vecs_per_reduction = op->value.type().lanes() / op->type.lanes(); int input_lane_start = vecs_per_reduction * lane_start; int input_lane_count = vecs_per_reduction * lane_count; Expr arg = ExtractLanes(input_lane_start, input_lane_count).mutate(op->value); // This might fail if the extracted lanes reference a non-existing variable! return VectorReduce::make(op->op, arg, lane_count); } public: // Small helper to assert the transform did what it's supposed to do. Expr mutate(const Expr &e) override { Type original_type = e.type(); internal_assert(original_type.lanes() >= lane_start + lane_count) << "Cannot extract lanes " << lane_start << " through " << lane_start + lane_count - 1 << " when the input type is " << original_type; Expr result = IRMutator::mutate(e); Type new_type = result.type(); internal_assert(new_type.lanes() == lane_count) << "We didn't correctly legalize " << e << " of type " << original_type << ".\n" << "Got back: " << result << " of type " << new_type << ", expected " << lane_count << " lanes."; return result; } Stmt mutate(const Stmt &s) override { return IRMutator::mutate(s); } ExtractLanes(int start, int count) : lane_start(start), lane_count(count) { } }; class LiftExceedingVectors : public IRMutator { using IRMutator::visit; int max_lanes; vector<pair<string, Expr>> lets; bool just_in_let_definition{false}; Expr visit(const Let *op) override { internal_error << "We don't want to process Lets. They should have all been converted to LetStmts."; return IRMutator::visit(op); } Stmt visit(const LetStmt *op) override { just_in_let_definition = true; Expr def = mutate(op->value); just_in_let_definition = false; Stmt body = mutate(op->body); if (def.same_as(op->value) && body.same_as(op->body)) { return op; } return LetStmt::make(op->name, std::move(def), std::move(body)); } Expr visit(const Call *op) override { // Custom handling of Call, to prevent certain things from being extracted out // of the call arguments, as that's not always allowed. bool exceeds_lanecount = op->type.lanes() > max_lanes; Expr mutated = op; if (exceeds_lanecount) { std::vector<Expr> args; args.reserve(op->args.size()); bool changed = false; for (int i = 0; i < int(op->args.size()); ++i) { bool may_extract = true; if (op->is_intrinsic(Call::require)) { // Call::require is special: it behaves a little like if-then-else: // it runs the 3rd argument (the error handling part) only when there // is an error. Extracting that would unconditionally print the error. may_extract &= i < 2; } if (op->is_intrinsic(Call::if_then_else)) { // Only allow the condition to be extracted.
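+ // (The value arguments of if_then_else are only evaluated where the condition holds; hoisting them into an unconditional let would force their evaluation.)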
+ may_extract &= i == 0; } const Expr &arg = op->args[i]; if (may_extract) { internal_assert(arg.type().lanes() == op->type.lanes()); Expr mutated_arg = mutate(arg); if (!mutated_arg.same_as(arg)) { changed = true; } args.push_back(mutated_arg); } else { args.push_back(arg); } } if (!changed) { return op; } mutated = Call::make(op->type, op->name, args, op->call_type); } else { mutated = IRMutator::visit(op); } return mutated; } public: Stmt mutate(const Stmt &s) override { ScopedValue scoped_lets(lets, {}); just_in_let_definition = false; Stmt mutated = IRMutator::mutate(s); for (auto &let : reverse_view(lets)) { // We do not recurse into let.second; this is handled by repeatedly calling this transform. mutated = LetStmt::make(let.first, let.second, mutated); } return mutated; } Expr mutate(const Expr &e) override { bool exceeds_lanecount = e.type().lanes() > max_lanes; if (exceeds_lanecount) { bool should_extract = false; should_extract |= e.node_type() == IRNodeType::Shuffle; should_extract |= e.node_type() == IRNodeType::VectorReduce; should_extract &= !just_in_let_definition; debug((should_extract ? 3 : 4)) << "Max lanes (" << max_lanes << ") exceeded (" << e.type().lanes() << ") by: " << e << "\n"; if (should_extract) { std::string name = unique_name('t'); Expr var = Variable::make(e.type(), name); lets.emplace_back(name, e); debug(3) << " => Lifted out into " << name << "\n"; return var; } } just_in_let_definition = false; return IRMutator::mutate(e); } LiftExceedingVectors(int max_lanes) : max_lanes(max_lanes) { internal_assert(max_lanes != 0) << "LiftExceedingVectors should not be called when there is no lane limit."; } }; class LegalizeVectors : public IRMutator { using IRMutator::visit; int max_lanes; Stmt visit(const LetStmt *op) override { bool exceeds_lanecount = op->value.type().lanes() > max_lanes; if (exceeds_lanecount) { int num_vecs = (op->value.type().lanes() + max_lanes - 1) / max_lanes; debug(3) << "Legalize let " << op->value.type() << ": " << op->name << " = " << op->value << " into " << num_vecs << " vecs\n"; Stmt body = IRMutator::mutate(op->body); for (int i = num_vecs - 1; i >= 0; --i) { int lane_start = i * max_lanes; int lane_count_for_vec = std::min(op->value.type().lanes() - lane_start, max_lanes); std::string name = vec_name(op->name, lane_start, lane_count_for_vec); Expr value = mutate(ExtractLanes(lane_start, lane_count_for_vec).mutate(op->value)); debug(3) << " Add: let " << name << " = " << value << "\n"; body = LetStmt::make(name, value, body); } return body; } else { return IRMutator::visit(op); } } Expr visit(const Let *op) override { internal_error << "Lets should have been lifted into LetStmts."; return IRMutator::visit(op); } Stmt visit(const Store *op) override { bool exceeds_lanecount = op->index.type().lanes() > max_lanes; if (exceeds_lanecount) { // Split up into multiple stores int num_vecs = (op->index.type().lanes() + max_lanes - 1) / max_lanes; std::vector<Stmt> assignments; assignments.reserve(num_vecs); for (int i = 0; i < num_vecs; ++i) { int lane_start = i * max_lanes; int lane_count_for_vec = std::min(op->value.type().lanes() - lane_start, max_lanes); Expr rhs = ExtractLanes(lane_start, lane_count_for_vec).mutate(op->value); Expr index = ExtractLanes(lane_start, lane_count_for_vec).mutate(op->index); Expr predicate =
ExtractLanes(lane_start, lane_count_for_vec).mutate(op->predicate); assignments.push_back(Store::make( op->name, std::move(rhs), std::move(index), op->param, std::move(predicate), op->alignment + lane_start)); } Stmt result = Block::make(assignments); debug(3) << "Legalized store " << Stmt(op) << " => " << result << "\n"; return result; } return IRMutator::visit(op); } Expr visit(const Shuffle *op) override { internal_assert(op->type.lanes() <= max_lanes) << Expr(op); bool requires_mutation = false; for (const auto &vec : op->vectors) { if (vec.type().lanes() > max_lanes) { requires_mutation = true; break; } } if (requires_mutation) { debug(4) << "Legalizing Shuffle " << Expr(op) << "\n"; // We are dealing with a shuffle of an exceeding-lane-count vector argument. // We can assume the variable here has extracted lane variables in surrounding Lets. // So let's hope it's a simple case, and we can legalize. vector<Expr> new_vectors; vector<pair<int, int>> vector_and_lane_indices = op->vector_and_lane_indices(); for (int i = 0; i < int(op->vectors.size()); ++i) { const Expr &vec = op->vectors[i]; if (vec.type().lanes() > max_lanes) { debug(4) << " Arg " << i << ": " << vec << "\n"; int num_vecs = (vec.type().lanes() + max_lanes - 1) / max_lanes; for (int j = 0; j < num_vecs; j++) { int lane_start = j * max_lanes; int lane_count_for_vec = std::min(vec.type().lanes() - lane_start, max_lanes); new_vectors.push_back(ExtractLanes(lane_start, lane_count_for_vec).mutate(vec)); } } else { new_vectors.push_back(IRMutator::mutate(vec)); } } Expr result = simplify(Shuffle::make(new_vectors, op->indices)); debug(3) << "Legalized " << Expr(op) << " => " << result << "\n"; return result; } return IRMutator::visit(op); } Expr visit(const VectorReduce *op) override { const Expr &arg = op->value; if (arg.type().lanes() > max_lanes) { // TODO: The transformation below is not allowed under strict_float, but // I don't immediately know what to do here. // This should be an internal_assert.
+ + internal_assert(op->type.lanes() == 1) + << "Vector legalization currently does not support VectorReduce with lanes != 1: " << Expr(op) + << legalization_error_guide; + int num_vecs = (arg.type().lanes() + max_lanes - 1) / max_lanes; + Expr result; + for (int i = 0; i < num_vecs; i++) { + int lane_start = i * max_lanes; + int lane_count_for_vec = std::min(arg.type().lanes() - lane_start, max_lanes); + Expr partial_arg = mutate(ExtractLanes(lane_start, lane_count_for_vec).mutate(arg)); + Expr partial_red = VectorReduce::make(op->op, std::move(partial_arg), op->type.lanes()); + if (i == 0) { + result = partial_red; + } else { + switch (op->op) { + case VectorReduce::Add: + result = result + partial_red; + break; + case VectorReduce::SaturatingAdd: + result = saturating_add(result, partial_red); + break; + case VectorReduce::Mul: + result = result * partial_red; + break; + case VectorReduce::Min: + result = min(result, partial_red); + break; + case VectorReduce::Max: + result = max(result, partial_red); + break; + case VectorReduce::And: + result = result && partial_red; + break; + case VectorReduce::Or: + result = result || partial_red; + break; + } + } + } + return result; + } else { + return IRMutator::visit(op); + } + } + +public: + LegalizeVectors(int max_lanes) + : max_lanes(max_lanes) { + internal_assert(max_lanes != 0) << "LegalizeVectors should not be called when there is no lane limit."; + } +}; + +} // namespace + +Stmt legalize_vectors_in_device_loop(const For *op) { + int max_lanes = max_lanes_for_device(op->device_api, 0); + + // Similar to CSE, lifting out stuff into variables. + // Pass 1): lift out Shuffles that exceed lane count into variables + // Pass 2): Rewrite those vector variables as bundles of vector variables, while legalizing all other stuff. 
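+ // For example, with max_lanes == 4, a hypothetical 8-lane binding + // let t = concat(a, b) + // is rewritten into the 4-lane bindings + // let t.lanes_0_3 = a + // let t.lanes_4_7 = b + // after which uses of t are rewritten chunk by chunk (see vec_name above).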
+ Stmt m0 = simplify(op->body); Stmt m1 = common_subexpression_elimination(m0, false); if (!m1.same_as(op->body)) { debug(3) << "After CSE:\n" << m1 << "\n"; } Stmt m2 = LiftLetToLetStmt().mutate(m1); if (!m2.same_as(m1)) { debug(3) << "After lifting Lets to LetStmts:\n" << m2 << "\n"; } Stmt m3 = m2; while (true) { Stmt m = LiftExceedingVectors(max_lanes).mutate(m3); bool modified = !m3.same_as(m); m3 = std::move(m); if (!modified) { debug(3) << "Nothing got lifted out\n"; break; } else { debug(3) << "After lifting exceeding vectors:\n" << m3 << "\n"; } } Stmt m4 = LegalizeVectors(max_lanes).mutate(m3); if (!m4.same_as(m3)) { debug(3) << "After legalizing vectors:\n" << m4 << "\n"; } if (m4.same_as(m2)) { debug(3) << "Vector legalization did nothing, returning input.\n"; return op; } Stmt m5 = simplify(m4); if (!m4.same_as(m5)) { debug(3) << "After simplify:\n" << m5 << "\n"; } return For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, m5); } Stmt legalize_vectors(const Stmt &s) { class LegalizeDeviceLoops : public IRMutator { using IRMutator::visit; Stmt visit(const For *op) override { if (max_lanes_for_device(op->device_api, 0)) { return legalize_vectors_in_device_loop(op); } else { return IRMutator::visit(op); } } } mutator; return mutator.mutate(s); } } // namespace Internal } // namespace Halide diff --git a/src/LegalizeVectors.h b/src/LegalizeVectors.h new file mode 100644 index 000000000000..14fe8d806fb1 --- /dev/null +++ b/src/LegalizeVectors.h @@ -0,0 +1,19 @@ +#ifndef HALIDE_INTERNAL_LEGALIZE_VECTORS_H +#define HALIDE_INTERNAL_LEGALIZE_VECTORS_H + +#include "Expr.h" + +/** \file * Defines a lowering pass that legalizes vectorized expressions * so that they do not exceed a device-specific maximum lane count.
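* For example, Metal, Vulkan, WebGPU and D3D12Compute only support vectors of up to 4 lanes, so a 32-lane vectorized expression inside a loop marked for one of those device APIs is rewritten into 4-lane chunks.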
+ */ + +namespace Halide { +namespace Internal { + +Stmt legalize_vectors(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/Lower.cpp b/src/Lower.cpp index fcbc66747242..c5503166b418 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -42,6 +42,7 @@ #include "InjectHostDevBufferCopies.h" #include "Inline.h" #include "LICM.h" +#include "LegalizeVectors.h" #include "LoopCarry.h" #include "LowerParallelTasks.h" #include "LowerWarpShuffles.h" @@ -444,6 +445,10 @@ void lower_impl(const vector &output_funcs, s = flatten_nested_ramps(s); log("Lowering after flattening nested ramps:", s); + debug(1) << "Legalizing vectors...\n"; + s = legalize_vectors(s); + log("Lowering after legalizing vectors:", s); + debug(1) << "Removing dead allocations and moving loop invariant code...\n"; s = remove_dead_allocations(s); s = simplify(s); diff --git a/src/Simplify_Let.cpp b/src/Simplify_Let.cpp index c7d598d5cf71..3f8a9b45538d 100644 --- a/src/Simplify_Let.cpp +++ b/src/Simplify_Let.cpp @@ -98,7 +98,7 @@ Body Simplify::simplify_let(const LetOrLetStmt *op, ExprInfo *info) { Expr new_var = Variable::make(f.new_value.type(), f.new_name); Expr replacement = new_var; - debug(4) << "simplify let " << op->name << " = " << f.value << " in...\n"; + debug(4) << "simplify let " << op->name << " = (" << f.value.type() << ") " << f.value << " in...\n"; while (true) { const Variable *var = f.new_value.template as(); @@ -180,6 +180,16 @@ Body Simplify::simplify_let(const LetOrLetStmt *op, ExprInfo *info) { f.new_value = cast->value; new_var = Variable::make(f.new_value.type(), f.new_name); replacement = substitute(f.new_name, Cast::make(cast->type, new_var), replacement); + } else if (shuffle && shuffle->is_concat() && is_pure(shuffle)) { + // Substitute in all concatenates as they will likely simplify + // with other shuffles. + // As the structure of this while loop makes it hard to peel off + // pure operations from _all_ arguments to the Shuffle, we will + // instead substitute all of the vars that go in the shuffle, and + // instead guard against side effects by checking with `is_pure()`. + replacement = substitute(f.new_name, shuffle, replacement); + f.new_value = Expr(); + break; } else if (shuffle && shuffle->is_slice()) { // Replacing new_value below might free the shuffle // indices vector, so save them now. diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index aecb4c6fc99a..5c84cea8d195 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -5,6 +5,7 @@ namespace Halide { namespace Internal { +using std::pair; using std::vector; Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { @@ -25,9 +26,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { } } - // Mutate the vectors vector new_vectors; + vector new_indices = op->indices; bool changed = false; + + // Mutate the vectors for (const Expr &vector : op->vectors) { ExprInfo v_info; Expr new_vector = mutate(vector, &v_info); @@ -45,48 +48,151 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { new_vectors.push_back(new_vector); } - // Try to convert a load with shuffled indices into a - // shuffle of a dense load. + // A concat of one vector, is just the vector. + // (Early check, this is repeated below, once the argument list is potentially reduced) + if (op->vectors.size() == 1 && op->is_concat()) { + return new_vectors[0]; + } + + Expr result = op; + + // Analyze which input vectors are actually used. 
We will rewrite + // the vector of inputs and the indices jointly, and continue with + // those below. + { + vector arg_used(new_vectors.size()); + // Figure out if all extracted lanes come from 1 component. + vector> src_vec_and_lane_idx = op->vector_and_lane_indices(); + for (int i = 0; i < int(op->indices.size()); ++i) { + arg_used[src_vec_and_lane_idx[i].first] = true; + } + size_t num_args_used = 0; + for (bool used : arg_used) { + if (used) { + num_args_used++; + } + } + + if (num_args_used < op->vectors.size()) { + // Not all arguments to the shuffle are used by the indices. + // Let's throw them out. + for (int vi = arg_used.size() - 1; vi >= 0; --vi) { + if (!arg_used[vi]) { + int lanes_deleted = op->vectors[vi].type().lanes(); + int vector_start_lane = 0; + for (int i = 0; i < vi; ++i) { + vector_start_lane += op->vectors[i].type().lanes(); + } + for (int &new_index : new_indices) { + if (new_index > vector_start_lane) { + internal_assert(new_index >= vector_start_lane + lanes_deleted); + new_index -= lanes_deleted; + } + } + new_vectors.erase(new_vectors.begin() + vi); + } + } + + changed = true; + } + } + + // Replace the op with the intermediate simplified result (if it changed), and continue. + if (changed) { + result = Shuffle::make(new_vectors, new_indices); + op = result.as(); + changed = false; + } + + if (new_vectors.size() == 1) { + const Ramp *ramp = new_vectors[0].as(); + if (ramp && op->is_slice()) { + int first_lane_in_src = op->indices[0]; + int slice_stride = op->slice_stride(); + if (slice_stride >= 1) { + return mutate(Ramp::make(ramp->base + first_lane_in_src * ramp->stride, + ramp->stride * slice_stride, + op->indices.size()), + nullptr); + } + } + + // Test this again, but now after new_vectors got potentially shorter. + if (op->is_concat()) { + return new_vectors[0]; + } + } + + // Try to convert a Shuffle of Loads into a single Load of a Ramp. + // Make sure to not undo the work of the StageStridedLoads pass: + // only if the result of the shuffled indices is a *dense* ramp, we + // can proceed. There are two side cases: concatenations of scalars, + // and when the loads weren't dense to begin with. if (const Load *first_load = new_vectors[0].as()) { vector load_predicates; vector load_indices; + bool all_loads_are_dense = true; bool unpredicated = true; + bool concat_of_scalars = true; for (const Expr &e : new_vectors) { const Load *load = e.as(); if (load && load->name == first_load->name) { load_predicates.push_back(load->predicate); load_indices.push_back(load->index); unpredicated = unpredicated && is_const_one(load->predicate); + if (const Ramp *index_ramp = load->index.as()) { + if (!is_const_one(index_ramp->stride)) { + all_loads_are_dense = false; + } + } else if (!load->index.type().is_scalar()) { + all_loads_are_dense = false; + } + if (!load->index.type().is_scalar()) { + concat_of_scalars = false; + } } else { break; } } + debug(3) << "Shuffle of Load found: " << result << " where" + << " all_loads_are_dense=" << all_loads_are_dense << "," + << " concat_of_scalars=" << concat_of_scalars << "\n"; + if (load_indices.size() == new_vectors.size()) { + // All of the Shuffle arguments are Loads. 
Type t = load_indices[0].type().with_lanes(op->indices.size()); Expr shuffled_index = Shuffle::make(load_indices, op->indices); + debug(3) << " Shuffled index: " << shuffled_index << "\n"; ExprInfo shuffled_index_info; shuffled_index = mutate(shuffled_index, &shuffled_index_info); - if (shuffled_index.as<Ramp>()) { - ExprInfo base_info; - if (const Ramp *r = shuffled_index.as<Ramp>()) { - mutate(r->base, &base_info); - } + debug(3) << " Simplified shuffled index: " << shuffled_index << "\n"; + if (const Ramp *index_ramp = shuffled_index.as<Ramp>()) { + if (is_const_one(index_ramp->stride) || !all_loads_are_dense || concat_of_scalars) { + ExprInfo base_info; + mutate(index_ramp->base, &base_info); - ModulusRemainder alignment = - ModulusRemainder::intersect(base_info.alignment, shuffled_index_info.alignment); + ModulusRemainder alignment = + ModulusRemainder::intersect(base_info.alignment, shuffled_index_info.alignment); - Expr shuffled_predicate; - if (unpredicated) { - shuffled_predicate = const_true(t.lanes(), nullptr); - } else { - shuffled_predicate = Shuffle::make(load_predicates, op->indices); - shuffled_predicate = mutate(shuffled_predicate, nullptr); + Expr shuffled_predicate; + if (unpredicated) { + shuffled_predicate = const_true(t.lanes(), nullptr); + } else { + shuffled_predicate = Shuffle::make(load_predicates, op->indices); + shuffled_predicate = mutate(shuffled_predicate, nullptr); + } + t = first_load->type; + t = t.with_lanes(op->indices.size()); + Expr result = Load::make(t, first_load->name, shuffled_index, first_load->image, first_load->param, shuffled_predicate, alignment); + debug(3) << " => " << result << "\n"; + return result; } - t = first_load->type; - t = t.with_lanes(op->indices.size()); - return Load::make(t, first_load->name, shuffled_index, first_load->image, first_load->param, shuffled_predicate, alignment); + } else { + // The index didn't simplify to a ramp. Leave it as a Shuffle of Loads. + // Note: don't fall through to the rules below. + return result; } } } @@ -256,6 +362,14 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { } } + for (size_t i = 0; i < new_vectors.size() && can_collapse; i++) { + if (new_vectors[i].as<Load>()) { + // Don't create a Ramp of a Load, like: + // ramp(buf[x], buf[x + 1] - buf[x], ...) + can_collapse = false; + } + } + if (can_collapse) { return Ramp::make(new_vectors[0], stride, op->indices.size()); } @@ -289,13 +403,18 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { if (inner_shuffle->is_concat()) { int slice_min = op->indices.front(); int slice_max = op->indices.back(); + if (slice_min > slice_max) { + // Slices can go backward. + std::swap(slice_min, slice_max); + } int concat_index = 0; int new_slice_start = -1; vector<Expr> new_concat_vectors; for (const auto &v : inner_shuffle->vectors) { // Check if current concat vector overlaps with slice.
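+ // (The vector occupying lanes [concat_index, concat_index + lanes - 1] overlaps the slice [slice_min, slice_max] iff the max of the minima <= the min of the maxima, which is what the rewrite below computes.)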
- if ((concat_index >= slice_min && concat_index <= slice_max) || - ((concat_index + v.type().lanes() - 1) >= slice_min && (concat_index + v.type().lanes() - 1) <= slice_max)) { + int overlap_max = std::min(slice_max, concat_index + v.type().lanes() - 1); + int overlap_min = std::max(slice_min, concat_index); + if (overlap_min <= overlap_max) { if (new_slice_start < 0) { new_slice_start = concat_index; } @@ -305,17 +424,16 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { concat_index += v.type().lanes(); } if (new_concat_vectors.size() < inner_shuffle->vectors.size()) { - return Shuffle::make_slice(Shuffle::make_concat(new_concat_vectors), op->slice_begin() - new_slice_start, op->slice_stride(), op->indices.size()); + return Shuffle::make_slice(Shuffle::make_concat(new_concat_vectors), + op->slice_begin() - new_slice_start, + op->slice_stride(), + op->indices.size()); } } } } - if (!changed) { - return op; - } else { - return Shuffle::make(new_vectors, op->indices); - } + return result; } } // namespace Internal diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 2d149adbaf20..fc6fd9531983 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -732,8 +732,8 @@ class VectorSubs : public IRMutator { if (op->is_intrinsic(Call::prefetch)) { // We don't want prefetch args to ve vectorized, but we can't just skip the mutation - // (otherwise we can end up with dead loop variables. Instead, use extract_lane() on each arg - // to scalarize it again. + // (otherwise we can end up with dead loop variables). Instead, use extract_lane() on + // each arg to scalarize it again. for (auto &arg : new_args) { if (arg.type().is_vector()) { arg = extract_lane(arg, 0); diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index aeef545385cc..ef6376a58f7d 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -279,6 +279,8 @@ const char *vk_get_error_name(VkResult error) { return "VK_ERROR_FORMAT_NOT_SUPPORTED"; case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + case VK_ERROR_UNKNOWN: + return "VK_ERROR_UNKNOWN"; case VK_ERROR_SURFACE_LOST_KHR: return "VK_ERROR_SURFACE_LOST_KHR"; case VK_ERROR_NATIVE_WINDOW_IN_USE_KHR: @@ -302,6 +304,8 @@ const char *vk_get_error_name(VkResult error) { } } +#define vk_report_error(user_context, code, func) (error((user_context)) << "Vulkan: " << (func) << " returned " << vk_get_error_name((code)) << " (code: " << (code) << ") ") + // -------------------------------------------------------------------------- } // namespace diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index c5c3c6620a9f..09e532a50c48 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -85,7 +85,7 @@ int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, debug(user_context) << " vk_create_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "queue_index: " << queue_index << ")\n"; + << "queue_index: " << queue_index << ")"; #endif if (allocator == nullptr) { @@ -103,7 +103,7 @@ int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkResult result = vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create command pool!\n"; + vk_report_error(user_context, result, "vkCreateCommandPool"); return 
halide_error_code_generic_error; } return halide_error_code_success; } @@ -117,7 +117,7 @@ int vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator << "command_pool: " << (void *)command_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkResetCommandPool(allocator->current_device(), command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); @@ -135,7 +135,7 @@ int vk_create_command_buffer(void *user_context, VulkanMemoryAllocato << "command_pool: " << (void *)command_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -150,7 +150,7 @@ int vk_create_command_buffer(void *user_context, VulkanMemoryAllocato VkResult result = vkAllocateCommandBuffers(allocator->current_device(), &command_buffer_info, command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to allocate command buffers!\n"; + vk_report_error(user_context, result, "vkAllocateCommandBuffers"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -165,7 +165,7 @@ int vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocat << "command_buffer: " << (void *)command_buffer << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy command buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy command buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -231,7 +231,7 @@ int vk_fill_command_buffer_with_dispatch_call(void *user_context, VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { - error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkBeginCommandBuffer"); return halide_error_code_generic_error; } @@ -242,7 +242,7 @@ int vk_fill_command_buffer_with_dispatch_call(void *user_context, result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkEndCommandBuffer"); return halide_error_code_generic_error; } @@ -272,7 +272,7 @@ int vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer VkResult result = vkQueueSubmit(queue, 1, &submit_info, VK_NULL_HANDLE); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkQueueSubmit"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -327,7 +327,7 @@ int vk_create_descriptor_pool(void *user_context, << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor pool ...
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -364,7 +364,7 @@ int vk_create_descriptor_pool(void *user_context, VkResult result = vkCreateDescriptorPool(allocator->current_device(), &descriptor_pool_info, allocator->callbacks(), descriptor_pool); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create descriptor pool! vkCreateDescriptorPool returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateDescriptorPool"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -380,7 +380,7 @@ int vk_destroy_descriptor_pool(void *user_context, << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkDestroyDescriptorPool(allocator->current_device(), descriptor_pool, allocator->callbacks()); @@ -404,7 +404,7 @@ int vk_create_descriptor_set_layout(void *user_context, << "layout: " << (void *)layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -462,7 +462,7 @@ int vk_create_descriptor_set_layout(void *user_context, // Create the descriptor set layout VkResult result = vkCreateDescriptorSetLayout(allocator->current_device(), &layout_info, allocator->callbacks(), layout); if (result != VK_SUCCESS) { - error(user_context) << "vkCreateDescriptorSetLayout returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateDescriptorSetLayout"); return halide_error_code_generic_error; } @@ -480,7 +480,7 @@ int vk_destroy_descriptor_set_layout(void *user_context, << "layout: " << (void *)descriptor_set_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkDestroyDescriptorSetLayout(allocator->current_device(), descriptor_set_layout, allocator->callbacks()); @@ -502,7 +502,7 @@ int vk_create_descriptor_set(void *user_context, << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -517,7 +517,7 @@ int vk_create_descriptor_set(void *user_context, VkResult result = vkAllocateDescriptorSets(allocator->current_device(), &descriptor_set_info, descriptor_set); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkAllocateDescriptorSets returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkAllocateDescriptorSets"); return halide_error_code_generic_error; } @@ -543,7 +543,7 @@ int vk_update_descriptor_set(void *user_context, << "descriptor_set: " << (void *)descriptor_set << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -600,7 +600,7 @@ int vk_update_descriptor_set(void *user_context, // retrieve the buffer from the region VkBuffer *device_buffer = reinterpret_cast<VkBuffer *>(owner->handle); if (device_buffer == nullptr) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!"; return halide_error_code_internal_error; } @@ -673,7 +673,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... invalid allocator pointer!"; return nullptr; } @@ -686,7 +686,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, // allocate a new region MemoryRegion *region = allocator->reserve(user_context, request); if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!"; return nullptr; } @@ -708,19 +708,19 @@ int vk_update_scalar_uniform_buffer(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid memory region!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid memory region!"; return halide_error_code_internal_error; } // map the region to a host ptr uint8_t *host_ptr = (uint8_t *)allocator->map(user_context, region); if (host_ptr == nullptr) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!"; return halide_error_code_internal_error; } @@ -748,7 +748,7 @@ int vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator * << "scalar_args_region: " << (void *)scalar_args_region << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ...
invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -782,7 +782,7 @@ int vk_create_pipeline_layout(void *user_context, << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create pipeline layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create pipeline layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -791,7 +791,7 @@ int vk_create_pipeline_layout(void *user_context, if (descriptor_set_count > max_bound_descriptor_sets) { error(user_context) << "Vulkan: Number of descriptor sets for pipeline layout exceeds the number that can be bound by device!\n" << " requested: " << descriptor_set_count << "," - << " available: " << max_bound_descriptor_sets << "\n"; + << " available: " << max_bound_descriptor_sets; return halide_error_code_incompatible_device_interface; } } @@ -808,7 +808,7 @@ int vk_create_pipeline_layout(void *user_context, VkResult result = vkCreatePipelineLayout(allocator->current_device(), &pipeline_layout_info, allocator->callbacks(), pipeline_layout); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkCreatePipelineLayout returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreatePipelineLayout"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -826,7 +826,7 @@ int vk_destroy_pipeline_layout(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy pipeline layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy pipeline layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -848,11 +848,12 @@ int vk_create_compute_pipeline(void *user_context, debug(user_context) << " vk_create_compute_pipeline (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " + << "pipeline_name: " << pipeline_name << ", " << "shader_module: " << (void *)shader_module << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create compute pipeline ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -878,7 +879,10 @@ int vk_create_compute_pipeline(void *user_context, VkResult result = vkCreateComputePipelines(allocator->current_device(), VK_NULL_HANDLE, 1, &compute_pipeline_info, allocator->callbacks(), compute_pipeline); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create compute pipeline! vkCreateComputePipelines returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateComputePipelines") + << "failed to create compute pipeline " << pipeline_name << ".\n" + << " (This might be a bug in Halide. To debug this, see the HL_SPIRV_DUMP_FILE environment variable, and use the Khronos validator to make a bug report)"; + return halide_error_code_generic_error; } @@ -905,24 +909,24 @@ int vk_setup_compute_pipeline(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ...
invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid shader bindings!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid shader bindings!"; return halide_error_code_generic_error; } if (dispatch_data == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!"; return halide_error_code_generic_error; } VkResult result = VK_SUCCESS; const char *entry_point_name = shader_bindings->entry_point_name; if (entry_point_name == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... missing entry point name!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... missing entry point name!"; return halide_error_code_generic_error; } @@ -945,7 +949,7 @@ int vk_setup_compute_pipeline(void *user_context, } else { // dynamic allocation if (shared_mem_constant_id > 0) { - error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is suported!!\n"; + error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is supported!"; result = VK_ERROR_TOO_MANY_OBJECTS; break; } @@ -978,13 +982,13 @@ int vk_setup_compute_pipeline(void *user_context, if (static_shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of static shared memory used exceeds device limit!\n" << " requested: " << static_shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } if (dispatch_data->shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of dynamic shared memory used exceeds device limit!\n" << " requested: " << dispatch_data->shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } } @@ -1015,14 +1019,14 @@ int vk_setup_compute_pipeline(void *user_context, } } if (found_index == invalid_index) { - error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!\n"; + error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!"; result = VK_ERROR_INITIALIZATION_FAILED; } } // don't even attempt to create the pipeline layout if we encountered errors in the shader binding if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to decode shader bindings! 
" << vk_get_error_name(result); return halide_error_code_generic_error; } @@ -1050,7 +1054,7 @@ int vk_setup_compute_pipeline(void *user_context, if (shader_bindings->compute_pipeline) { int error_code = vk_destroy_compute_pipeline(user_context, allocator, shader_bindings->compute_pipeline); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to destroy compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to destroy compute pipeline!"; return halide_error_code_generic_error; } shader_bindings->compute_pipeline = VK_NULL_HANDLE; @@ -1058,7 +1062,7 @@ int vk_setup_compute_pipeline(void *user_context, int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, &specialization_info, &(shader_bindings->compute_pipeline)); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline!"; return error_code; } @@ -1068,7 +1072,7 @@ int vk_setup_compute_pipeline(void *user_context, if (shader_bindings->compute_pipeline == VK_NULL_HANDLE) { int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, nullptr, &(shader_bindings->compute_pipeline)); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline!"; return error_code; } } @@ -1088,7 +1092,7 @@ int vk_destroy_compute_pipeline(void *user_context, << "compute_pipeline: " << (void *)compute_pipeline << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy compute pipeline ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -1110,12 +1114,12 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!"; return nullptr; } if ((module_ptr == nullptr) || (module_size < (2 * sizeof(uint32_t)))) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!"; return nullptr; } @@ -1163,7 +1167,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA uint32_t idx = 1; // skip past the header_word_count uint32_t shader_count = module_ptr[idx++]; if (shader_count < 1) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!"; return nullptr; // no descriptors } @@ -1172,7 +1176,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t shader_bindings_size = shader_count * sizeof(VulkanShaderBinding); VulkanShaderBinding *shader_bindings = (VulkanShaderBinding *)vk_host_malloc(user_context, shader_bindings_size, 0, alloc_scope, allocator->callbacks()); if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to allocate shader_bindings! 
Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate shader_bindings! Out of memory!"; return nullptr; } memset(shader_bindings, 0, shader_bindings_size); @@ -1205,7 +1209,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t specialization_constants_size = specialization_constants_count * sizeof(VulkanSpecializationConstant); specialization_constants = (VulkanSpecializationConstant *)vk_host_malloc(user_context, specialization_constants_size, 0, alloc_scope, allocator->callbacks()); if (specialization_constants == nullptr) { - error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!"; return nullptr; } memset(specialization_constants, 0, specialization_constants_size); @@ -1241,7 +1245,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t shared_memory_allocations_size = shared_memory_allocations_count * sizeof(VulkanSharedMemoryAllocation); shared_memory_allocations = (VulkanSharedMemoryAllocation *)vk_host_malloc(user_context, shared_memory_allocations_size, 0, alloc_scope, allocator->callbacks()); if (shared_memory_allocations == nullptr) { - error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!"; return nullptr; } memset(shared_memory_allocations, 0, shared_memory_allocations_size); @@ -1306,7 +1310,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA #endif shader_bindings[n].entry_point_name = (char *)vk_host_malloc(user_context, entry_point_name_length * sizeof(uint32_t), 0, alloc_scope, allocator->callbacks()); if (shader_bindings[n].entry_point_name == nullptr) { - error(user_context) << "Vulkan: Failed to allocate entry_point_name! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate entry_point_name! 
Out of memory!"; return nullptr; } @@ -1358,7 +1362,7 @@ int vk_validate_shader_for_device(void *user_context, VulkanMemoryAllocator *all if (static_shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of static shared memory used exceeds device limit!\n" << " requested: " << static_shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } } @@ -1370,7 +1374,7 @@ int vk_validate_shader_for_device(void *user_context, VulkanMemoryAllocator *all if (shader_count > max_descriptors) { error(user_context) << "Vulkan: Number of required descriptor sets exceeds the amount available for device!\n" << " requested: " << shader_count << "," - << " available: " << max_descriptors << "\n"; + << " available: " << max_descriptors; return halide_error_code_incompatible_device_interface; } } @@ -1466,7 +1470,7 @@ VulkanCompilationCacheEntry *vk_compile_kernel_module(void *user_context, Vulkan // Compile the "SPIR-V Module" for the kernel cache_entry->compiled_modules[i] = vk_compile_shader_module(user_context, allocator, (const char *)spirv_ptr, (int)spirv_size); if (cache_entry->compiled_modules[i] == nullptr) { - debug(user_context) << "Vulkan: Failed to compile shader module!\n"; + debug(user_context) << "Vulkan: Failed to compile shader module!"; error_code = halide_error_code_generic_error; } @@ -1506,12 +1510,12 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!"; return nullptr; } if ((ptr == nullptr) || (size <= 0)) { - error(user_context) << "Vulkan: Failed to compile shader modules ... invalid program source buffer!\n"; + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid program source buffer!"; return nullptr; } @@ -1549,7 +1553,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; VulkanCompiledShaderModule *compiled_module = (VulkanCompiledShaderModule *)vk_host_malloc(user_context, sizeof(VulkanCompiledShaderModule), 0, alloc_scope, allocator->callbacks()); if (compiled_module == nullptr) { - error(user_context) << "Vulkan: Failed to allocate compilation cache entry! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate compilation cache entry! 
Out of memory!"; return nullptr; } memset(compiled_module, 0, sizeof(VulkanCompiledShaderModule)); @@ -1557,7 +1561,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM // decode the entry point data and extract the shader bindings VulkanShaderBinding *decoded_bindings = vk_decode_shader_bindings(user_context, allocator, module_ptr, module_size); if (decoded_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to decode shader bindings!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings!"; return nullptr; } @@ -1574,8 +1578,8 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM compiled_module->shader_count = shader_count; VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &compiled_module->shader_module); - if ((result != VK_SUCCESS)) { - error(user_context) << "Vulkan: vkCreateShaderModule Failed! Error returned: " << vk_get_error_name(result) << "\n"; + if (result != VK_SUCCESS) { + vk_report_error(user_context, result, "vkCreateShaderModule"); vk_host_free(user_context, compiled_module->shader_bindings, allocator->callbacks()); vk_host_free(user_context, compiled_module, allocator->callbacks()); return nullptr; @@ -1585,7 +1589,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM if (compiled_module->shader_count) { compiled_module->descriptor_set_layouts = (VkDescriptorSetLayout *)vk_host_malloc(user_context, compiled_module->shader_count * sizeof(VkDescriptorSetLayout), 0, alloc_scope, allocator->callbacks()); if (compiled_module->descriptor_set_layouts == nullptr) { - error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!"; return nullptr; } memset(compiled_module->descriptor_set_layouts, 0, compiled_module->shader_count * sizeof(VkDescriptorSetLayout)); @@ -1758,7 +1762,7 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff VkBuffer *src_buffer = reinterpret_cast<VkBuffer *>(c.src); VkBuffer *dst_buffer = reinterpret_cast<VkBuffer *>(c.dst); if (!src_buffer || !dst_buffer) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!"; return halide_error_code_internal_error; } @@ -1796,7 +1800,7 @@ int vk_device_crop_from_offset(void *user_context, VulkanContext ctx(user_context); if (ctx.error != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to acquire context!\n"; + error(user_context) << "Vulkan: Failed to acquire context!"; return ctx.error; } @@ -1805,21 +1809,21 @@ int vk_device_crop_from_offset(void *user_context, #endif if (offset < 0) { - error(user_context) << "Vulkan: Invalid offset for device crop!\n"; + error(user_context) << "Vulkan: Invalid offset for device crop!"; return halide_error_code_device_crop_failed; } // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast<MemoryRegion *>(src->device); if (device_region == nullptr) { - error(user_context) << "Vulkan: Failed to crop region! Invalide device region!\n"; + error(user_context) << "Vulkan: Failed to crop region! 
Invalid device region!"; return halide_error_code_device_crop_failed; } // create the croppeg region from the allocated region MemoryRegion *cropped_region = ctx.allocator->create_crop(user_context, device_region, (uint64_t)offset); if ((cropped_region == nullptr) || (cropped_region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!\n"; + error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!"; return halide_error_code_device_crop_failed; } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 4bce8789875e..fc610b7d90de 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -223,6 +223,7 @@ tests(GROUPS correctness median3x3.cpp metal_precompiled_shaders.cpp memoize_cloned.cpp + metal_long_vectors.cpp min_extent.cpp mod.cpp mul_div_mod.cpp diff --git a/test/error/metal_vector_too_large.cpp b/test/correctness/metal_long_vectors.cpp similarity index 89% rename from test/error/metal_vector_too_large.cpp rename to test/correctness/metal_long_vectors.cpp index bf4c74bb75a0..74c2e981fc2d 100644 --- a/test/error/metal_vector_too_large.cpp +++ b/test/correctness/metal_long_vectors.cpp @@ -9,7 +9,7 @@ int main(int argc, char **argv) { Var x("x"), y("y"); f(x, y) = input(x, y) + 42; - f.vectorize(x, 16).gpu_blocks(y, DeviceAPI::Metal); + f.vectorize(x, 32).gpu_blocks(y, DeviceAPI::Metal); std::string test_object = Internal::get_test_tmp_dir() + "metal_vector_too_large.o"; Target mac_target("x86-64-osx-metal"); diff --git a/test/correctness/require.cpp b/test/correctness/require.cpp index 625383f460df..58226077d971 100644 --- a/test/correctness/require.cpp +++ b/test/correctness/require.cpp @@ -9,7 +9,7 @@ void halide_error(JITUserContext *ctx, const char *msg) { // Emitting "error.*:" to stdout or stderr will cause CMake to report the // test as a failure on Windows, regardless of error code returned, // hence the abbreviation to "err". 
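// (Runtime error messages now appear to carry their own trailing newline, which is why the printf below drops its explicit "\n".)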
- printf("Saw (Expected) Halide Err: %s\n", msg); + printf("Saw (Expected) Halide Err: %s", msg); error_occurred = true; } @@ -46,14 +46,18 @@ static void test(int vector_width) { if (!error_occurred) { printf("There should have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } + printf("\n"); + p1.set(1); p2.set(kPrime1 - 1); error_occurred = false; result = f.realize({realize_width}); if (error_occurred) { - printf("There should not have been a requirement error (vector_width = %d)\n", vector_width); + printf("There should NOT have been a requirement error (vector_width = %d)\n", vector_width); exit(1); } for (int i = 0; i < realize_width; ++i) { @@ -64,6 +68,8 @@ static void test(int vector_width) { exit(1); } } + printf("OK\n"); + printf("\n"); ImageParam input(Int(32), 2); Expr h = require(p1 == p2, p1); @@ -81,8 +87,12 @@ static void test(int vector_width) { if (!error_occurred) { printf("There should have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } + printf("\n"); + p1.set(16); p2.set(16); @@ -91,6 +101,8 @@ static void test(int vector_width) { if (error_occurred) { printf("There should NOT have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } } diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 53af05c5795f..ca507ce07038 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -506,20 +506,27 @@ class SimdOpCheckTest { })); } + std::vector failed_tests; + constexpr int tabstop = 32; for (auto &f : futures) { auto result = f.get(); - constexpr int tabstop = 32; const int spaces = std::max(1, tabstop - (int)result.op.size()); std::cout << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; if (!result.error_msg.empty()) { std::cerr << result.error_msg; - // The thread-pool destructor will block until in-progress tasks - // are done, and then will discard any tasks that haven't been - // launched yet. - return false; + failed_tests.push_back(std::move(result)); } } + if (!failed_tests.empty()) { + std::cerr << "SIMD op check summary: " << failed_tests.size() << " tests failed:\n"; + for (auto &result : failed_tests) { + const int spaces = std::max(1, tabstop - (int)result.op.size()); + std::cerr << " " << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; + } + return false; + } + return true; } diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 5da8e85d8b23..241152df2342 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -54,16 +54,24 @@ class SimdOpCheckHVX : public SimdOpCheckTest { isa_version = 62; } + auto valign_test_u8 = [&](int off) { + return in_u8(x + off) + in_u8(x + off + 1); + }; + + auto valign_test_u16 = [&](int off) { + return in_u16(x + off) + in_u16(x + off + 1); + }; + // Verify that unaligned loads use the right instructions, and don't try to use // immediates of more than 3 bits. 
- check("valign(v*,v*,#7)", hvx_width / 1, in_u8(x + 7)); - check("vlalign(v*,v*,#7)", hvx_width / 1, in_u8(x + hvx_width - 7)); - check("valign(v*,v*,r*)", hvx_width / 1, in_u8(x + 8)); - check("valign(v*,v*,r*)", hvx_width / 1, in_u8(x + hvx_width - 8)); - check("valign(v*,v*,#6)", hvx_width / 1, in_u16(x + 3)); - check("vlalign(v*,v*,#6)", hvx_width / 1, in_u16(x + hvx_width - 3)); - check("valign(v*,v*,r*)", hvx_width / 1, in_u16(x + 4)); - check("valign(v*,v*,r*)", hvx_width / 1, in_u16(x + hvx_width - 4)); + check("valign(v*,v*,#7)", hvx_width / 1, valign_test_u8(6)); + check("vlalign(v*,v*,#7)", hvx_width / 1, valign_test_u8(hvx_width - 7)); + check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u8(8)); + check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u8(hvx_width - 8)); + check("valign(v*,v*,#6)", hvx_width / 1, valign_test_u16(3)); + check("vlalign(v*,v*,#6)", hvx_width / 1, valign_test_u16(hvx_width - 3)); + check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u16(4)); + check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u16(hvx_width - 4)); check("vunpack(v*.ub)", hvx_width / 1, u16(u8_1)); check("vunpack(v*.ub)", hvx_width / 1, i16(u8_1)); diff --git a/test/correctness/specialize.cpp b/test/correctness/specialize.cpp index 1a807003f72a..8df87dd27333 100644 --- a/test/correctness/specialize.cpp +++ b/test/correctness/specialize.cpp @@ -128,6 +128,11 @@ int main(int argc, char **argv) { } } + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } + // Should have used vector stores if (!vector_store || scalar_store) { printf("This was supposed to use vector stores\n"); @@ -156,6 +161,11 @@ int main(int argc, char **argv) { } } + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } + // Should have used scalar stores if (vector_store || !scalar_store) { printf("This was supposed to use scalar stores\n"); @@ -243,6 +253,10 @@ int main(int argc, char **argv) { // Check we don't crash with the small input, and that it uses scalar stores reset_trace(); f.realize({5}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return 1; @@ -254,6 +268,10 @@ int main(int argc, char **argv) { reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return 1; @@ -282,6 +300,10 @@ int main(int argc, char **argv) { // Check we used scalar stores for a strided input. 
reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return 1; @@ -293,6 +315,10 @@ int main(int argc, char **argv) { reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return 1; diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index f791385f7c25..dab19a370d93 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -10,7 +10,7 @@ class CheckForStridedLoads : public IRMutator { if (const Ramp *r = op->index.as<Ramp>()) { if (op->name == buf_name) { bool dense = is_const_one(r->stride); - found |= !dense; + found_strided_load |= !dense; dense_loads += dense; } } @@ -18,27 +18,27 @@ class CheckForStridedLoads : public IRMutator { } public: - bool found = false; + bool found_strided_load = false; int dense_loads = 0; std::string buf_name; void check(Func f, int desired_dense_loads, std::string name = "buf") { - found = false; + found_strided_load = false; dense_loads = 0; buf_name = name; f.add_custom_lowering_pass(this, nullptr); f.compile_jit(); - assert(!found); + assert(!found_strided_load); assert(dense_loads == desired_dense_loads); } void check_not(Func f, int desired_dense_loads, std::string name = "buf") { - found = false; + found_strided_load = false; dense_loads = 0; buf_name = name; f.add_custom_lowering_pass(this, nullptr); f.compile_jit(); - assert(found); + assert(found_strided_load); assert(dense_loads == desired_dense_loads); } } checker; diff --git a/test/correctness/vector_shuffle.cpp b/test/correctness/vector_shuffle.cpp index aff6fcbcddcf..f0a62ab3d8cd 100644 --- a/test/correctness/vector_shuffle.cpp +++ b/test/correctness/vector_shuffle.cpp @@ -1,10 +1,20 @@ #include "Halide.h" +#include <algorithm> +#include <cstdlib> #include <stdio.h> using namespace Halide; -int main(int argc, char **argv) { - Target target = get_jit_target_from_environment(); +int test_with_indices(const Target &target, const std::vector<int> &indices0, const std::vector<int> &indices1) { + printf("indices0:"); + for (int i : indices0) { + printf(" %d", i); + } + printf(" indices1:"); + for (int i : indices1) { + printf(" %d", i); + } + printf("\n"); Var x{"x"}, y{"y"}; Func f0{"f0"}, f1{"f1"}, g{"g"}; f1(x, y) = x * (y + 3); Expr vec1 = Internal::Shuffle::make_concat({f0(x, 0), f0(x, 1), f0(x, 2), f0(x, 3)}); Expr vec2 = Internal::Shuffle::make_concat({f1(x, 4), f1(x, 5), f1(x, 6), f1(x, 7)}); - std::vector<int> indices0; - std::vector<int> indices1; - if (!target.has_gpu_feature() || target.has_feature(Target::Feature::OpenCL) || target.has_feature(Target::Feature::CUDA)) { - indices0 = {3, 1, 6, 7, 2, 4, 0, 5}; - indices1 = {1, 0, 3, 4, 7, 0, 5, 2}; - } else { - indices0 = {3, 1, 6, 7}; - indices1 = {1, 0, 3, 4}; - } Expr shuffle1 = Internal::Shuffle::make({vec1, vec2}, indices0); Expr shuffle2 = Internal::Shuffle::make({vec1, vec2}, indices1); Expr result = shuffle1 * shuffle2; @@ -55,6 +56,94 @@ int main(int argc, char **argv) { return 1; } } + return 0; +} + +int main(int argc, char **argv) { + Target target = get_jit_target_from_environment(); + + for (int vec_size = 8; vec_size > 1; vec_size /= 2) { + printf("Testing vector size %d...\n", vec_size); + std::vector<int> indices0, 
indices1; + + // Test 1: All indices: forward/backward and combined + for (int i = 0; i < vec_size; ++i) { + indices0.push_back(i); // forward + indices1.push_back(vec_size - i - 1); // backward + } + printf(" All indices forward...\n"); + if (test_with_indices(target, indices0, indices0)) { + return 1; + } + printf(" All indices backward...\n"); + if (test_with_indices(target, indices1, indices1)) { + return 1; + } + printf(" All indices mixed forward / backward...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + // Test 2: Shuffled indices (4 repetitions) + for (int r = 0; r < 4; ++r) { + // Shuffle with Fisher-Yates + for (int i = vec_size - 1; i >= 1; --i) { + // indices0 + int idx = std::rand() % (i + 1); + std::swap(indices0[idx], indices0[i]); + // indices1 + idx = std::rand() % (i + 1); + std::swap(indices1[idx], indices1[i]); + } + printf(" Randomly shuffled...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + } + + // Test 3: Interleaved + indices0.clear(); + indices1.clear(); + for (int i = 0; i < vec_size / 2; ++i) { + // interleave (A, B) + indices0.push_back(i); + indices0.push_back(i + vec_size / 2); + + // interleave (B, A) + indices1.push_back(i + vec_size / 2); + indices1.push_back(i); + } + printf(" Interleaved...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + // Test 4: Concat (not really a concat unless vec_size == 8, since the input vectors have size 4) + indices0.clear(); + indices1.clear(); + for (int i = 0; i < vec_size; ++i) { + // concat (A, B) + indices0.push_back(i); + + // concat (B, A) + indices1.push_back((i + vec_size / 2) % vec_size); + } + printf(" Concat...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + if (vec_size == 4) { + indices0 = {1, 3, 2, 0}; + indices1 = {2, 3, 1, 0}; + + printf(" Specific index combination, known to have caused problems...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + } + } + printf("Success!\n"); return 0; } diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 41816d5ba36b..b7d6a380c504 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -79,7 +79,6 @@ tests(GROUPS error memoize_output_invalid.cpp memoize_redefine_eviction_key.cpp metal_threads_too_large.cpp - metal_vector_too_large.cpp mismatch_runtime_vscale.cpp missing_args.cpp no_default_device.cpp