diff --git a/.gitignore b/.gitignore index a08b8e8dd7f3..888235a389d8 100644 --- a/.gitignore +++ b/.gitignore @@ -240,6 +240,9 @@ xcuserdata # NeoVim + clangd .cache +# CCLS +.ccls-cache + # Emacs tags TAGS diff --git a/Makefile b/Makefile index 54c61a622ae8..a928cd9b81bb 100644 --- a/Makefile +++ b/Makefile @@ -535,6 +535,7 @@ SOURCE_FILES = \ IRVisitor.cpp \ JITModule.cpp \ Lambda.cpp \ + LegalizeVectors.cpp \ Lerp.cpp \ LICM.cpp \ LLVM_Output.cpp \ @@ -737,6 +738,7 @@ HEADER_FILES = \ WasmExecutor.h \ JITModule.h \ Lambda.h \ + LegalizeVectors.h \ Lerp.h \ LICM.h \ LLVM_Output.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index af419323b24e..cadfd608236a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -62,12 +62,14 @@ target_sources( Associativity.h AsyncProducers.h AutoScheduleUtils.h + BoundConstantExtentLoops.h + BoundSmallAllocations.h BoundaryConditions.h Bounds.h BoundsInference.h - BoundConstantExtentLoops.h - BoundSmallAllocations.h Buffer.h + CPlusPlusMangle.h + CSE.h Callable.h CanonicalizeGPUVars.h ClampUnsafeAccesses.h @@ -79,18 +81,16 @@ target_sources( CodeGen_LLVM.h CodeGen_Metal_Dev.h CodeGen_OpenCL_Dev.h - CodeGen_Posix.h CodeGen_PTX_Dev.h + CodeGen_Posix.h CodeGen_PyTorch.h CodeGen_Targets.h CodeGen_Vulkan_Dev.h CodeGen_WebGPU_Dev.h CompilerLogger.h ConciseCasts.h - CPlusPlusMangle.h ConstantBounds.h ConstantInterval.h - CSE.h Debug.h DebugArguments.h DebugToFile.h @@ -127,6 +127,13 @@ target_sources( Generator.h HexagonOffload.h HexagonOptimize.h + IR.h + IREquality.h + IRMatch.h + IRMutator.h + IROperator.h + IRPrinter.h + IRVisitor.h ImageParam.h InferArguments.h InjectHostDevBufferCopies.h @@ -135,19 +142,13 @@ target_sources( IntegerDivisionTable.h Interval.h IntrusivePtr.h - IR.h - IREquality.h - IRMatch.h - IRMutator.h - IROperator.h - IRPrinter.h - IRVisitor.h JITModule.h - Lambda.h - Lerp.h LICM.h LLVM_Output.h LLVM_Runtime_Linker.h + Lambda.h + LegalizeVectors.h + Lerp.h LoopCarry.h LoopPartitioningDirective.h Lower.h @@ -173,8 +174,8 @@ target_sources( PurifyIndexMath.h PythonExtensionGen.h Qualify.h - Random.h RDom.h + Random.h Realization.h RealizationOrder.h RebaseLoopsToZero.h @@ -320,6 +321,7 @@ target_sources( IRVisitor.cpp JITModule.cpp Lambda.cpp + LegalizeVectors.cpp Lerp.cpp LICM.cpp LLVM_Output.cpp diff --git a/src/CSE.cpp b/src/CSE.cpp index c2a46d93bc4d..6051e5e9cf62 100644 --- a/src/CSE.cpp +++ b/src/CSE.cpp @@ -33,6 +33,11 @@ bool should_extract(const Expr &e, bool lift_all) { return false; } + if (const Call *c = e.as<Call>()) { // Calls with side effects should not be moved.
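+ // (Calls to other Halide Funcs are pure by construction, so they are safe to lift as well.)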
+ return c->is_pure() || c->call_type == Call::Halide; + } + if (lift_all) { return true; } diff --git a/src/CodeGen_Hexagon.cpp b/src/CodeGen_Hexagon.cpp index 05b68447b6a4..852721e0077d 100644 --- a/src/CodeGen_Hexagon.cpp +++ b/src/CodeGen_Hexagon.cpp @@ -1186,15 +1186,16 @@ Value *CodeGen_Hexagon::shuffle_vectors(Value *a, Value *b, create_bitcast(a_call->getArgOperand(1), native_ty), create_bitcast(a_call->getArgOperand(0), native_ty), indices); } else if (ShuffleVectorInst *a_shuffle = dyn_cast<ShuffleVectorInst>(a)) { - bool is_identity = true; - for (int i = 0; i < a_elements; i++) { - int mask_i = a_shuffle->getMaskValue(i); - is_identity = is_identity && (mask_i == i || mask_i == -1); - } - if (is_identity) { - return shuffle_vectors(a_shuffle->getOperand(0), - a_shuffle->getOperand(1), indices); + std::vector<int> new_indices(indices.size()); + for (size_t i = 0; i < indices.size(); i++) { + if (indices[i] != -1) { + new_indices[i] = a_shuffle->getMaskValue(indices[i]); + } else { + new_indices[i] = -1; + } } + return shuffle_vectors(a_shuffle->getOperand(0), + a_shuffle->getOperand(1), new_indices); } } @@ -1516,7 +1517,11 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) { vector<int> i8_indices(indices.size() * replicate); for (size_t i = 0; i < indices.size(); i++) { for (int j = 0; j < replicate; j++) { - i8_indices[i * replicate + j] = indices[i] * replicate + j; + if (indices[i] == -1) { + i8_indices[i * replicate + j] = -1; // Replicate the don't-care. + } else { + i8_indices[i * replicate + j] = indices[i] * replicate + j; + } } } Value *result = vdelta(i8_lut, i8_indices); @@ -1556,6 +1561,7 @@ Value *CodeGen_Hexagon::vdelta(Value *lut, const vector<int> &indices) { Value *ret = nullptr; for (int i = 0; i < lut_elements; i += native_elements) { Value *lut_i = slice_vector(lut, i, native_elements); + internal_assert(get_vector_num_elements(lut_i->getType()) == native_elements); vector<int> indices_i(native_elements); vector<Constant *> mask(native_elements); bool all_used = true; diff --git a/src/CodeGen_LLVM.cpp b/src/CodeGen_LLVM.cpp index 32984f3f2e6f..aa58a94ec4ca 100644 --- a/src/CodeGen_LLVM.cpp +++ b/src/CodeGen_LLVM.cpp @@ -5007,10 +5007,11 @@ Value *CodeGen_LLVM::shuffle_vectors(Value *a, Value *b, } // Check for type identity *after* normalizing to fixed vectors internal_assert(a->getType() == b->getType()); + int elements_a = get_vector_num_elements(a->getType()); vector<Constant *> llvm_indices(indices.size()); for (size_t i = 0; i < llvm_indices.size(); i++) { if (indices[i] >= 0) { - internal_assert(indices[i] < get_vector_num_elements(a->getType()) * 2); + internal_assert(indices[i] < elements_a * 2) << indices[i] << " " << elements_a * 2; llvm_indices[i] = ConstantInt::get(i32_t, indices[i]); } else { // Only let -1 be undef.
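// Aside: a minimal sketch of the mask composition used by the CodeGen_Hexagon
// change above (compose_masks is a hypothetical standalone helper, not part of
// this patch). The outer index selects a lane of the inner shuffle's result,
// so the composed mask simply looks that lane up in the inner mask,
// propagating -1 ("don't care") lanes:
//
//     std::vector<int> compose_masks(const std::vector<int> &outer,
//                                    const llvm::ShuffleVectorInst *inner) {
//         std::vector<int> composed(outer.size());
//         for (size_t i = 0; i < outer.size(); i++) {
//             composed[i] = (outer[i] == -1) ? -1 : inner->getMaskValue(outer[i]);
//         }
//         return composed;
//     }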
diff --git a/src/CodeGen_Vulkan_Dev.cpp b/src/CodeGen_Vulkan_Dev.cpp index 671f923ec183..19ba6c348ff9 100644 --- a/src/CodeGen_Vulkan_Dev.cpp +++ b/src/CodeGen_Vulkan_Dev.cpp @@ -2054,31 +2054,21 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { debug(3) << "\n"; if (arg_ids.size() == 1) { - // 1 argument, just do a simple assignment via a cast SpvId result_id = cast_type(op->type, op->vectors[0].type(), arg_ids[0]); builder.update_id(result_id); } else if (arg_ids.size() == 2) { - - // 2 arguments, use a composite insert to update even and odd indices - uint32_t even_idx = 0; - uint32_t odd_idx = 1; - SpvFactory::Indices even_indices; - SpvFactory::Indices odd_indices; - for (int i = 0; i < op_lanes; ++i) { - even_indices.push_back(even_idx); - odd_indices.push_back(odd_idx); - even_idx += 2; - odd_idx += 2; + // 2 arguments, use vector-shuffle with logical indices indexing into (vec1[0], vec1[1], ..., vec2[0], vec2[1], ...) + SpvFactory::Indices logical_indices; + for (int i = 0; i < arg_lanes; ++i) { + logical_indices.push_back(uint32_t(i)); + logical_indices.push_back(uint32_t(i + arg_lanes)); } SpvId type_id = builder.declare_type(op->type); - SpvId value_id = builder.declare_null_constant(op->type); - SpvId partial_id = builder.reserve_id(SpvResultId); SpvId result_id = builder.reserve_id(SpvResultId); - builder.append(SpvFactory::composite_insert(type_id, partial_id, arg_ids[0], value_id, even_indices)); - builder.append(SpvFactory::composite_insert(type_id, result_id, arg_ids[1], partial_id, odd_indices)); + builder.append(SpvFactory::vector_shuffle(type_id, result_id, arg_ids[0], arg_ids[1], logical_indices)); builder.update_id(result_id); } else { @@ -2108,7 +2098,7 @@ void CodeGen_Vulkan_Dev::SPIRV_Emitter::visit(const Shuffle *op) { } else if (op->is_extract_element()) { int idx = op->indices[0]; internal_assert(idx >= 0); - internal_assert(idx <= op->vectors[0].type().lanes()); + internal_assert(idx < op->vectors[0].type().lanes()); if (op->vectors[0].type().is_vector()) { SpvFactory::Indices indices = {(uint32_t)idx}; SpvId type_id = builder.declare_type(op->type); diff --git a/src/Deinterleave.cpp b/src/Deinterleave.cpp index f7a5b5f49aa8..243760e9d050 100644 --- a/src/Deinterleave.cpp +++ b/src/Deinterleave.cpp @@ -299,6 +299,10 @@ class Deinterleaver : public IRGraphMutator { } else { Type t = op->type.with_lanes(new_lanes); + internal_assert((op->type.lanes() - starting_lane + lane_stride - 1) / lane_stride == new_lanes) + << "Deinterleaving with lane stride " << lane_stride << " and starting lane " << starting_lane + << " for var of Type " << op->type << " to " << t << " drops lanes unexpectedly."
+ << " Deinterleaver probably recursed too deep into types of different lane count."; if (external_lets.contains(op->name) && starting_lane == 0 && lane_stride == 2) { @@ -393,8 +397,12 @@ class Deinterleaver : public IRGraphMutator { int index = indices.front(); for (const auto &i : op->vectors) { if (index < i.type().lanes()) { - ScopedValue lane(starting_lane, index); - return mutate(i); + if (i.type().lanes() == op->type.lanes()) { + ScopedValue scoped_starting_lane(starting_lane, index); + return mutate(i); + } else { + return Shuffle::make(op->vectors, indices); + } } index -= i.type().lanes(); } @@ -406,10 +414,18 @@ class Deinterleaver : public IRGraphMutator { }; Expr deinterleave(Expr e, int starting_lane, int lane_stride, int new_lanes, const Scope<> &lets) { + debug(3) << "Deinterleave " + << "(start:" << starting_lane << ", stide:" << lane_stride << ", new_lanes:" << new_lanes << "): " + << e << " of Type: " << e.type() << "\n"; + Type original_type = e.type(); e = substitute_in_all_lets(e); Deinterleaver d(starting_lane, lane_stride, new_lanes, lets); e = d.mutate(e); e = common_subexpression_elimination(e); + Type final_type = e.type(); + int expected_lanes = (original_type.lanes() + lane_stride - starting_lane - 1) / lane_stride; + internal_assert(original_type.code() == final_type.code()) << "Underlying types not identical after interleaving."; + internal_assert(expected_lanes == final_type.lanes()) << "Number of lanes incorrect after interleaving: " << final_type.lanes() << "while expected was " << expected_lanes << "."; return simplify(e); } @@ -420,12 +436,12 @@ Expr extract_odd_lanes(const Expr &e, const Scope<> &lets) { Expr extract_even_lanes(const Expr &e, const Scope<> &lets) { internal_assert(e.type().lanes() % 2 == 0); - return deinterleave(e, 0, 2, (e.type().lanes() + 1) / 2, lets); + return deinterleave(e, 0, 2, e.type().lanes() / 2, lets); } Expr extract_mod3_lanes(const Expr &e, int lane, const Scope<> &lets) { internal_assert(e.type().lanes() % 3 == 0); - return deinterleave(e, lane, 3, (e.type().lanes() + 2) / 3, lets); + return deinterleave(e, lane, 3, e.type().lanes() / 3, lets); } } // namespace diff --git a/src/IR.cpp b/src/IR.cpp index c844c672656a..006da1b87e80 100644 --- a/src/IR.cpp +++ b/src/IR.cpp @@ -12,7 +12,7 @@ namespace Internal { Expr Cast::make(Type t, Expr v) { internal_assert(v.defined()) << "Cast of undefined\n"; - internal_assert(t.lanes() == v.type().lanes()) << "Cast may not change vector widths\n"; + internal_assert(t.lanes() == v.type().lanes()) << "Cast may not change vector widths: " << v << " of type " << v.type() << " cannot be cast to " << t << "\n"; Cast *node = new Cast; node->type = t; @@ -281,7 +281,7 @@ Expr Ramp::make(Expr base, Expr stride, int lanes) { Expr Broadcast::make(Expr value, int lanes) { internal_assert(value.defined()) << "Broadcast of undefined\n"; - internal_assert(lanes != 1) << "Broadcast of lanes 1\n"; + internal_assert(lanes != 1) << "Broadcast over 1 lane is not a broadcast\n"; Broadcast *node = new Broadcast; node->type = value.type().with_lanes(lanes * value.type().lanes()); diff --git a/src/IROperator.h b/src/IROperator.h index d6d33a1cf82e..527015770093 100644 --- a/src/IROperator.h +++ b/src/IROperator.h @@ -1278,7 +1278,8 @@ Expr random_int(Expr seed = Expr()); /** Create an Expr that prints out its value whenever it is * evaluated. It also prints out everything else in the arguments - * list, separated by spaces. This can include string literals. */ + * list, separated by spaces. 
This can include string literals. + * Evaluates to the first argument passed. */ //@{ Expr print(const std::vector<Expr> &values); diff --git a/src/LegalizeVectors.cpp b/src/LegalizeVectors.cpp new file mode 100644 index 000000000000..07be6d438354 --- /dev/null +++ b/src/LegalizeVectors.cpp @@ -0,0 +1,592 @@ +#include "LegalizeVectors.h" +#include "CSE.h" +#include "Deinterleave.h" +#include "DeviceInterface.h" +#include "IRMutator.h" +#include "IROperator.h" +#include "Simplify.h" +#include "Util.h" + +#include <cstdlib> + +namespace Halide { namespace Internal { namespace { using namespace std; const char *legalization_error_guide = "\n(This issue can most likely be resolved by reducing lane count for vectorize() calls in the schedule, or disabling it.)"; int max_lanes_for_device(DeviceAPI api, int parent_max_lanes) { std::string envvar = Halide::Internal::get_env_variable("HL_FORCE_VECTOR_LEGALIZATION"); if (!envvar.empty()) { return std::atoi(envvar.c_str()); } switch (api) { case DeviceAPI::Metal: case DeviceAPI::WebGPU: case DeviceAPI::Vulkan: case DeviceAPI::D3D12Compute: return 4; case DeviceAPI::OpenCL: return 16; case DeviceAPI::CUDA: case DeviceAPI::Hexagon: case DeviceAPI::HexagonDma: case DeviceAPI::Host: return 0; // No max: LLVM based legalization case DeviceAPI::None: return parent_max_lanes; case DeviceAPI::Default_GPU: internal_error << "No GPU API was selected."; return 0; } internal_error << "Unknown Device API"; return 0; } std::string vec_name(const string &name, int lane_start, int lane_count) { return name + ".lanes_" + std::to_string(lane_start) + "_" + std::to_string(lane_start + lane_count - 1); } class LiftLetToLetStmt : public IRMutator { using IRMutator::visit; vector<const Let *> lets; Expr visit(const Let *op) override { for (const Let *existing : lets) { internal_assert(existing->name != op->name) << "Let " << op->name << " = ...
cannot be lifted to LetStmt because the name is not unique."; } lets.push_back(op); return mutate(op->body); } public: Stmt mutate(const Stmt &s) override { ScopedValue scoped_lets(lets, {}); Stmt mutated = IRMutator::mutate(s); for (const Let *let : reverse_view(lets)) { mutated = LetStmt::make(let->name, let->value, mutated); } return mutated; } Expr mutate(const Expr &e) override { return IRMutator::mutate(e); } }; class ExtractLanes : public IRMutator { using IRMutator::visit; int lane_start; int lane_count; Expr extract_lanes_from_make_struct(const Call *op) { internal_assert(op); internal_assert(op->is_intrinsic(Call::make_struct)); vector<Expr> args(op->args.size()); for (int i = 0; i < int(op->args.size()); ++i) { args[i] = mutate(op->args[i]); } return Call::make(op->type, Call::make_struct, args, Call::Intrinsic); } Expr extract_lanes_trace(const Call *op) { auto event = as_const_int(op->args[6]); internal_assert(event); if (*event == halide_trace_load || *event == halide_trace_store) { debug(3) << "Extracting Trace Lanes: " << Expr(op) << "\n"; const Expr &func = op->args[0]; Expr values = extract_lanes_from_make_struct(op->args[1].as<Call>()); Expr coords = extract_lanes_from_make_struct(op->args[2].as<Call>()); const Expr &type_code = op->args[3]; const Expr &type_bits = op->args[4]; int type_lanes = *as_const_int(op->args[5]); const Expr &event = op->args[6]; const Expr &parent_id = op->args[7]; const Expr &idx = op->args[8]; int size = *as_const_int(op->args[9]); const Expr &tag = op->args[10]; int num_vecs = op->args[2].as<Call>()->args.size(); internal_assert(size == type_lanes * num_vecs) << Expr(op); vector<Expr> args = { func, values, coords, type_code, type_bits, Expr(lane_count), event, parent_id, idx, Expr(lane_count * num_vecs), tag}; Expr result = Call::make(Int(32), Call::trace, args, Call::Extern); debug(4) << " => " << result << "\n"; return result; } internal_error << "Unhandled trace call in LegalizeVectors' ExtractLanes: " << *event << "\n" << "Please report this error on GitHub."
<< legalization_error_guide; return Expr(0); } Expr visit(const Shuffle *op) override { vector<int> new_indices; new_indices.reserve(lane_count); for (int i = 0; i < lane_count; ++i) { new_indices.push_back(op->indices[lane_start + i]); } return simplify(Shuffle::make(op->vectors, new_indices)); } Expr visit(const Ramp *op) override { if (lane_count == 1) { return simplify(op->base + op->stride * lane_start); } return simplify(Ramp::make(op->base + op->stride * lane_start, op->stride, lane_count)); } Expr visit(const Broadcast *op) override { Expr value = op->value; if (const Call *call = op->value.as<Call>()) { if (call->name == Call::trace) { value = extract_lanes_trace(call); } } if (lane_count == 1) { return value; } else { return Broadcast::make(value, lane_count); } } Expr visit(const Variable *op) override { return Variable::make(op->type.with_lanes(lane_count), vec_name(op->name, lane_start, lane_count)); } Expr visit(const Load *op) override { return Load::make(op->type.with_lanes(lane_count), op->name, mutate(op->index), op->image, op->param, mutate(op->predicate), op->alignment + lane_start); } Expr visit(const Call *op) override { internal_assert(op->type.lanes() >= lane_start + lane_count); Expr mutated = op; std::vector<Expr> args; args.reserve(op->args.size()); for (int i = 0; i < int(op->args.size()); ++i) { const Expr &arg = op->args[i]; internal_assert(arg.type().lanes() == op->type.lanes()) << "Call argument " << arg << " lane count of " << arg.type().lanes() << " does not match op lane count of " << op->type.lanes(); Expr mutated_arg = mutate(arg); internal_assert(!mutated_arg.same_as(arg)); args.push_back(mutated_arg); } mutated = Call::make(op->type.with_lanes(lane_count), op->name, args, op->call_type); return mutated; } Expr visit(const Cast *op) override { return Cast::make(op->type.with_lanes(lane_count), mutate(op->value)); } Expr visit(const Reinterpret *op) override { Type result_type = op->type.with_lanes(lane_count); int result_scalar_bits = op->type.element_of().bits(); int input_scalar_bits = op->value.type().element_of().bits(); Expr value = op->value; // If the bit widths of the scalar elements are the same, it's easy. if (result_scalar_bits == input_scalar_bits) { value = mutate(value); } else { // Otherwise, there can be two limiting aspects: the input lane count and the resulting lane count. // In order to construct a correct Reinterpret from a small type to a wider type, we // will need to produce multiple Reinterprets, all able to hold the lane count of the input, // and concatenate the results together. // Even worse, reinterpreting uint8x8 to uint64 would require intermediate reinterprets // if the maximum legal vector length is 4.
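+ // (One possible lowering, assuming a maximum of 4 lanes: reinterpret each uint8x4 half to a uint32 scalar, concatenate the two results into a uint32x2, and reinterpret that to the final uint64.)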
+ // + // TODO implement this for all scenarios internal_error << "Vector legalization for Reinterpret to different bit size per element is " << "not supported yet: reinterpret<" << op->type << ">(" << value.type() << ")" << legalization_error_guide; // int input_lane_start = lane_start * result_scalar_bits / input_scalar_bits; // int input_lane_count = lane_count * result_scalar_bits / input_scalar_bits; } Expr result = Reinterpret::make(result_type, value); debug(3) << "Legalized " << Expr(op) << " to " << result << "\n"; return result; } Expr visit(const VectorReduce *op) override { internal_assert(op->type.lanes() >= lane_start + lane_count); int vecs_per_reduction = op->value.type().lanes() / op->type.lanes(); int input_lane_start = vecs_per_reduction * lane_start; int input_lane_count = vecs_per_reduction * lane_count; Expr arg = ExtractLanes(input_lane_start, input_lane_count).mutate(op->value); // This might fail if the extracted lanes reference a non-existing variable! return VectorReduce::make(op->op, arg, lane_count); } public: // Small helper to assert the transform did what it's supposed to do. Expr mutate(const Expr &e) override { Type original_type = e.type(); internal_assert(original_type.lanes() >= lane_start + lane_count) << "Cannot extract lanes " << lane_start << " through " << lane_start + lane_count - 1 << " when the input type is " << original_type; Expr result = IRMutator::mutate(e); Type new_type = result.type(); internal_assert(new_type.lanes() == lane_count) << "We didn't correctly legalize " << e << " of type " << original_type << ".\n" << "Got back: " << result << " of type " << new_type << ", expected " << lane_count << " lanes."; return result; } Stmt mutate(const Stmt &s) override { return IRMutator::mutate(s); } ExtractLanes(int start, int count) : lane_start(start), lane_count(count) { } }; class LiftExceedingVectors : public IRMutator { using IRMutator::visit; int max_lanes; vector<pair<string, Expr>> lets; bool just_in_let_definition{false}; Expr visit(const Let *op) override { internal_error << "We don't want to process Lets. They should have all been converted to LetStmts."; return IRMutator::visit(op); } Stmt visit(const LetStmt *op) override { just_in_let_definition = true; Expr def = mutate(op->value); just_in_let_definition = false; Stmt body = mutate(op->body); if (def.same_as(op->value) && body.same_as(op->body)) { return op; } return LetStmt::make(op->name, std::move(def), std::move(body)); } Expr visit(const Call *op) override { // Custom handling of Call, to prevent certain things from being extracted out // of the call arguments, as that's not always allowed. bool exceeds_lanecount = op->type.lanes() > max_lanes; Expr mutated = op; if (exceeds_lanecount) { std::vector<Expr> args; args.reserve(op->args.size()); bool changed = false; for (int i = 0; i < int(op->args.size()); ++i) { bool may_extract = true; if (op->is_intrinsic(Call::require)) { // Call::require is special: it behaves a little like if-then-else: // it runs the 3rd argument (the error handling part) only when there // is an error. Extracting that would unconditionally print the error. may_extract &= i < 2; } if (op->is_intrinsic(Call::if_then_else)) { // Only allow the condition to be extracted.
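+ // (The value arguments of if_then_else are only evaluated where the condition holds; hoisting them into an unconditional let would force their evaluation.)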
+ may_extract &= i == 0; } const Expr &arg = op->args[i]; if (may_extract) { internal_assert(arg.type().lanes() == op->type.lanes()); Expr mutated_arg = mutate(arg); if (!mutated_arg.same_as(arg)) { changed = true; } args.push_back(mutated_arg); } else { args.push_back(arg); } } if (!changed) { return op; } mutated = Call::make(op->type, op->name, args, op->call_type); } else { mutated = IRMutator::visit(op); } return mutated; } public: Stmt mutate(const Stmt &s) override { ScopedValue scoped_lets(lets, {}); just_in_let_definition = false; Stmt mutated = IRMutator::mutate(s); for (auto &let : reverse_view(lets)) { // We do not recurse into let.second; this is handled by repeatedly calling this transform. mutated = LetStmt::make(let.first, let.second, mutated); } return mutated; } Expr mutate(const Expr &e) override { bool exceeds_lanecount = e.type().lanes() > max_lanes; if (exceeds_lanecount) { bool should_extract = false; should_extract |= e.node_type() == IRNodeType::Shuffle; should_extract |= e.node_type() == IRNodeType::VectorReduce; should_extract &= !just_in_let_definition; debug((should_extract ? 3 : 4)) << "Max lanes (" << max_lanes << ") exceeded (" << e.type().lanes() << ") by: " << e << "\n"; if (should_extract) { std::string name = unique_name('t'); Expr var = Variable::make(e.type(), name); lets.emplace_back(name, e); debug(3) << " => Lifted out into " << name << "\n"; return var; } } just_in_let_definition = false; return IRMutator::mutate(e); } LiftExceedingVectors(int max_lanes) : max_lanes(max_lanes) { internal_assert(max_lanes != 0) << "LiftExceedingVectors should not be called when there is no lane limit."; } }; class LegalizeVectors : public IRMutator { using IRMutator::visit; int max_lanes; Stmt visit(const LetStmt *op) override { bool exceeds_lanecount = op->value.type().lanes() > max_lanes; if (exceeds_lanecount) { int num_vecs = (op->value.type().lanes() + max_lanes - 1) / max_lanes; debug(3) << "Legalize let " << op->value.type() << ": " << op->name << " = " << op->value << " into " << num_vecs << " vecs\n"; Stmt body = IRMutator::mutate(op->body); for (int i = num_vecs - 1; i >= 0; --i) { int lane_start = i * max_lanes; int lane_count_for_vec = std::min(op->value.type().lanes() - lane_start, max_lanes); std::string name = vec_name(op->name, lane_start, lane_count_for_vec); Expr value = mutate(ExtractLanes(lane_start, lane_count_for_vec).mutate(op->value)); debug(3) << " Add: let " << name << " = " << value << "\n"; body = LetStmt::make(name, value, body); } return body; } else { return IRMutator::visit(op); } } Expr visit(const Let *op) override { internal_error << "Lets should have been lifted into LetStmts."; return IRMutator::visit(op); } Stmt visit(const Store *op) override { bool exceeds_lanecount = op->index.type().lanes() > max_lanes; if (exceeds_lanecount) { // Split up into multiple stores int num_vecs = (op->index.type().lanes() + max_lanes - 1) / max_lanes; std::vector<Stmt> assignments; assignments.reserve(num_vecs); for (int i = 0; i < num_vecs; ++i) { int lane_start = i * max_lanes; int lane_count_for_vec = std::min(op->value.type().lanes() - lane_start, max_lanes); Expr rhs = ExtractLanes(lane_start, lane_count_for_vec).mutate(op->value); Expr index = ExtractLanes(lane_start, lane_count_for_vec).mutate(op->index); Expr predicate =
ExtractLanes(lane_start, lane_count_for_vec).mutate(op->predicate); assignments.push_back(Store::make( op->name, std::move(rhs), std::move(index), op->param, std::move(predicate), op->alignment + lane_start)); } Stmt result = Block::make(assignments); debug(3) << "Legalized store " << Stmt(op) << " => " << result << "\n"; return result; } return IRMutator::visit(op); } Expr visit(const Shuffle *op) override { internal_assert(op->type.lanes() <= max_lanes) << Expr(op); bool requires_mutation = false; for (const auto &vec : op->vectors) { if (vec.type().lanes() > max_lanes) { requires_mutation = true; break; } } if (requires_mutation) { debug(4) << "Legalizing Shuffle " << Expr(op) << "\n"; // We are dealing with a shuffle of an exceeding-lane-count vector argument. // We can assume the variable here has extracted lane variables in surrounding Lets. // So let's hope it's a simple case, and we can legalize. vector<Expr> new_vectors; vector<pair<int, int>> vector_and_lane_indices = op->vector_and_lane_indices(); for (int i = 0; i < int(op->vectors.size()); ++i) { const Expr &vec = op->vectors[i]; if (vec.type().lanes() > max_lanes) { debug(4) << " Arg " << i << ": " << vec << "\n"; int num_vecs = (vec.type().lanes() + max_lanes - 1) / max_lanes; for (int j = 0; j < num_vecs; j++) { int lane_start = j * max_lanes; int lane_count_for_vec = std::min(vec.type().lanes() - lane_start, max_lanes); new_vectors.push_back(ExtractLanes(lane_start, lane_count_for_vec).mutate(vec)); } } else { new_vectors.push_back(IRMutator::mutate(vec)); } } Expr result = simplify(Shuffle::make(new_vectors, op->indices)); debug(3) << "Legalized " << Expr(op) << " => " << result << "\n"; return result; } return IRMutator::visit(op); } Expr visit(const VectorReduce *op) override { const Expr &arg = op->value; if (arg.type().lanes() > max_lanes) { // TODO: The transformation below is not allowed under strict_float, but // I don't immediately know what to do here. // This should be an internal_assert.
+ + internal_assert(op->type.lanes() == 1) + << "Vector legalization currently does not support VectorReduce with lanes != 1: " << Expr(op) + << legalization_error_guide; + int num_vecs = (arg.type().lanes() + max_lanes - 1) / max_lanes; + Expr result; + for (int i = 0; i < num_vecs; i++) { + int lane_start = i * max_lanes; + int lane_count_for_vec = std::min(arg.type().lanes() - lane_start, max_lanes); + Expr partial_arg = mutate(ExtractLanes(lane_start, lane_count_for_vec).mutate(arg)); + Expr partial_red = VectorReduce::make(op->op, std::move(partial_arg), op->type.lanes()); + if (i == 0) { + result = partial_red; + } else { + switch (op->op) { + case VectorReduce::Add: + result = result + partial_red; + break; + case VectorReduce::SaturatingAdd: + result = saturating_add(result, partial_red); + break; + case VectorReduce::Mul: + result = result * partial_red; + break; + case VectorReduce::Min: + result = min(result, partial_red); + break; + case VectorReduce::Max: + result = max(result, partial_red); + break; + case VectorReduce::And: + result = result && partial_red; + break; + case VectorReduce::Or: + result = result || partial_red; + break; + } + } + } + return result; + } else { + return IRMutator::visit(op); + } + } + +public: + LegalizeVectors(int max_lanes) + : max_lanes(max_lanes) { + internal_assert(max_lanes != 0) << "LegalizeVectors should not be called when there is no lane limit."; + } +}; + +} // namespace + +Stmt legalize_vectors_in_device_loop(const For *op) { + int max_lanes = max_lanes_for_device(op->device_api, 0); + + // Similar to CSE, lifting out stuff into variables. + // Pass 1): lift out Shuffles that exceed lane count into variables + // Pass 2): Rewrite those vector variables as bundles of vector variables, while legalizing all other stuff. 
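+ // For example, with max_lanes == 4, a hypothetical 8-lane binding + // let t = concat(a, b) + // is rewritten into the 4-lane bindings + // let t.lanes_0_3 = a + // let t.lanes_4_7 = b + // after which uses of t are rewritten chunk by chunk (see vec_name above).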
+ Stmt m0 = simplify(op->body); Stmt m1 = common_subexpression_elimination(m0, false); if (!m1.same_as(op->body)) { debug(3) << "After CSE:\n" << m1 << "\n"; } Stmt m2 = LiftLetToLetStmt().mutate(m1); if (!m2.same_as(m1)) { debug(3) << "After lifting Lets to LetStmts:\n" << m2 << "\n"; } Stmt m3 = m2; while (true) { Stmt m = LiftExceedingVectors(max_lanes).mutate(m3); bool modified = !m3.same_as(m); m3 = std::move(m); if (!modified) { debug(3) << "Nothing got lifted out\n"; break; } else { debug(3) << "After lifting exceeding vectors:\n" << m3 << "\n"; } } Stmt m4 = LegalizeVectors(max_lanes).mutate(m3); if (!m4.same_as(m3)) { debug(3) << "After legalizing vectors:\n" << m4 << "\n"; } if (m4.same_as(m2)) { debug(3) << "Vector legalization did nothing, returning input.\n"; return op; } Stmt m5 = simplify(m4); if (!m4.same_as(m5)) { debug(3) << "After simplify:\n" << m5 << "\n"; } return For::make(op->name, op->min, op->extent, op->for_type, op->partition_policy, op->device_api, m5); } Stmt legalize_vectors(const Stmt &s) { class LegalizeDeviceLoops : public IRMutator { using IRMutator::visit; Stmt visit(const For *op) override { if (max_lanes_for_device(op->device_api, 0)) { return legalize_vectors_in_device_loop(op); } else { return IRMutator::visit(op); } } } mutator; return mutator.mutate(s); } } // namespace Internal } // namespace Halide diff --git a/src/LegalizeVectors.h b/src/LegalizeVectors.h new file mode 100644 index 000000000000..14fe8d806fb1 --- /dev/null +++ b/src/LegalizeVectors.h @@ -0,0 +1,19 @@ +#ifndef HALIDE_INTERNAL_LEGALIZE_VECTORS_H +#define HALIDE_INTERNAL_LEGALIZE_VECTORS_H + +#include "Expr.h" + +/** \file * Defines a lowering pass that legalizes vectorized expressions * so that they do not exceed a device-specific maximum lane count.
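* For example, Metal, Vulkan, WebGPU and D3D12Compute only support vectors of up to 4 lanes, so a 32-lane vectorized expression inside a loop marked for one of those device APIs is rewritten into 4-lane chunks.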
+ */ + +namespace Halide { +namespace Internal { + +Stmt legalize_vectors(const Stmt &s); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/Lower.cpp b/src/Lower.cpp index fcbc66747242..c5503166b418 100644 --- a/src/Lower.cpp +++ b/src/Lower.cpp @@ -42,6 +42,7 @@ #include "InjectHostDevBufferCopies.h" #include "Inline.h" #include "LICM.h" +#include "LegalizeVectors.h" #include "LoopCarry.h" #include "LowerParallelTasks.h" #include "LowerWarpShuffles.h" @@ -444,6 +445,10 @@ void lower_impl(const vector &output_funcs, s = flatten_nested_ramps(s); log("Lowering after flattening nested ramps:", s); + debug(1) << "Legalizing vectors...\n"; + s = legalize_vectors(s); + log("Lowering after legalizing vectors:", s); + debug(1) << "Removing dead allocations and moving loop invariant code...\n"; s = remove_dead_allocations(s); s = simplify(s); diff --git a/src/Simplify_Let.cpp b/src/Simplify_Let.cpp index c7d598d5cf71..3f8a9b45538d 100644 --- a/src/Simplify_Let.cpp +++ b/src/Simplify_Let.cpp @@ -98,7 +98,7 @@ Body Simplify::simplify_let(const LetOrLetStmt *op, ExprInfo *info) { Expr new_var = Variable::make(f.new_value.type(), f.new_name); Expr replacement = new_var; - debug(4) << "simplify let " << op->name << " = " << f.value << " in...\n"; + debug(4) << "simplify let " << op->name << " = (" << f.value.type() << ") " << f.value << " in...\n"; while (true) { const Variable *var = f.new_value.template as(); @@ -180,6 +180,16 @@ Body Simplify::simplify_let(const LetOrLetStmt *op, ExprInfo *info) { f.new_value = cast->value; new_var = Variable::make(f.new_value.type(), f.new_name); replacement = substitute(f.new_name, Cast::make(cast->type, new_var), replacement); + } else if (shuffle && shuffle->is_concat() && is_pure(shuffle)) { + // Substitute in all concatenates as they will likely simplify + // with other shuffles. + // As the structure of this while loop makes it hard to peel off + // pure operations from _all_ arguments to the Shuffle, we will + // instead substitute all of the vars that go in the shuffle, and + // instead guard against side effects by checking with `is_pure()`. + replacement = substitute(f.new_name, shuffle, replacement); + f.new_value = Expr(); + break; } else if (shuffle && shuffle->is_slice()) { // Replacing new_value below might free the shuffle // indices vector, so save them now. diff --git a/src/Simplify_Shuffle.cpp b/src/Simplify_Shuffle.cpp index aecb4c6fc99a..5c84cea8d195 100644 --- a/src/Simplify_Shuffle.cpp +++ b/src/Simplify_Shuffle.cpp @@ -5,6 +5,7 @@ namespace Halide { namespace Internal { +using std::pair; using std::vector; Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { @@ -25,9 +26,11 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { } } - // Mutate the vectors vector new_vectors; + vector new_indices = op->indices; bool changed = false; + + // Mutate the vectors for (const Expr &vector : op->vectors) { ExprInfo v_info; Expr new_vector = mutate(vector, &v_info); @@ -45,48 +48,151 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { new_vectors.push_back(new_vector); } - // Try to convert a load with shuffled indices into a - // shuffle of a dense load. + // A concat of one vector, is just the vector. + // (Early check, this is repeated below, once the argument list is potentially reduced) + if (op->vectors.size() == 1 && op->is_concat()) { + return new_vectors[0]; + } + + Expr result = op; + + // Analyze which input vectors are actually used. 
We will rewrite + // the vector of inputs and the indices jointly, and continue with + // those below. + { + vector arg_used(new_vectors.size()); + // Figure out if all extracted lanes come from 1 component. + vector> src_vec_and_lane_idx = op->vector_and_lane_indices(); + for (int i = 0; i < int(op->indices.size()); ++i) { + arg_used[src_vec_and_lane_idx[i].first] = true; + } + size_t num_args_used = 0; + for (bool used : arg_used) { + if (used) { + num_args_used++; + } + } + + if (num_args_used < op->vectors.size()) { + // Not all arguments to the shuffle are used by the indices. + // Let's throw them out. + for (int vi = arg_used.size() - 1; vi >= 0; --vi) { + if (!arg_used[vi]) { + int lanes_deleted = op->vectors[vi].type().lanes(); + int vector_start_lane = 0; + for (int i = 0; i < vi; ++i) { + vector_start_lane += op->vectors[i].type().lanes(); + } + for (int &new_index : new_indices) { + if (new_index > vector_start_lane) { + internal_assert(new_index >= vector_start_lane + lanes_deleted); + new_index -= lanes_deleted; + } + } + new_vectors.erase(new_vectors.begin() + vi); + } + } + + changed = true; + } + } + + // Replace the op with the intermediate simplified result (if it changed), and continue. + if (changed) { + result = Shuffle::make(new_vectors, new_indices); + op = result.as(); + changed = false; + } + + if (new_vectors.size() == 1) { + const Ramp *ramp = new_vectors[0].as(); + if (ramp && op->is_slice()) { + int first_lane_in_src = op->indices[0]; + int slice_stride = op->slice_stride(); + if (slice_stride >= 1) { + return mutate(Ramp::make(ramp->base + first_lane_in_src * ramp->stride, + ramp->stride * slice_stride, + op->indices.size()), + nullptr); + } + } + + // Test this again, but now after new_vectors got potentially shorter. + if (op->is_concat()) { + return new_vectors[0]; + } + } + + // Try to convert a Shuffle of Loads into a single Load of a Ramp. + // Make sure to not undo the work of the StageStridedLoads pass: + // only if the result of the shuffled indices is a *dense* ramp, we + // can proceed. There are two side cases: concatenations of scalars, + // and when the loads weren't dense to begin with. if (const Load *first_load = new_vectors[0].as()) { vector load_predicates; vector load_indices; + bool all_loads_are_dense = true; bool unpredicated = true; + bool concat_of_scalars = true; for (const Expr &e : new_vectors) { const Load *load = e.as(); if (load && load->name == first_load->name) { load_predicates.push_back(load->predicate); load_indices.push_back(load->index); unpredicated = unpredicated && is_const_one(load->predicate); + if (const Ramp *index_ramp = load->index.as()) { + if (!is_const_one(index_ramp->stride)) { + all_loads_are_dense = false; + } + } else if (!load->index.type().is_scalar()) { + all_loads_are_dense = false; + } + if (!load->index.type().is_scalar()) { + concat_of_scalars = false; + } } else { break; } } + debug(3) << "Shuffle of Load found: " << result << " where" + << " all_loads_are_dense=" << all_loads_are_dense << "," + << " concat_of_scalars=" << concat_of_scalars << "\n"; + if (load_indices.size() == new_vectors.size()) { + // All of the Shuffle arguments are Loads. 
Type t = load_indices[0].type().with_lanes(op->indices.size()); Expr shuffled_index = Shuffle::make(load_indices, op->indices); + debug(3) << " Shuffled index: " << shuffled_index << "\n"; ExprInfo shuffled_index_info; shuffled_index = mutate(shuffled_index, &shuffled_index_info); - if (shuffled_index.as<Ramp>()) { - ExprInfo base_info; - if (const Ramp *r = shuffled_index.as<Ramp>()) { - mutate(r->base, &base_info); - } + debug(3) << " Simplified shuffled index: " << shuffled_index << "\n"; + if (const Ramp *index_ramp = shuffled_index.as<Ramp>()) { + if (is_const_one(index_ramp->stride) || !all_loads_are_dense || concat_of_scalars) { + ExprInfo base_info; + mutate(index_ramp->base, &base_info); - ModulusRemainder alignment = - ModulusRemainder::intersect(base_info.alignment, shuffled_index_info.alignment); + ModulusRemainder alignment = + ModulusRemainder::intersect(base_info.alignment, shuffled_index_info.alignment); - Expr shuffled_predicate; - if (unpredicated) { - shuffled_predicate = const_true(t.lanes(), nullptr); - } else { - shuffled_predicate = Shuffle::make(load_predicates, op->indices); - shuffled_predicate = mutate(shuffled_predicate, nullptr); + Expr shuffled_predicate; + if (unpredicated) { + shuffled_predicate = const_true(t.lanes(), nullptr); + } else { + shuffled_predicate = Shuffle::make(load_predicates, op->indices); + shuffled_predicate = mutate(shuffled_predicate, nullptr); + } + t = first_load->type; + t = t.with_lanes(op->indices.size()); + Expr result = Load::make(t, first_load->name, shuffled_index, first_load->image, first_load->param, shuffled_predicate, alignment); + debug(3) << " => " << result << "\n"; + return result; } - t = first_load->type; - t = t.with_lanes(op->indices.size()); - return Load::make(t, first_load->name, shuffled_index, first_load->image, first_load->param, shuffled_predicate, alignment); + } else { + // The index didn't simplify to a ramp. Leave it as a Shuffle of Loads. + // Note: don't fall through to the rules below. + return result; } } } @@ -256,6 +362,14 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { } } + for (size_t i = 0; i < new_vectors.size() && can_collapse; i++) { + if (new_vectors[i].as<Load>()) { + // Don't create a Ramp of a Load, like: + // ramp(buf[x], buf[x + 1] - buf[x], ...) + can_collapse = false; + } + } + if (can_collapse) { return Ramp::make(new_vectors[0], stride, op->indices.size()); } @@ -289,13 +403,18 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { if (inner_shuffle->is_concat()) { int slice_min = op->indices.front(); int slice_max = op->indices.back(); + if (slice_min > slice_max) { + // Slices can go backward. + std::swap(slice_min, slice_max); + } int concat_index = 0; int new_slice_start = -1; vector<Expr> new_concat_vectors; for (const auto &v : inner_shuffle->vectors) { // Check if current concat vector overlaps with slice.
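+ // (The vector occupying lanes [concat_index, concat_index + lanes - 1] overlaps the slice [slice_min, slice_max] iff the max of the minima <= the min of the maxima, which is what the rewrite below computes.)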
- if ((concat_index >= slice_min && concat_index <= slice_max) || - ((concat_index + v.type().lanes() - 1) >= slice_min && (concat_index + v.type().lanes() - 1) <= slice_max)) { + int overlap_max = std::min(slice_max, concat_index + v.type().lanes() - 1); + int overlap_min = std::max(slice_min, concat_index); + if (overlap_min <= overlap_max) { if (new_slice_start < 0) { new_slice_start = concat_index; } @@ -305,17 +424,16 @@ Expr Simplify::visit(const Shuffle *op, ExprInfo *info) { concat_index += v.type().lanes(); } if (new_concat_vectors.size() < inner_shuffle->vectors.size()) { - return Shuffle::make_slice(Shuffle::make_concat(new_concat_vectors), op->slice_begin() - new_slice_start, op->slice_stride(), op->indices.size()); + return Shuffle::make_slice(Shuffle::make_concat(new_concat_vectors), + op->slice_begin() - new_slice_start, + op->slice_stride(), + op->indices.size()); } } } } - if (!changed) { - return op; - } else { - return Shuffle::make(new_vectors, op->indices); - } + return result; } } // namespace Internal diff --git a/src/VectorizeLoops.cpp b/src/VectorizeLoops.cpp index 2d149adbaf20..fc6fd9531983 100644 --- a/src/VectorizeLoops.cpp +++ b/src/VectorizeLoops.cpp @@ -732,8 +732,8 @@ class VectorSubs : public IRMutator { if (op->is_intrinsic(Call::prefetch)) { // We don't want prefetch args to ve vectorized, but we can't just skip the mutation - // (otherwise we can end up with dead loop variables. Instead, use extract_lane() on each arg - // to scalarize it again. + // (otherwise we can end up with dead loop variables). Instead, use extract_lane() on + // each arg to scalarize it again. for (auto &arg : new_args) { if (arg.type().is_vector()) { arg = extract_lane(arg, 0); diff --git a/src/runtime/vulkan_internal.h b/src/runtime/vulkan_internal.h index aeef545385cc..ef6376a58f7d 100644 --- a/src/runtime/vulkan_internal.h +++ b/src/runtime/vulkan_internal.h @@ -279,6 +279,8 @@ const char *vk_get_error_name(VkResult error) { return "VK_ERROR_FORMAT_NOT_SUPPORTED"; case VK_ERROR_FRAGMENTED_POOL: return "VK_ERROR_FRAGMENTED_POOL"; + case VK_ERROR_UNKNOWN: + return "VK_ERROR_UNKNOWN"; case VK_ERROR_SURFACE_LOST_KHR: return "VK_ERROR_SURFACE_LOST_KHR"; case VK_ERROR_NATIVE_WINDOW_IN_USE_KHR: @@ -302,6 +304,8 @@ const char *vk_get_error_name(VkResult error) { } } +#define vk_report_error(user_context, code, func) (error((user_context)) << "Vulkan: " << (func) << " returned " << vk_get_error_name((code)) << " (code: " << (code) << ") ") + // -------------------------------------------------------------------------- } // namespace diff --git a/src/runtime/vulkan_resources.h b/src/runtime/vulkan_resources.h index c5c3c6620a9f..09e532a50c48 100644 --- a/src/runtime/vulkan_resources.h +++ b/src/runtime/vulkan_resources.h @@ -85,7 +85,7 @@ int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, debug(user_context) << " vk_create_command_pool (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " - << "queue_index: " << queue_index << ")\n"; + << "queue_index: " << queue_index << ")"; #endif if (allocator == nullptr) { @@ -103,7 +103,7 @@ int vk_create_command_pool(void *user_context, VulkanMemoryAllocator *allocator, VkResult result = vkCreateCommandPool(allocator->current_device(), &command_pool_info, allocator->callbacks(), command_pool); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create command pool!\n"; + vk_report_error(user_context, result, "vkCreateCommandPool"); return 
halide_error_code_generic_error; } return halide_error_code_success; } @@ -117,7 +117,7 @@ int vk_destroy_command_pool(void *user_context, VulkanMemoryAllocator *allocator << "command_pool: " << (void *)command_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy command pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkResetCommandPool(allocator->current_device(), command_pool, VK_COMMAND_POOL_RESET_RELEASE_RESOURCES_BIT); @@ -135,7 +135,7 @@ int vk_create_command_buffer(void *user_context, VulkanMemoryAllocato << "command_pool: " << (void *)command_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create command buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -150,7 +150,7 @@ int vk_create_command_buffer(void *user_context, VulkanMemoryAllocato VkResult result = vkAllocateCommandBuffers(allocator->current_device(), &command_buffer_info, command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to allocate command buffers!\n"; + vk_report_error(user_context, result, "vkAllocateCommandBuffers"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -165,7 +165,7 @@ int vk_destroy_command_buffer(void *user_context, VulkanMemoryAllocat << "command_buffer: " << (void *)command_buffer << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy command buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy command buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -231,7 +231,7 @@ int vk_fill_command_buffer_with_dispatch_call(void *user_context, VkResult result = vkBeginCommandBuffer(command_buffer, &command_buffer_begin_info); if (result != VK_SUCCESS) { - error(user_context) << "vkBeginCommandBuffer returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkBeginCommandBuffer"); return halide_error_code_generic_error; } @@ -242,7 +242,7 @@ int vk_fill_command_buffer_with_dispatch_call(void *user_context, result = vkEndCommandBuffer(command_buffer); if (result != VK_SUCCESS) { - error(user_context) << "vkEndCommandBuffer returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkEndCommandBuffer"); return halide_error_code_generic_error; } @@ -272,7 +272,7 @@ int vk_submit_command_buffer(void *user_context, VkQueue queue, VkCommandBuffer VkResult result = vkQueueSubmit(queue, 1, &submit_info, VK_NULL_HANDLE); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkQueueSubmit returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkQueueSubmit"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -327,7 +327,7 @@ int vk_create_descriptor_pool(void *user_context, << "storage_buffer_count: " << (uint32_t)storage_buffer_count << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor pool ...
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -364,7 +364,7 @@ int vk_create_descriptor_pool(void *user_context, VkResult result = vkCreateDescriptorPool(allocator->current_device(), &descriptor_pool_info, allocator->callbacks(), descriptor_pool); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create descriptor pool! vkCreateDescriptorPool returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateDescriptorPool"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -380,7 +380,7 @@ int vk_destroy_descriptor_pool(void *user_context, << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy descriptor pool ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkDestroyDescriptorPool(allocator->current_device(), descriptor_pool, allocator->callbacks()); @@ -404,7 +404,7 @@ int vk_create_descriptor_set_layout(void *user_context, << "layout: " << (void *)layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -462,7 +462,7 @@ int vk_create_descriptor_set_layout(void *user_context, // Create the descriptor set layout VkResult result = vkCreateDescriptorSetLayout(allocator->current_device(), &layout_info, allocator->callbacks(), layout); if (result != VK_SUCCESS) { - error(user_context) << "vkCreateDescriptorSetLayout returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateDescriptorSetLayout"); return halide_error_code_generic_error; } @@ -480,7 +480,7 @@ int vk_destroy_descriptor_set_layout(void *user_context, << "layout: " << (void *)descriptor_set_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy descriptor set layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } vkDestroyDescriptorSetLayout(allocator->current_device(), descriptor_set_layout, allocator->callbacks()); @@ -502,7 +502,7 @@ int vk_create_descriptor_set(void *user_context, << "descriptor_pool: " << (void *)descriptor_pool << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set ... 
invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -517,7 +517,7 @@ int vk_create_descriptor_set(void *user_context, VkResult result = vkAllocateDescriptorSets(allocator->current_device(), &descriptor_set_info, descriptor_set); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkAllocateDescriptorSets returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkAllocateDescriptorSets"); return halide_error_code_generic_error; } @@ -543,7 +543,7 @@ int vk_update_descriptor_set(void *user_context, << "descriptor_set: " << (void *)descriptor_set << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create descriptor set ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -600,7 +600,7 @@ int vk_update_descriptor_set(void *user_context, // retrieve the buffer from the region VkBuffer *device_buffer = reinterpret_cast<VkBuffer *>(owner->handle); if (device_buffer == nullptr) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!"; return halide_error_code_internal_error; } @@ -673,7 +673,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... invalid allocator pointer!"; return nullptr; } @@ -686,7 +686,7 @@ MemoryRegion *vk_create_scalar_uniform_buffer(void *user_context, // allocate a new region MemoryRegion *region = allocator->reserve(user_context, request); if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!\n"; + error(user_context) << "Vulkan: Failed to create scalar uniform buffer ... unable to allocate device memory!"; return nullptr; } @@ -708,19 +708,19 @@ int vk_update_scalar_uniform_buffer(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } if ((region == nullptr) || (region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid memory region!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... invalid memory region!"; return halide_error_code_internal_error; } // map the region to a host ptr uint8_t *host_ptr = (uint8_t *)allocator->map(user_context, region); if (host_ptr == nullptr) { - error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!\n"; + error(user_context) << "Vulkan: Failed to update scalar uniform buffer ... unable to map host pointer to device memory!"; return halide_error_code_internal_error; } @@ -748,7 +748,7 @@ int vk_destroy_scalar_uniform_buffer(void *user_context, VulkanMemoryAllocator * << "scalar_args_region: " << (void *)scalar_args_region << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ...
invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy scalar uniform buffer ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -782,7 +782,7 @@ int vk_create_pipeline_layout(void *user_context, << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create pipeline layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create pipeline layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -791,7 +791,7 @@ int vk_create_pipeline_layout(void *user_context, if (descriptor_set_count > max_bound_descriptor_sets) { error(user_context) << "Vulkan: Number of descriptor sets for pipeline layout exceeds the number that can be bound by device!\n" << " requested: " << descriptor_set_count << "," - << " available: " << max_bound_descriptor_sets << "\n"; + << " available: " << max_bound_descriptor_sets; return halide_error_code_incompatible_device_interface; } } @@ -808,7 +808,7 @@ int vk_create_pipeline_layout(void *user_context, VkResult result = vkCreatePipelineLayout(allocator->current_device(), &pipeline_layout_info, allocator->callbacks(), pipeline_layout); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: vkCreatePipelineLayout returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreatePipelineLayout"); return halide_error_code_generic_error; } return halide_error_code_success; @@ -826,7 +826,7 @@ int vk_destroy_pipeline_layout(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy pipeline layout ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy pipeline layout ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -848,11 +848,12 @@ int vk_create_compute_pipeline(void *user_context, debug(user_context) << " vk_create_compute_pipeline (user_context: " << user_context << ", " << "allocator: " << (void *)allocator << ", " + << "pipeline_name: " << pipeline_name << ", " << "shader_module: " << (void *)shader_module << ", " << "pipeline_layout: " << (void *)pipeline_layout << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to create compute pipeline ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -878,7 +879,10 @@ int vk_create_compute_pipeline(void *user_context, VkResult result = vkCreateComputePipelines(allocator->current_device(), VK_NULL_HANDLE, 1, &compute_pipeline_info, allocator->callbacks(), compute_pipeline); if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to create compute pipeline! vkCreateComputePipelines returned " << vk_get_error_name(result) << "\n"; + vk_report_error(user_context, result, "vkCreateComputePipelines") + << "failed to create compute pipeline " << pipeline_name << ".\n" + << " (This might be a bug in Halide. To debug this, see the HL_SPIRV_DUMP_FILE environment variable, and use the Khronos validator to make a bug report)"; + return halide_error_code_generic_error; } @@ -905,24 +909,24 @@ int vk_setup_compute_pipeline(void *user_context, #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ...
invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid shader bindings!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid shader bindings!"; return halide_error_code_generic_error; } if (dispatch_data == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... invalid dispatch data!"; return halide_error_code_generic_error; } VkResult result = VK_SUCCESS; const char *entry_point_name = shader_bindings->entry_point_name; if (entry_point_name == nullptr) { - error(user_context) << "Vulkan: Failed to setup compute pipeline ... missing entry point name!\n"; + error(user_context) << "Vulkan: Failed to setup compute pipeline ... missing entry point name!"; return halide_error_code_generic_error; } @@ -945,7 +949,7 @@ int vk_setup_compute_pipeline(void *user_context, } else { // dynamic allocation if (shared_mem_constant_id > 0) { - error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is suported!!\n"; + error(user_context) << "Vulkan: Multiple dynamic shared memory allocations found! Only one is supported!"; result = VK_ERROR_TOO_MANY_OBJECTS; break; } @@ -978,13 +982,13 @@ int vk_setup_compute_pipeline(void *user_context, if (static_shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of static shared memory used exceeds device limit!\n" << " requested: " << static_shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } if (dispatch_data->shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of dynamic shared memory used exceeds device limit!\n" << " requested: " << dispatch_data->shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } } @@ -1015,14 +1019,14 @@ int vk_setup_compute_pipeline(void *user_context, } } if (found_index == invalid_index) { - error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!\n"; + error(user_context) << "Vulkan: Failed to locate dispatch constant index for shader binding!"; result = VK_ERROR_INITIALIZATION_FAILED; } } // don't even attempt to create the pipeline layout if we encountered errors in the shader binding if (result != VK_SUCCESS) { - error(user_context) << "Vulkan: Failed to decode shader bindings! 
" << vk_get_error_name(result); return halide_error_code_generic_error; } @@ -1050,7 +1054,7 @@ int vk_setup_compute_pipeline(void *user_context, if (shader_bindings->compute_pipeline) { int error_code = vk_destroy_compute_pipeline(user_context, allocator, shader_bindings->compute_pipeline); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to destroy compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to destroy compute pipeline!"; return halide_error_code_generic_error; } shader_bindings->compute_pipeline = VK_NULL_HANDLE; @@ -1058,7 +1062,7 @@ int vk_setup_compute_pipeline(void *user_context, int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, &specialization_info, &(shader_bindings->compute_pipeline)); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline!"; return error_code; } @@ -1068,7 +1072,7 @@ int vk_setup_compute_pipeline(void *user_context, if (shader_bindings->compute_pipeline == VK_NULL_HANDLE) { int error_code = vk_create_compute_pipeline(user_context, allocator, entry_point_name, shader_module, pipeline_layout, nullptr, &(shader_bindings->compute_pipeline)); if (error_code != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to create compute pipeline!\n"; + error(user_context) << "Vulkan: Failed to create compute pipeline!"; return error_code; } } @@ -1088,7 +1092,7 @@ int vk_destroy_compute_pipeline(void *user_context, << "compute_pipeline: " << (void *)compute_pipeline << ")\n"; #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to destroy compute pipeline ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to destroy compute pipeline ... invalid allocator pointer!"; return halide_error_code_generic_error; } @@ -1110,12 +1114,12 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid allocator pointer!"; return nullptr; } if ((module_ptr == nullptr) || (module_size < (2 * sizeof(uint32_t)))) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... invalid module buffer!"; return nullptr; } @@ -1163,7 +1167,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA uint32_t idx = 1; // skip past the header_word_count uint32_t shader_count = module_ptr[idx++]; if (shader_count < 1) { - error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings ... no descriptors found!"; return nullptr; // no descriptors } @@ -1172,7 +1176,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t shader_bindings_size = shader_count * sizeof(VulkanShaderBinding); VulkanShaderBinding *shader_bindings = (VulkanShaderBinding *)vk_host_malloc(user_context, shader_bindings_size, 0, alloc_scope, allocator->callbacks()); if (shader_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to allocate shader_bindings! 
Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate shader_bindings! Out of memory!"; return nullptr; } memset(shader_bindings, 0, shader_bindings_size); @@ -1205,7 +1209,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t specialization_constants_size = specialization_constants_count * sizeof(VulkanSpecializationConstant); specialization_constants = (VulkanSpecializationConstant *)vk_host_malloc(user_context, specialization_constants_size, 0, alloc_scope, allocator->callbacks()); if (specialization_constants == nullptr) { - error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate specialization_constants! Out of memory!"; return nullptr; } memset(specialization_constants, 0, specialization_constants_size); @@ -1241,7 +1245,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA size_t shared_memory_allocations_size = shared_memory_allocations_count * sizeof(VulkanSharedMemoryAllocation); shared_memory_allocations = (VulkanSharedMemoryAllocation *)vk_host_malloc(user_context, shared_memory_allocations_size, 0, alloc_scope, allocator->callbacks()); if (shared_memory_allocations == nullptr) { - error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate shared_memory_allocations! Out of memory!"; return nullptr; } memset(shared_memory_allocations, 0, shared_memory_allocations_size); @@ -1306,7 +1310,7 @@ VulkanShaderBinding *vk_decode_shader_bindings(void *user_context, VulkanMemoryA #endif shader_bindings[n].entry_point_name = (char *)vk_host_malloc(user_context, entry_point_name_length * sizeof(uint32_t), 0, alloc_scope, allocator->callbacks()); if (shader_bindings[n].entry_point_name == nullptr) { - error(user_context) << "Vulkan: Failed to allocate entry_point_name! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate entry_point_name! 
Out of memory!"; return nullptr; } @@ -1358,7 +1362,7 @@ int vk_validate_shader_for_device(void *user_context, VulkanMemoryAllocator *all if (static_shared_mem_bytes > device_shared_mem_size) { error(user_context) << "Vulkan: Amount of static shared memory used exceeds device limit!\n" << " requested: " << static_shared_mem_bytes << " bytes," - << " available: " << device_shared_mem_size << " bytes\n"; + << " available: " << device_shared_mem_size << " bytes"; return halide_error_code_incompatible_device_interface; } } @@ -1370,7 +1374,7 @@ int vk_validate_shader_for_device(void *user_context, VulkanMemoryAllocator *all if (shader_count > max_descriptors) { error(user_context) << "Vulkan: Number of required descriptor sets exceeds the amount available for device!\n" << " requested: " << shader_count << "," - << " available: " << max_descriptors << "\n"; + << " available: " << max_descriptors; return halide_error_code_incompatible_device_interface; } } @@ -1466,7 +1470,7 @@ VulkanCompilationCacheEntry *vk_compile_kernel_module(void *user_context, Vulkan // Compile the "SPIR-V Module" for the kernel cache_entry->compiled_modules[i] = vk_compile_shader_module(user_context, allocator, (const char *)spirv_ptr, (int)spirv_size); if (cache_entry->compiled_modules[i] == nullptr) { - debug(user_context) << "Vulkan: Failed to compile shader module!\n"; + debug(user_context) << "Vulkan: Failed to compile shader module!"; error_code = halide_error_code_generic_error; } @@ -1506,12 +1510,12 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM #endif if (allocator == nullptr) { - error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!\n"; + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid allocator pointer!"; return nullptr; } if ((ptr == nullptr) || (size <= 0)) { - error(user_context) << "Vulkan: Failed to compile shader modules ... invalid program source buffer!\n"; + error(user_context) << "Vulkan: Failed to compile shader modules ... invalid program source buffer!"; return nullptr; } @@ -1549,7 +1553,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM VkSystemAllocationScope alloc_scope = VkSystemAllocationScope::VK_SYSTEM_ALLOCATION_SCOPE_OBJECT; VulkanCompiledShaderModule *compiled_module = (VulkanCompiledShaderModule *)vk_host_malloc(user_context, sizeof(VulkanCompiledShaderModule), 0, alloc_scope, allocator->callbacks()); if (compiled_module == nullptr) { - error(user_context) << "Vulkan: Failed to allocate compilation cache entry! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate compilation cache entry! 
Out of memory!"; return nullptr; } memset(compiled_module, 0, sizeof(VulkanCompiledShaderModule)); @@ -1557,7 +1561,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM // decode the entry point data and extract the shader bindings VulkanShaderBinding *decoded_bindings = vk_decode_shader_bindings(user_context, allocator, module_ptr, module_size); if (decoded_bindings == nullptr) { - error(user_context) << "Vulkan: Failed to decode shader bindings!\n"; + error(user_context) << "Vulkan: Failed to decode shader bindings!"; return nullptr; } @@ -1574,8 +1578,8 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM compiled_module->shader_count = shader_count; VkResult result = vkCreateShaderModule(allocator->current_device(), &shader_info, allocator->callbacks(), &compiled_module->shader_module); - if ((result != VK_SUCCESS)) { - error(user_context) << "Vulkan: vkCreateShaderModule Failed! Error returned: " << vk_get_error_name(result) << "\n"; + if (result != VK_SUCCESS) { + vk_report_error(user_context, result, "vkCreateShaderModule"); vk_host_free(user_context, compiled_module->shader_bindings, allocator->callbacks()); vk_host_free(user_context, compiled_module, allocator->callbacks()); return nullptr; @@ -1585,7 +1589,7 @@ VulkanCompiledShaderModule *vk_compile_shader_module(void *user_context, VulkanM if (compiled_module->shader_count) { compiled_module->descriptor_set_layouts = (VkDescriptorSetLayout *)vk_host_malloc(user_context, compiled_module->shader_count * sizeof(VkDescriptorSetLayout), 0, alloc_scope, allocator->callbacks()); if (compiled_module->descriptor_set_layouts == nullptr) { - error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!\n"; + error(user_context) << "Vulkan: Failed to allocate descriptor set layouts for cache entry! Out of memory!"; return nullptr; } memset(compiled_module->descriptor_set_layouts, 0, compiled_module->shader_count * sizeof(VkDescriptorSetLayout)); @@ -1758,7 +1762,7 @@ int vk_do_multidimensional_copy(void *user_context, VkCommandBuffer command_buff VkBuffer *src_buffer = reinterpret_cast<VkBuffer *>(c.src); VkBuffer *dst_buffer = reinterpret_cast<VkBuffer *>(c.dst); if (!src_buffer || !dst_buffer) { - error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!\n"; + error(user_context) << "Vulkan: Failed to retrieve buffer for device memory!"; return halide_error_code_internal_error; } @@ -1796,7 +1800,7 @@ int vk_device_crop_from_offset(void *user_context, VulkanContext ctx(user_context); if (ctx.error != halide_error_code_success) { - error(user_context) << "Vulkan: Failed to acquire context!\n"; + error(user_context) << "Vulkan: Failed to acquire context!"; return ctx.error; } @@ -1805,21 +1809,21 @@ int vk_device_crop_from_offset(void *user_context, #endif if (offset < 0) { - error(user_context) << "Vulkan: Invalid offset for device crop!\n"; + error(user_context) << "Vulkan: Invalid offset for device crop!"; return halide_error_code_device_crop_failed; } // get the allocated region for the device MemoryRegion *device_region = reinterpret_cast<MemoryRegion *>(src->device); if (device_region == nullptr) { - error(user_context) << "Vulkan: Failed to crop region! Invalide device region!\n"; + error(user_context) << "Vulkan: Failed to crop region! 
Invalid device region!"; return halide_error_code_device_crop_failed; } // create the croppeg region from the allocated region MemoryRegion *cropped_region = ctx.allocator->create_crop(user_context, device_region, (uint64_t)offset); if ((cropped_region == nullptr) || (cropped_region->handle == nullptr)) { - error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!\n"; + error(user_context) << "Vulkan: Failed to crop region! Unable to create memory region!"; return halide_error_code_device_crop_failed; } diff --git a/test/correctness/CMakeLists.txt b/test/correctness/CMakeLists.txt index 4bce8789875e..fc610b7d90de 100644 --- a/test/correctness/CMakeLists.txt +++ b/test/correctness/CMakeLists.txt @@ -223,6 +223,7 @@ tests(GROUPS correctness median3x3.cpp metal_precompiled_shaders.cpp memoize_cloned.cpp + metal_long_vectors.cpp min_extent.cpp mod.cpp mul_div_mod.cpp diff --git a/test/error/metal_vector_too_large.cpp b/test/correctness/metal_long_vectors.cpp similarity index 89% rename from test/error/metal_vector_too_large.cpp rename to test/correctness/metal_long_vectors.cpp index bf4c74bb75a0..74c2e981fc2d 100644 --- a/test/error/metal_vector_too_large.cpp +++ b/test/correctness/metal_long_vectors.cpp @@ -9,7 +9,7 @@ int main(int argc, char **argv) { Var x("x"), y("y"); f(x, y) = input(x, y) + 42; - f.vectorize(x, 16).gpu_blocks(y, DeviceAPI::Metal); + f.vectorize(x, 32).gpu_blocks(y, DeviceAPI::Metal); std::string test_object = Internal::get_test_tmp_dir() + "metal_vector_too_large.o"; Target mac_target("x86-64-osx-metal"); diff --git a/test/correctness/require.cpp b/test/correctness/require.cpp index 625383f460df..58226077d971 100644 --- a/test/correctness/require.cpp +++ b/test/correctness/require.cpp @@ -9,7 +9,7 @@ void halide_error(JITUserContext *ctx, const char *msg) { // Emitting "error.*:" to stdout or stderr will cause CMake to report the // test as a failure on Windows, regardless of error code returned, // hence the abbreviation to "err". 
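// (Runtime error messages now appear to carry their own trailing newline, which is why the printf below drops its explicit "\n".)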
- printf("Saw (Expected) Halide Err: %s\n", msg); + printf("Saw (Expected) Halide Err: %s", msg); error_occurred = true; } @@ -46,14 +46,18 @@ static void test(int vector_width) { if (!error_occurred) { printf("There should have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } + printf("\n"); + p1.set(1); p2.set(kPrime1 - 1); error_occurred = false; result = f.realize({realize_width}); if (error_occurred) { - printf("There should not have been a requirement error (vector_width = %d)\n", vector_width); + printf("There should NOT have been a requirement error (vector_width = %d)\n", vector_width); exit(1); } for (int i = 0; i < realize_width; ++i) { @@ -64,6 +68,8 @@ static void test(int vector_width) { exit(1); } } + printf("OK\n"); + printf("\n"); ImageParam input(Int(32), 2); Expr h = require(p1 == p2, p1); @@ -81,8 +87,12 @@ static void test(int vector_width) { if (!error_occurred) { printf("There should have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } + printf("\n"); + p1.set(16); p2.set(16); @@ -91,6 +101,8 @@ static void test(int vector_width) { if (error_occurred) { printf("There should NOT have been a requirement error (vector_width = %d)\n", vector_width); exit(1); + } else { + printf("OK\n"); } } diff --git a/test/correctness/simd_op_check.h b/test/correctness/simd_op_check.h index 53af05c5795f..ca507ce07038 100644 --- a/test/correctness/simd_op_check.h +++ b/test/correctness/simd_op_check.h @@ -506,20 +506,27 @@ class SimdOpCheckTest { })); } + std::vector failed_tests; + constexpr int tabstop = 32; for (auto &f : futures) { auto result = f.get(); - constexpr int tabstop = 32; const int spaces = std::max(1, tabstop - (int)result.op.size()); std::cout << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; if (!result.error_msg.empty()) { std::cerr << result.error_msg; - // The thread-pool destructor will block until in-progress tasks - // are done, and then will discard any tasks that haven't been - // launched yet. - return false; + failed_tests.push_back(std::move(result)); } } + if (!failed_tests.empty()) { + std::cerr << "SIMD op check summary: " << failed_tests.size() << " tests failed:\n"; + for (auto &result : failed_tests) { + const int spaces = std::max(1, tabstop - (int)result.op.size()); + std::cerr << " " << result.op << std::string(spaces, ' ') << "(" << run_target_str << ")\n"; + } + return false; + } + return true; } diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp index 5da8e85d8b23..241152df2342 100644 --- a/test/correctness/simd_op_check_hvx.cpp +++ b/test/correctness/simd_op_check_hvx.cpp @@ -54,16 +54,24 @@ class SimdOpCheckHVX : public SimdOpCheckTest { isa_version = 62; } + auto valign_test_u8 = [&](int off) { + return in_u8(x + off) + in_u8(x + off + 1); + }; + + auto valign_test_u16 = [&](int off) { + return in_u16(x + off) + in_u16(x + off + 1); + }; + // Verify that unaligned loads use the right instructions, and don't try to use // immediates of more than 3 bits. 
- check("valign(v*,v*,#7)", hvx_width / 1, in_u8(x + 7)); - check("vlalign(v*,v*,#7)", hvx_width / 1, in_u8(x + hvx_width - 7)); - check("valign(v*,v*,r*)", hvx_width / 1, in_u8(x + 8)); - check("valign(v*,v*,r*)", hvx_width / 1, in_u8(x + hvx_width - 8)); - check("valign(v*,v*,#6)", hvx_width / 1, in_u16(x + 3)); - check("vlalign(v*,v*,#6)", hvx_width / 1, in_u16(x + hvx_width - 3)); - check("valign(v*,v*,r*)", hvx_width / 1, in_u16(x + 4)); - check("valign(v*,v*,r*)", hvx_width / 1, in_u16(x + hvx_width - 4)); + check("valign(v*,v*,#7)", hvx_width / 1, valign_test_u8(6)); + check("vlalign(v*,v*,#7)", hvx_width / 1, valign_test_u8(hvx_width - 7)); + check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u8(8)); + check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u8(hvx_width - 8)); + check("valign(v*,v*,#6)", hvx_width / 1, valign_test_u16(3)); + check("vlalign(v*,v*,#6)", hvx_width / 1, valign_test_u16(hvx_width - 3)); + check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u16(4)); + check("valign(v*,v*,r*)", hvx_width / 1, valign_test_u16(hvx_width - 4)); check("vunpack(v*.ub)", hvx_width / 1, u16(u8_1)); check("vunpack(v*.ub)", hvx_width / 1, i16(u8_1)); diff --git a/test/correctness/specialize.cpp b/test/correctness/specialize.cpp index 1a807003f72a..8df87dd27333 100644 --- a/test/correctness/specialize.cpp +++ b/test/correctness/specialize.cpp @@ -128,6 +128,11 @@ int main(int argc, char **argv) { } } + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } + // Should have used vector stores if (!vector_store || scalar_store) { printf("This was supposed to use vector stores\n"); @@ -156,6 +161,11 @@ int main(int argc, char **argv) { } } + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } + // Should have used scalar stores if (vector_store || !scalar_store) { printf("This was supposed to use scalar stores\n"); @@ -243,6 +253,10 @@ int main(int argc, char **argv) { // Check we don't crash with the small input, and that it uses scalar stores reset_trace(); f.realize({5}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return 1; @@ -254,6 +268,10 @@ int main(int argc, char **argv) { reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return 1; @@ -282,6 +300,10 @@ int main(int argc, char **argv) { // Check we used scalar stores for a strided input. 
reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (!scalar_store || vector_store) { printf("These stores were supposed to be scalar.\n"); return 1; @@ -293,6 +315,10 @@ int main(int argc, char **argv) { reset_trace(); f.realize({100}); + if (!vector_store && !scalar_store) { + printf("No stores were reported\n"); + return 1; + } if (scalar_store || !vector_store) { printf("These stores were supposed to be vector.\n"); return 1; diff --git a/test/correctness/stage_strided_loads.cpp b/test/correctness/stage_strided_loads.cpp index f791385f7c25..dab19a370d93 100644 --- a/test/correctness/stage_strided_loads.cpp +++ b/test/correctness/stage_strided_loads.cpp @@ -10,7 +10,7 @@ class CheckForStridedLoads : public IRMutator { if (const Ramp *r = op->index.as<Ramp>()) { if (op->name == buf_name) { bool dense = is_const_one(r->stride); - found |= !dense; + found_strided_load |= !dense; dense_loads += dense; } } @@ -18,27 +18,27 @@ class CheckForStridedLoads : public IRMutator { } public: - bool found = false; + bool found_strided_load = false; int dense_loads = 0; std::string buf_name; void check(Func f, int desired_dense_loads, std::string name = "buf") { - found = false; + found_strided_load = false; dense_loads = 0; buf_name = name; f.add_custom_lowering_pass(this, nullptr); f.compile_jit(); - assert(!found); + assert(!found_strided_load); assert(dense_loads == desired_dense_loads); } void check_not(Func f, int desired_dense_loads, std::string name = "buf") { - found = false; + found_strided_load = false; dense_loads = 0; buf_name = name; f.add_custom_lowering_pass(this, nullptr); f.compile_jit(); - assert(found); + assert(found_strided_load); assert(dense_loads == desired_dense_loads); } } checker; diff --git a/test/correctness/vector_shuffle.cpp b/test/correctness/vector_shuffle.cpp index aff6fcbcddcf..f0a62ab3d8cd 100644 --- a/test/correctness/vector_shuffle.cpp +++ b/test/correctness/vector_shuffle.cpp @@ -1,10 +1,20 @@ #include "Halide.h" +#include <algorithm> +#include <cstdlib> #include <stdio.h> using namespace Halide; -int main(int argc, char **argv) { - Target target = get_jit_target_from_environment(); +int test_with_indices(const Target &target, const std::vector<int> &indices0, const std::vector<int> &indices1) { + printf("indices0:"); + for (int i : indices0) { + printf(" %d", i); + } + printf(" indices1:"); + for (int i : indices1) { + printf(" %d", i); + } + printf("\n"); Var x{"x"}, y{"y"}; Func f0{"f0"}, f1{"f1"}, g{"g"}; f1(x, y) = x * (y + 3); Expr vec1 = Internal::Shuffle::make_concat({f0(x, 0), f0(x, 1), f0(x, 2), f0(x, 3)}); Expr vec2 = Internal::Shuffle::make_concat({f1(x, 4), f1(x, 5), f1(x, 6), f1(x, 7)}); - std::vector<int> indices0; - std::vector<int> indices1; - if (!target.has_gpu_feature() || target.has_feature(Target::Feature::OpenCL) || target.has_feature(Target::Feature::CUDA)) { - indices0 = {3, 1, 6, 7, 2, 4, 0, 5}; - indices1 = {1, 0, 3, 4, 7, 0, 5, 2}; - } else { - indices0 = {3, 1, 6, 7}; - indices1 = {1, 0, 3, 4}; - } Expr shuffle1 = Internal::Shuffle::make({vec1, vec2}, indices0); Expr shuffle2 = Internal::Shuffle::make({vec1, vec2}, indices1); Expr result = shuffle1 * shuffle2; @@ -55,6 +56,94 @@ int main(int argc, char **argv) { return 1; } } + return 0; +} + +int main(int argc, char **argv) { + Target target = get_jit_target_from_environment(); + + for (int vec_size = 8; vec_size > 1; vec_size /= 2) { + printf("Testing vector size %d...\n", vec_size); + std::vector<int> indices0, 
indices1; + + // Test 1: All indices: forward/backward and combined + for (int i = 0; i < vec_size; ++i) { + indices0.push_back(i); // forward + indices1.push_back(vec_size - i - 1); // backward + } + printf(" All indices forward...\n"); + if (test_with_indices(target, indices0, indices0)) { + return 1; + } + printf(" All indices backward...\n"); + if (test_with_indices(target, indices1, indices1)) { + return 1; + } + printf(" All indices mixed forward / backward...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + // Test 2: Shuffled indices (4 repetitions) + for (int r = 0; r < 4; ++r) { + // Shuffle with Fisher-Yates + for (int i = vec_size - 1; i >= 1; --i) { + // indices0 + int idx = std::rand() % (i + 1); + std::swap(indices0[idx], indices0[i]); + // indices1 + idx = std::rand() % (i + 1); + std::swap(indices1[idx], indices1[i]); + } + printf(" Randomly shuffled...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + } + + // Test 3: Interleaved + indices0.clear(); + indices1.clear(); + for (int i = 0; i < vec_size / 2; ++i) { + // interleave (A, B) + indices0.push_back(i); + indices0.push_back(i + vec_size / 2); + + // interleave (B, A) + indices1.push_back(i + vec_size / 2); + indices1.push_back(i); + } + printf(" Interleaved...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + // Test 4: Concat (not really a concat unless vec_size == 8, since the input vectors have size 4) + indices0.clear(); + indices1.clear(); + for (int i = 0; i < vec_size; ++i) { + // concat (A, B) + indices0.push_back(i); + + // concat (B, A) + indices1.push_back((i + vec_size / 2) % vec_size); + } + printf(" Concat...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + + if (vec_size == 4) { + indices0 = {1, 3, 2, 0}; + indices1 = {2, 3, 1, 0}; + + printf(" Specific index combination, known to have caused problems...\n"); + if (test_with_indices(target, indices0, indices1)) { + return 1; + } + } + } + printf("Success!\n"); return 0; } diff --git a/test/error/CMakeLists.txt b/test/error/CMakeLists.txt index 41816d5ba36b..b7d6a380c504 100644 --- a/test/error/CMakeLists.txt +++ b/test/error/CMakeLists.txt @@ -79,7 +79,6 @@ tests(GROUPS error memoize_output_invalid.cpp memoize_redefine_eviction_key.cpp metal_threads_too_large.cpp - metal_vector_too_large.cpp mismatch_runtime_vscale.cpp missing_args.cpp no_default_device.cpp