Skip to content

Commit e23a33a

Browse files
committed
[VPlan] Set ZeroIsPoison=false whne lowering FirstActiveLane
When interleaving a loop with an early exit, the parts before the active lane will be all zero. Currently we emit @llvm.experimental.cttz.elts with ZeroIsPoison=true for these parts, which means that they will produce poison. We don't see any miscompiles today on AArch64 because it has the same lowering for cttz.elts regardless of ZeroIsPoison, but this may cause issues on RISC-V when interleaving. This fixes it by setting ZeroIsPoison=false. The codegen is worse when ZeroIsPoison=false and we could potentially recover it by enabling it again when UF=1, but this is left to another PR. This is split off from llvm#168738, where LastActiveLane can get expanded to a FirstActiveLane with an all-zeroes mask.
1 parent 86a82f2 commit e23a33a

File tree

9 files changed

+87
-87
lines changed

9 files changed

+87
-87
lines changed

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1005,7 +1005,7 @@ Value *VPInstruction::generate(VPTransformState &State) {
10051005
if (getNumOperands() == 1) {
10061006
Value *Mask = State.get(getOperand(0));
10071007
return Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(), Mask,
1008-
true, Name);
1008+
/*ZeroIsPoison=*/false, Name);
10091009
}
10101010
// If there are multiple operands, create a chain of selects to pick the
10111011
// first operand with an active lane and add the number of lanes of the
@@ -1021,9 +1021,9 @@ Value *VPInstruction::generate(VPTransformState &State) {
10211021
Builder.CreateICmpEQ(State.get(getOperand(Idx)),
10221022
Builder.getFalse()),
10231023
Builder.getInt64Ty())
1024-
: Builder.CreateCountTrailingZeroElems(Builder.getInt64Ty(),
1025-
State.get(getOperand(Idx)),
1026-
true, Name);
1024+
: Builder.CreateCountTrailingZeroElems(
1025+
Builder.getInt64Ty(), State.get(getOperand(Idx)),
1026+
/*ZeroIsPoison=*/false, Name);
10271027
Value *Current = Builder.CreateAdd(
10281028
Builder.CreateMul(RuntimeVF, Builder.getInt64(Idx)), TrailingZeros);
10291029
if (Res) {

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ define i64 @same_exit_block_pre_inc_use1() #1 {
4444
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[N_VEC]]
4545
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH]]
4646
; CHECK: vector.early.exit:
47-
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP16]], i1 true)
47+
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP16]], i1 false)
4848
; CHECK-NEXT: [[TMP20:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
4949
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP20]]
5050
; CHECK-NEXT: br label [[LOOP_END]]
@@ -125,7 +125,7 @@ define i64 @same_exit_block_pre_inc_use4() {
125125
; CHECK: middle.block:
126126
; CHECK-NEXT: br label [[LOOP_END:%.*]]
127127
; CHECK: vector.early.exit:
128-
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 true)
128+
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v2i1(<2 x i1> [[TMP4]], i1 false)
129129
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
130130
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP8]]
131131
; CHECK-NEXT: br label [[LOOP_END]]
@@ -187,7 +187,7 @@ define i64 @loop_contains_safe_call() #1 {
187187
; CHECK: middle.block:
188188
; CHECK-NEXT: br label [[LOOP_END:%.*]]
189189
; CHECK: vector.early.exit:
190-
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
190+
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false)
191191
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
192192
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP9]]
193193
; CHECK-NEXT: br label [[LOOP_END]]
@@ -256,7 +256,7 @@ define i64 @loop_contains_safe_div() #1 {
256256
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 64, [[INDEX1]]
257257
; CHECK-NEXT: br i1 [[CMP_N]], label [[LOOP_END:%.*]], label [[SCALAR_PH:%.*]]
258258
; CHECK: vector.early.exit:
259-
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP15]], i1 true)
259+
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv4i1(<vscale x 4 x i1> [[TMP15]], i1 false)
260260
; CHECK-NEXT: [[TMP16:%.*]] = add i64 [[INDEX2]], [[FIRST_ACTIVE_LANE]]
261261
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP16]]
262262
; CHECK-NEXT: br label [[LOOP_END]]
@@ -336,7 +336,7 @@ define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(
336336
; CHECK: middle.block:
337337
; CHECK-NEXT: br label [[LOOP_END:%.*]]
338338
; CHECK: vector.early.exit:
339-
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
339+
; CHECK-NEXT: [[FIRST_ACTIVE_LANE:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false)
340340
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX1]], [[FIRST_ACTIVE_LANE]]
341341
; CHECK-NEXT: [[EARLY_EXIT_VALUE:%.*]] = add i64 3, [[TMP11]]
342342
; CHECK-NEXT: br label [[LOOP_END]]
@@ -483,12 +483,12 @@ exit:
483483
define i64 @same_exit_block_requires_interleaving() {
484484
; CHECK-LABEL: define i64 @same_exit_block_requires_interleaving() {
485485
; CHECK-NEXT: entry:
486-
; CHECK-NEXT: [[P1:%.*]] = alloca [128 x %my.struct], align 8
486+
; CHECK-NEXT: [[P1:%.*]] = alloca [128 x [[MY_STRUCT:%.*]]], align 8
487487
; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 256)
488488
; CHECK-NEXT: br label [[LOOP:%.*]]
489489
; CHECK: loop:
490490
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 3, [[ENTRY:%.*]] ]
491-
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [128 x %my.struct], ptr [[P1]], i64 0, i64 [[INDEX]]
491+
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [128 x [[MY_STRUCT]]], ptr [[P1]], i64 0, i64 [[INDEX]]
492492
; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
493493
; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], 3
494494
; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[LOOP_END:%.*]]

llvm/test/Transforms/LoopVectorize/AArch64/single-early-exit-interleave.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,20 +79,20 @@ define i64 @same_exit_block_pre_inc_use1() #0 {
7979
; CHECK: vector.early.exit:
8080
; CHECK-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64()
8181
; CHECK-NEXT: [[TMP40:%.*]] = mul nuw i64 [[TMP39]], 16
82-
; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP59]], i1 true)
82+
; CHECK-NEXT: [[TMP41:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP59]], i1 false)
8383
; CHECK-NEXT: [[TMP42:%.*]] = mul i64 [[TMP40]], 3
8484
; CHECK-NEXT: [[TMP43:%.*]] = add i64 [[TMP42]], [[TMP41]]
85-
; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 true)
85+
; CHECK-NEXT: [[TMP44:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP31]], i1 false)
8686
; CHECK-NEXT: [[TMP45:%.*]] = mul i64 [[TMP40]], 2
8787
; CHECK-NEXT: [[TMP46:%.*]] = add i64 [[TMP45]], [[TMP44]]
8888
; CHECK-NEXT: [[TMP47:%.*]] = icmp ne i64 [[TMP44]], [[TMP40]]
8989
; CHECK-NEXT: [[TMP48:%.*]] = select i1 [[TMP47]], i64 [[TMP46]], i64 [[TMP43]]
90-
; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 true)
90+
; CHECK-NEXT: [[TMP49:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP30]], i1 false)
9191
; CHECK-NEXT: [[TMP50:%.*]] = mul i64 [[TMP40]], 1
9292
; CHECK-NEXT: [[TMP51:%.*]] = add i64 [[TMP50]], [[TMP49]]
9393
; CHECK-NEXT: [[TMP52:%.*]] = icmp ne i64 [[TMP49]], [[TMP40]]
9494
; CHECK-NEXT: [[TMP53:%.*]] = select i1 [[TMP52]], i64 [[TMP51]], i64 [[TMP48]]
95-
; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 true)
95+
; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.nxv16i1(<vscale x 16 x i1> [[TMP32]], i1 false)
9696
; CHECK-NEXT: [[TMP55:%.*]] = mul i64 [[TMP40]], 0
9797
; CHECK-NEXT: [[TMP56:%.*]] = add i64 [[TMP55]], [[TMP61]]
9898
; CHECK-NEXT: [[TMP57:%.*]] = icmp ne i64 [[TMP61]], [[TMP40]]

llvm/test/Transforms/LoopVectorize/single-early-exit-cond-poison.ll

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,9 @@ define noundef i32 @f(i32 noundef %g) {
3131
; VF4IC2: [[MIDDLE_BLOCK]]:
3232
; VF4IC2-NEXT: br label %[[RETURN:.*]]
3333
; VF4IC2: [[VECTOR_EARLY_EXIT]]:
34-
; VF4IC2-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
34+
; VF4IC2-NEXT: [[TMP9:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false)
3535
; VF4IC2-NEXT: [[TMP10:%.*]] = add i64 4, [[TMP9]]
36-
; VF4IC2-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
36+
; VF4IC2-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false)
3737
; VF4IC2-NEXT: [[TMP12:%.*]] = add i64 0, [[TMP11]]
3838
; VF4IC2-NEXT: [[TMP13:%.*]] = icmp ne i64 [[TMP11]], 4
3939
; VF4IC2-NEXT: [[TMP14:%.*]] = select i1 [[TMP13]], i64 [[TMP12]], i64 [[TMP10]]
@@ -64,7 +64,7 @@ define noundef i32 @f(i32 noundef %g) {
6464
; VF8IC1: [[MIDDLE_BLOCK]]:
6565
; VF8IC1-NEXT: br label %[[RETURN:.*]]
6666
; VF8IC1: [[VECTOR_EARLY_EXIT]]:
67-
; VF8IC1-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 true)
67+
; VF8IC1-NEXT: [[TMP5:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v8i1(<8 x i1> [[TMP2]], i1 false)
6868
; VF8IC1-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
6969
; VF8IC1-NEXT: [[TMP7:%.*]] = add i32 0, [[TMP6]]
7070
; VF8IC1-NEXT: br label %[[RETURN]]

llvm/test/Transforms/LoopVectorize/single-early-exit-deref-assumptions.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si
2828
; CHECK: [[MIDDLE_BLOCK]]:
2929
; CHECK-NEXT: br label %[[LOOP_END:.*]]
3030
; CHECK: [[VECTOR_EARLY_EXIT]]:
31-
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 true)
31+
; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP4]], i1 false)
3232
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]]
3333
; CHECK-NEXT: br label %[[LOOP_END]]
3434
; CHECK: [[LOOP_END]]:
@@ -140,7 +140,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero(ptr n
140140
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
141141
; CHECK-NEXT: br i1 [[CMP_N]], label %[[LOOP_END_LOOPEXIT:.*]], label %[[SCALAR_PH]]
142142
; CHECK: [[VECTOR_EARLY_EXIT]]:
143-
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true)
143+
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false)
144144
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[TMP7]]
145145
; CHECK-NEXT: br label %[[LOOP_END_LOOPEXIT]]
146146
; CHECK: [[SCALAR_PH]]:
@@ -336,7 +336,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_n_not_zero_i16_p
336336
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
337337
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
338338
; CHECK: [[VECTOR_EARLY_EXIT]]:
339-
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 true)
339+
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP5]], i1 false)
340340
; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[INDEX]], [[TMP10]]
341341
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 2
342342
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP12]]
@@ -431,7 +431,7 @@ define ptr @find_deref_pointer_distance_align_attribute_argument(ptr align 2 %fi
431431
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
432432
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
433433
; CHECK: [[VECTOR_EARLY_EXIT]]:
434-
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
434+
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false)
435435
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]]
436436
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
437437
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]]
@@ -525,7 +525,7 @@ define ptr @find_deref_pointer_distance_align_assumption(ptr %first, ptr %last)
525525
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
526526
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
527527
; CHECK: [[VECTOR_EARLY_EXIT]]:
528-
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
528+
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false)
529529
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]]
530530
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
531531
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[FIRST]], i64 [[TMP13]]
@@ -602,7 +602,7 @@ define i64 @early_exit_alignment_and_deref_known_via_assumption_with_constant_si
602602
; CHECK: [[MIDDLE_BLOCK]]:
603603
; CHECK-NEXT: br label %[[LOOP_END:.*]]
604604
; CHECK: [[VECTOR_EARLY_EXIT]]:
605-
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 true)
605+
; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP2]], i1 false)
606606
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], [[TMP7]]
607607
; CHECK-NEXT: br label %[[LOOP_END]]
608608
; CHECK: [[LOOP_END]]:
@@ -740,7 +740,7 @@ define i64 @find_if_pointer_distance_deref_via_assumption(ptr %vec) nofree nosyn
740740
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]]
741741
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT_LOOPEXIT:.*]], label %[[SCALAR_PH]]
742742
; CHECK: [[VECTOR_EARLY_EXIT]]:
743-
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 true)
743+
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.experimental.cttz.elts.i64.v4i1(<4 x i1> [[TMP6]], i1 false)
744744
; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], [[TMP11]]
745745
; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2
746746
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[BEGIN]], i64 [[TMP13]]

0 commit comments

Comments
 (0)