-
Notifications
You must be signed in to change notification settings - Fork 14.5k
[LV] Ensure getScaledReductions only matches extends inside the loop #148264
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-transforms Author: David Sherwood (david-arm) Changes: In getScaledReductions, for the case where we try to match a partial reduction of the form: %phi = phi i32 ... where %zext = zext i8 %some_val to i32, we should ensure that %zext is actually inside the loop. Fixes #148260. Full diff: https://github.com/llvm/llvm-project/pull/148264.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f3de24aa4c3d1..7a00e94efb228 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8109,7 +8109,7 @@ bool VPRecipeBuilder::getScaledReductions(
std::optional<unsigned> BinOpc;
Type *ExtOpTypes[2] = {nullptr};
- auto CollectExtInfo = [&Exts,
+ auto CollectExtInfo = [this, &Exts,
&ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
unsigned I = 0;
for (Value *OpI : Ops) {
@@ -8117,6 +8117,11 @@ bool VPRecipeBuilder::getScaledReductions(
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
return false;
Exts[I] = cast<Instruction>(OpI);
+
+ // Other operand should live inside the loop
+ if (!CM.TheLoop->contains(Exts[I]))
+ return false;
+
ExtOpTypes[I] = ExtOp->getType();
I++;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 5d5ee570da0ff..70c8019a2115f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -911,6 +911,253 @@ for.exit: ; preds = %for.body
ret i32 %add
}
+define void @add_of_zext_outside_loop(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED: vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20]] = add i32 [[VEC_PHI]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add i32 [[VEC_PHI1]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %d to i32
+ %a.promoted = load i32, ptr %a, align 1
+ %conv1 = zext i8 %c to i32
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+ %idxprom = sext i32 %iv to i64
+ %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+ store i8 0, ptr %arrayidx, align 1
+ %rdx.next = add nsw i32 %rdx, %conv1
+ %iv.next = add i32 %iv, 4
+ %cmp = icmp eq i32 %iv.next, 0
+ br i1 %cmp, label %exit, label %for.body
+
+exit:
+ %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+ store i32 %add.lcssa, ptr %a, align 4
+ ret void
+}
+
+define void @add_of_loop_invariant_zext(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED: vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add i32 [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add i32 [[VEC_PHI1]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %d to i32
+ %a.promoted = load i32, ptr %a, align 1
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+ %idxprom = sext i32 %iv to i64
+ %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+ store i8 0, ptr %arrayidx, align 1
+ %conv1 = zext i8 %c to i32
+ %rdx.next = add nsw i32 %rdx, %conv1
+ %iv.next = add i32 %iv, 4
+ %cmp = icmp eq i32 %iv.next, 0
+ br i1 %cmp, label %exit, label %for.body
+
+exit:
+ %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+ store i32 %add.lcssa, ptr %a, align 4
+ ret void
+}
+
+
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
|
@llvm/pr-subscribers-vectorizers Author: David Sherwood (david-arm) Changes: In getScaledReductions, for the case where we try to match a partial reduction of the form: %phi = phi i32 ... where %zext = zext i8 %some_val to i32, we should ensure that %zext is actually inside the loop. Fixes #148260. Full diff: https://github.com/llvm/llvm-project/pull/148264.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index f3de24aa4c3d1..7a00e94efb228 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8109,7 +8109,7 @@ bool VPRecipeBuilder::getScaledReductions(
std::optional<unsigned> BinOpc;
Type *ExtOpTypes[2] = {nullptr};
- auto CollectExtInfo = [&Exts,
+ auto CollectExtInfo = [this, &Exts,
&ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
unsigned I = 0;
for (Value *OpI : Ops) {
@@ -8117,6 +8117,11 @@ bool VPRecipeBuilder::getScaledReductions(
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp))))
return false;
Exts[I] = cast<Instruction>(OpI);
+
+ // Other operand should live inside the loop
+ if (!CM.TheLoop->contains(Exts[I]))
+ return false;
+
ExtOpTypes[I] = ExtOp->getType();
I++;
}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index 5d5ee570da0ff..70c8019a2115f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -911,6 +911,253 @@ for.exit: ; preds = %for.body
ret i32 %add
}
+define void @add_of_zext_outside_loop(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED: vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext i32 [[TMP15]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP18]], align 1
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP20]] = add i32 [[VEC_PHI]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add i32 [[VEC_PHI1]], [[CONV1]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP21]], [[TMP20]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_zext_outside_loop(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %d to i32
+ %a.promoted = load i32, ptr %a, align 1
+ %conv1 = zext i8 %c to i32
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+ %idxprom = sext i32 %iv to i64
+ %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+ store i8 0, ptr %arrayidx, align 1
+ %rdx.next = add nsw i32 %rdx, %conv1
+ %iv.next = add i32 %iv, 4
+ %cmp = icmp eq i32 %iv.next, 0
+ br i1 %cmp, label %exit, label %for.body
+
+exit:
+ %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+ store i32 %add.lcssa, ptr %a, align 4
+ ret void
+}
+
+define void @add_of_loop_invariant_zext(ptr noalias %a, ptr noalias %b, i8 %c, i8 %d) #0 {
+; CHECK-INTERLEAVE1-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVE1-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVE1-NEXT: entry:
+; CHECK-INTERLEAVE1-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-INTERLEAVE1: for.body:
+; CHECK-INTERLEAVE1-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-INTERLEAVE1-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-INTERLEAVE1-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVE1-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-INTERLEAVE1-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-INTERLEAVE1-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-INTERLEAVE1: exit:
+; CHECK-INTERLEAVE1-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-INTERLEAVE1-NEXT: ret void
+;
+; CHECK-INTERLEAVED-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-INTERLEAVED-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-INTERLEAVED-NEXT: entry:
+; CHECK-INTERLEAVED-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-INTERLEAVED-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = lshr i32 [[TMP0]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
+; CHECK-INTERLEAVED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
+; CHECK-INTERLEAVED: vector.scevcheck:
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = trunc i8 [[D]] to i2
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sub i2 0, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext i2 [[TMP4]] to i32
+; CHECK-INTERLEAVED-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[TMP5]], 0
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sub i32 -4, [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = lshr i32 [[TMP6]], 2
+; CHECK-INTERLEAVED-NEXT: [[MUL:%.*]] = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 4, i32 [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i32, i1 } [[MUL]], 0
+; CHECK-INTERLEAVED-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i32, i1 } [[MUL]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = add i32 [[CONV]], [[MUL_RESULT]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp slt i32 [[TMP8]], [[CONV]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW]]
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = or i1 [[IDENT_CHECK]], [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 2
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul i32 [[N_VEC]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = add i32 [[CONV]], [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext i8 [[C]] to i32
+; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-INTERLEAVED: vector.body:
+; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi i32 [ [[A_PROMOTED]], [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul i32 [[INDEX]], 4
+; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[CONV]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = add i32 [[OFFSET_IDX]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext i32 [[OFFSET_IDX]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = sext i32 [[TMP16]] to i64
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[TMP18]]
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP19]], align 1
+; CHECK-INTERLEAVED-NEXT: store i8 0, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add i32 [[VEC_PHI]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add i32 [[VEC_PHI1]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVED: middle.block:
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP22]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-INTERLEAVED: scalar.ph:
+;
+; CHECK-MAXBW-LABEL: define void @add_of_loop_invariant_zext(
+; CHECK-MAXBW-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], i8 [[C:%.*]], i8 [[D:%.*]]) #[[ATTR0]] {
+; CHECK-MAXBW-NEXT: entry:
+; CHECK-MAXBW-NEXT: [[CONV:%.*]] = zext i8 [[D]] to i32
+; CHECK-MAXBW-NEXT: [[A_PROMOTED:%.*]] = load i32, ptr [[A]], align 1
+; CHECK-MAXBW-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK-MAXBW: for.body:
+; CHECK-MAXBW-NEXT: [[IV:%.*]] = phi i32 [ [[CONV]], [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[RDX:%.*]] = phi i32 [ [[A_PROMOTED]], [[ENTRY]] ], [ [[RDX_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: [[IDXPROM:%.*]] = sext i32 [[IV]] to i64
+; CHECK-MAXBW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[B]], i64 0, i64 [[IDXPROM]]
+; CHECK-MAXBW-NEXT: store i8 0, ptr [[ARRAYIDX]], align 1
+; CHECK-MAXBW-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32
+; CHECK-MAXBW-NEXT: [[RDX_NEXT]] = add nsw i32 [[RDX]], [[CONV1]]
+; CHECK-MAXBW-NEXT: [[IV_NEXT]] = add i32 [[IV]], 4
+; CHECK-MAXBW-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV_NEXT]], 0
+; CHECK-MAXBW-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[FOR_BODY]]
+; CHECK-MAXBW: exit:
+; CHECK-MAXBW-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT]], [[FOR_BODY]] ]
+; CHECK-MAXBW-NEXT: store i32 [[ADD_LCSSA]], ptr [[A]], align 4
+; CHECK-MAXBW-NEXT: ret void
+;
+entry:
+ %conv = zext i8 %d to i32
+ %a.promoted = load i32, ptr %a, align 1
+ br label %for.body
+
+for.body:
+ %iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ]
+ %rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ]
+ %idxprom = sext i32 %iv to i64
+ %arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom
+ store i8 0, ptr %arrayidx, align 1
+ %conv1 = zext i8 %c to i32
+ %rdx.next = add nsw i32 %rdx, %conv1
+ %iv.next = add i32 %iv, 4
+ %cmp = icmp eq i32 %iv.next, 0
+ br i1 %cmp, label %exit, label %for.body
+
+exit:
+ %add.lcssa = phi i32 [ %rdx.next, %for.body ]
+ store i32 %add.lcssa, ptr %a, align 4
+ ret void
+}
+
+
!0 = distinct !{!0, !1}
!1 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
|
&ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool { | ||
unsigned I = 0; | ||
for (Value *OpI : Ops) { | ||
Value *ExtOp; | ||
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) | ||
return false; | ||
Exts[I] = cast<Instruction>(OpI); | ||
|
||
// Other operand should live inside the loop | ||
if (!CM.TheLoop->contains(Exts[I])) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is there a reason it is difficult/not possible to support live-in operands?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe it is possible, but given we're close to the branch point I thought it might be better to disable it first and look at adding support for it afterwards. What do you think?
I think the problem lies in the cost model (specifically VPPartialReduction::computeCost) and some assumptions made about the ordering of PHI operands.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, TODO seems fine for now, although it would ideally be fixed at the source.
%iv = phi i32 [ %conv, %entry ], [ %iv.next, %for.body ] | ||
%rdx = phi i32 [ %a.promoted, %entry ], [ %rdx.next, %for.body ] | ||
%idxprom = sext i32 %iv to i64 | ||
%arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
%arrayidx = getelementptr inbounds [0 x i8], ptr %b, i64 0, i64 %idxprom | |
%arrayidx = getelementptr inbounds i8, ptr %b, i64 %idxprom |
That should work I think, w/o using an array type with zero elements.
%conv1 = zext i8 %c to i32 | ||
%rdx.next = add nsw i32 %rdx, %conv1 | ||
%iv.next = add i32 %iv, 4 | ||
%cmp = icmp eq i32 %iv.next, 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This could probably be something like
%cmp = icmp eq i32 %iv.next, 0 | |
%cmp = icmp eq i32 %iv.next, 1024 |
to avoid wrapping IV and various SCEV runtime checks.
%add.lcssa = phi i32 [ %rdx.next, %for.body ] | ||
store i32 %add.lcssa, ptr %a, align 4 | ||
ret void |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
%add.lcssa = phi i32 [ %rdx.next, %for.body ] | |
store i32 %add.lcssa, ptr %a, align 4 | |
ret void | |
%add.lcssa = phi i32 [ %rdx.next, %for.body ] | |
ret i32 %add.lcssa |
; | ||
entry: | ||
%conv = zext i8 %d to i32 | ||
%a.promoted = load i32, ptr %a, align 1 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we just pass i32 %a as an argument?
In getScaledReductions for the case where we try to match a partial reduction of the form: %phi = phi i32 ... ... %add = add i32 %phi, %zext where %zext = zext i8 %some_val to i32, we should ensure that %zext is actually inside the loop. Fixes llvm#148260
415c5ee
to
58356cc
Compare
; CHECK-MAXBW-NEXT: ret i32 [[ADD_LCSSA]] | ||
; | ||
entry: | ||
%conv = zext i8 %d to i32 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we're testing that partial reductions aren't emitted here: %conv = zext i8 %d to i32 is also present in the test add_of_loop_invariant_zext as the IV phi's operand.
Shouldn't %d just be an i32 here, so that we have fully independent tests for the IV phi having a zext operand and for the reduction add having a zext operand? Otherwise I think these tests are doing the same thing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've changed the tests now so they are a bit more obvious. I've removed the zext of %d in each test, and changed the loop to remove the SCEV checks and also avoid having to use scatter instructions when vectorising the store. Hopefully it should be clearer now: @add_of_zext_outside_loop doesn't use partial reductions, but @add_of_loop_invariant_zext does because the zext is inside the loop.
&ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool { | ||
unsigned I = 0; | ||
for (Value *OpI : Ops) { | ||
Value *ExtOp; | ||
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) | ||
return false; | ||
Exts[I] = cast<Instruction>(OpI); | ||
|
||
// Other operand should live inside the loop |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unnecessary comment which says the same as the code below it. I'd remove it or be more specific about why it's here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @david-arm!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, thanks
&ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool { | ||
unsigned I = 0; | ||
for (Value *OpI : Ops) { | ||
Value *ExtOp; | ||
if (!match(OpI, m_ZExtOrSExt(m_Value(ExtOp)))) | ||
return false; | ||
Exts[I] = cast<Instruction>(OpI); | ||
|
||
// Other operand should live inside the loop | ||
if (!CM.TheLoop->contains(Exts[I])) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ok, TODO seems fine for now, although it would ideally be fixed at the source.
In getScaledReductions for the case where we try to match a partial reduction of the form:
%phi = phi i32 ...
...
%add = add i32 %phi, %zext
where
%zext = zext i8 %some_val to i32
we should ensure that %zext is actually inside the loop.
Fixes #148260