[SLP] Fix cost estimation of external uses with wrong VF #148185

Conversation
@llvm/pr-subscribers-vectorizers
@llvm/pr-subscribers-llvm-transforms

Author: Gaëtan Bossu (gbossu)

Changes

The cost estimation for external uses assumed that the VF remains constant throughout the tree. That's not always true, which meant we could query the extraction cost for a lane that is out of bounds. We ran into this issue while experimenting with re-vectorisation for AArch64. We cannot add a proper AArch64 test, as more changes would need to be brought in. This commit only fixes the computation of the VF and adds an assert. Some tests were failing after adding the assert:
- foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
- test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
- test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
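To make the failure mode concrete, here is a minimal sketch using the names from the diff below, with vector widths taken from the new test_with_extract case; it is an annotated illustration, not the exact SLPVectorizer code:

```cpp
// Before the fix: a single bundle width for the whole tree, taken from the
// root entry. In test_with_extract the root is the <4 x double> store
// bundle, but the external scalar %a7 lives in the <8 x double> load entry.
unsigned BundleWidth = VectorizableTree[0]->Scalars.size(); // == 4
auto *VecTy = getWidenedType(ScalarTy, BundleWidth);        // <4 x double>
// EU.Lane == 7 for %a7, so the extract-cost query indexes past the vector.

// After the fix: the width comes from the entry the use actually belongs to.
const unsigned FixedWidth = EU.E.getVectorFactor();         // == 8
assert(EU.Lane < FixedWidth && "Extracted lane out of bounds.");
```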
Full diff: https://github.com/llvm/llvm-project/pull/148185.diff 2 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 7b77954e3a4ff..88da7015fc770 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3898,7 +3898,7 @@ class BoUpSLP {
/// When ReuseReorderShuffleIndices is empty it just returns position of \p
/// V within vector of Scalars. Otherwise, try to remap on its reuse index.
- int findLaneForValue(Value *V) const {
+ unsigned findLaneForValue(Value *V) const {
unsigned FoundLane = getVectorFactor();
for (auto *It = find(Scalars, V), *End = Scalars.end(); It != End;
std::advance(It, 1)) {
@@ -4344,7 +4344,7 @@ class BoUpSLP {
/// This POD struct describes one external user in the vectorized tree.
struct ExternalUser {
- ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, int L)
+ ExternalUser(Value *S, llvm::User *U, const TreeEntry &E, unsigned L)
: Scalar(S), User(U), E(E), Lane(L) {}
/// Which scalar in our function.
@@ -4357,7 +4357,7 @@ class BoUpSLP {
const TreeEntry &E;
/// Which lane does the scalar belong to.
- int Lane;
+ unsigned Lane;
};
using UserList = SmallVector<ExternalUser, 16>;
@@ -7901,7 +7901,7 @@ void BoUpSLP::buildExternalUses(
// Check if the scalar is externally used as an extra arg.
const auto ExtI = ExternallyUsedValues.find(Scalar);
if (ExtI != ExternallyUsedValues.end()) {
- int FoundLane = Entry->findLaneForValue(Scalar);
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane "
<< FoundLane << " from " << *Scalar << ".\n");
ScalarToExtUses.try_emplace(Scalar, ExternalUses.size());
@@ -7949,7 +7949,7 @@ void BoUpSLP::buildExternalUses(
if (U && Scalar->hasNUsesOrMore(UsesLimit))
U = nullptr;
- int FoundLane = Entry->findLaneForValue(Scalar);
+ unsigned FoundLane = Entry->findLaneForValue(Scalar);
LLVM_DEBUG(dbgs() << "SLP: Need to extract:" << *UserInst
<< " from lane " << FoundLane << " from " << *Scalar
<< ".\n");
@@ -14568,8 +14568,6 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
LLVM_DEBUG(dbgs() << "SLP: Calculating cost for tree of size "
<< VectorizableTree.size() << ".\n");
- unsigned BundleWidth = VectorizableTree[0]->Scalars.size();
-
SmallPtrSet<Value *, 4> CheckedExtracts;
for (unsigned I = 0, E = VectorizableTree.size(); I < E; ++I) {
TreeEntry &TE = *VectorizableTree[I];
@@ -14632,6 +14630,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
}
SmallDenseSet<std::pair<Value *, Value *>, 8> CheckedScalarUser;
for (ExternalUser &EU : ExternalUses) {
+ LLVM_DEBUG(dbgs() << "SLP: Computing cost for external use of TreeEntry "
+ << EU.E.Idx << " in lane " << EU.Lane << "\n");
+ LLVM_DEBUG(dbgs() << " User:" << *EU.User << "\n");
+ LLVM_DEBUG(dbgs() << " Use: " << EU.Scalar->getNameOrAsOperand() << "\n");
+
// Uses by ephemeral values are free (because the ephemeral value will be
// removed prior to code generation, and so the extraction will be
// removed as well).
@@ -14739,6 +14742,8 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
// for the extract and the added cost of the sign extend if needed.
InstructionCost ExtraCost = TTI::TCC_Free;
auto *ScalarTy = EU.Scalar->getType();
+ const unsigned BundleWidth = EU.E.getVectorFactor();
+ assert(EU.Lane < BundleWidth && "Extracted lane out of bounds.");
auto *VecTy = getWidenedType(ScalarTy, BundleWidth);
const TreeEntry *Entry = &EU.E;
auto It = MinBWs.find(Entry);
@@ -14752,10 +14757,14 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef<Value *> VectorizedVals,
VecTy = getWidenedType(MinTy, BundleWidth);
ExtraCost =
getExtractWithExtendCost(*TTI, Extend, ScalarTy, VecTy, EU.Lane);
+ LLVM_DEBUG(dbgs() << " ExtractExtend or ExtractSubvec cost: "
+ << ExtraCost << "\n");
} else {
ExtraCost =
getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
+ LLVM_DEBUG(dbgs() << " ExtractElement cost for " << *ScalarTy << " from "
+ << *VecTy << ": " << ExtraCost << "\n");
}
// Leave the scalar instructions as is if they are cheaper than extracts.
if (Entry->Idx != 0 || Entry->getOpcode() == Instruction::GetElementPtr ||
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
index ce26bd3b89392..e800b5e016b74 100644
--- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
@@ -31,3 +31,34 @@ define void @test() {
store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
ret void
}
+
+; Same as above, but %a7 is also used as a scalar and must be extracted from
+; the wide load. (Or in this case, kept as a scalar load).
+define double @test_with_extract() {
+; CHECK-LABEL: @test_with_extract(
+; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
+; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8
+; CHECK-NEXT: ret double [[A7]]
+;
+ %a0 = load double, ptr @src, align 8
+ %a1 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8
+ %a2 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8
+ %a3 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8
+ %a4 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8
+ %a5 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8
+ %a6 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8
+ %a7 = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
+ %res1 = fsub fast double %a0, %a1
+ %res2 = fsub fast double %a2, %a3
+ %res3 = fsub fast double %a4, %a5
+ %res4 = fsub fast double %a6, %a7
+ store double %res1, ptr @dst, align 8
+ store double %res2, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 1), align 8
+ store double %res3, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 2), align 8
+ store double %res4, ptr getelementptr inbounds ([8 x double], ptr @dst, i32 0, i64 3), align 8
+ ret double %a7
+}
define double @test_with_extract() {
; CHECK-LABEL: @test_with_extract(
; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8
; CHECK-NEXT:    [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8
One of the things I do find surprising is that the scalar load of %a7 is kept, instead of being extracted from the last lane of TMP1. I guess it is fine, but costing and codegen are inconsistent. Changing that behaviour is out of scope for this PR.
The cost of the original scalar is less than that of the extractelement, so SLP prefers to stick with the original scalar instead of generating an extractelement instruction.
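A hedged sketch of that trade-off, reusing the helper names from the diff above (the actual comparison in getTreeCost is more involved):

```cpp
// Keep the original scalar when re-materialising it is no more expensive
// than extracting it from the vectorised bundle.
InstructionCost ExtractCost =
    getVectorInstrCost(*TTI, ScalarTy, Instruction::ExtractElement, VecTy,
                       CostKind, EU.Lane, EU.Scalar, ScalarUserAndIdx);
InstructionCost ScalarCost =
    TTI->getInstructionCost(cast<Instruction>(EU.Scalar), CostKind);
// For test_with_extract this holds for %a7: the scalar load stays and no
// extractelement is emitted, even though the wide load covers lane 7.
bool KeepScalar = ScalarCost <= ExtractCost;
```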
Better to extract this test into a separate file and make it generate the remarks with the costs, to see how the patch affects the cost.
The cost is actually the same, because the Lane idx is essentially ignored by TTI implementations of getVectorInstrCost(). However, querying the cost for a lane that is out of bounds is still wrong.

In my local experiments with REVEC, I ended up querying the extraction cost of a sub-vector with an index that is out of bounds, and that did trigger an assert in getShuffleCost(). I just cannot find a test that reproduces the issue without bringing in more changes.

Do you still want me to separate the test into a different file and show the costs (even though they would not change)?
> The cost is actually the same, because the Lane idx is essentially ignored by TTI implementations of getVectorInstrCost(). However, querying the cost for a lane that is out of bounds is still wrong.

I don't think it is true. In many cases, an extract from Index 0 is cheaper than an extract from other lanes, so getVectorInstrCost checks for the lane index. I think you can see the difference if the original vector factor occupies more than a single register. In this case, some type legalization (spanning across registers) may add to the cost.

> In my local experiments with REVEC, I ended up querying the extraction cost of a sub-vector with an index that is out of bounds, and that did trigger an assert in getShuffleCost(). I just cannot find a test that reproduces the issue without bringing in more changes. Do you still want me to separate the test into a different file and show the costs (even though they would not change)?

Would be good to try to adjust the test to either cause a crash, or show cost changes.
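To illustrate the legalization point above, a hedged sketch against the generic TTI interface (the concrete costs are target-dependent; the 256-bit register width is an assumption for the example):

```cpp
// <8 x double> occupies two registers on a 256-bit SIMD target, so an
// extract from a high lane may be costlier than one from lane 0.
auto *VecTy = FixedVectorType::get(Type::getDoubleTy(Ctx), 8);
InstructionCost Lane0 = TTI.getVectorInstrCost(
    Instruction::ExtractElement, VecTy, TTI::TCK_RecipThroughput,
    /*Index=*/0, /*Op0=*/nullptr, /*Op1=*/nullptr);
InstructionCost Lane7 = TTI.getVectorInstrCost(
    Instruction::ExtractElement, VecTy, TTI::TCK_RecipThroughput,
    /*Index=*/7, /*Op0=*/nullptr, /*Op1=*/nullptr);
// On such targets Lane7 >= Lane0 is expected, since legalization must first
// move the upper half out of the second register.
```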
Sorry, I wasn't very precise: I meant that getVectorInstrCost implementations don't generally check whether the lane is in bounds, but they do indeed check for specific values like 0 or -1.

As mentioned in the description, the added assert does trigger crashes if BundleWidth isn't correctly computed. I thought this would be enough to show that the parameters given to getVectorInstrCost are incorrect.
Force-pushed from c82c78c to c7082aa with an updated commit message:
It assumed that the VF remains constant throughout the tree. That's not always true. This meant that we could query the extraction cost for a lane that is out of bounds.

While experimenting with re-vectorisation for AArch64, we ran into this issue. We cannot add a proper AArch64 test as more changes would need to be brought in.

This commit is only fixing the computation of VF and adding an assert. Some tests were failing after adding the assert:
- foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
- test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
- test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll
Force-pushed from c7082aa to a4b41eb with an updated commit message:
It assumed that the VF remains constant throughout the tree. That's not always true. This meant that we could query the extraction cost for a lane that is out of bounds.
While experimenting with re-vectorisation for AArch64, we ran into this issue. We cannot add a proper AArch64 test as more changes would need to be brought in.
This commit is only fixing the computation of VF and adding an assert. Some tests were failing after adding the assert:
- foo() in llvm/test/Transforms/SLPVectorizer/X86/horizontal.ll
- test() in llvm/test/Transforms/SLPVectorizer/X86/reduction-with-removed-extracts.ll
- test_with_extract() in llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll