Skip to content

Commit

Permalink
[LoadStoreVectorizer] Postprocess and merge equivalence classes (#121861
Browse files Browse the repository at this point in the history
)

This patch introduces a new method:

void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses)
const;

The method is called at the end of
Vectorizer::collectEquivalenceClasses() and is needed to merge
equivalence classes that differ only by their underlying objects (UO1
and UO2), where UO1 is 1-level-indirection underlying base for UO2. This
situation arises due to the limited lookup depth used during the search
of underlying bases with llvm::getUnderlyingObject(ptr).

Using any fixed lookup depth can result into creation of multiple
equivalence classes that only differ by 1-level indirection bases.

The new approach merges equivalence classes if they have adjacent bases
(1-level indirection). If a series of equivalence classes form ladder
formed of 1-step/level indirections, they are all merged into a single
equivalence class. This provides more opportunities for the load-store
vectorizer to generate better vectors.

---------

Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
  • Loading branch information
v-klochkov authored Jan 8, 2025
1 parent c438758 commit 9184c42
Show file tree
Hide file tree
Showing 2 changed files with 299 additions and 0 deletions.
119 changes: 119 additions & 0 deletions llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,6 +324,11 @@ class Vectorizer {
Instruction *ChainElem, Instruction *ChainBegin,
const DenseMap<Instruction *, APInt /*OffsetFromLeader*/> &ChainOffsets);

/// Merges the equivalence classes if they have underlying objects that differ
/// by one level of indirection (i.e., one is a getelementptr and the other is
/// the base pointer in that getelementptr).
void mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const;

/// Collects loads and stores grouped by "equivalence class", where:
/// - all elements in an eq class are a load or all are a store,
/// - they all load/store the same element size (it's OK to have e.g. i8 and
Expand Down Expand Up @@ -1305,6 +1310,119 @@ std::optional<APInt> Vectorizer::getConstantOffsetSelects(
return std::nullopt;
}

void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const {
if (EQClasses.size() < 2) // There is nothing to merge.
return;

// The reduced key has all elements of the ECClassKey except the underlying
// object. Check that EqClassKey has 4 elements and define the reduced key.
static_assert(std::tuple_size_v<EqClassKey> == 4,
"EqClassKey has changed - EqClassReducedKey needs changes too");
using EqClassReducedKey =
std::tuple<std::tuple_element_t<1, EqClassKey> /* AddrSpace */,
std::tuple_element_t<2, EqClassKey> /* Element size */,
std::tuple_element_t<3, EqClassKey> /* IsLoad; */>;
using ECReducedKeyToUnderlyingObjectMap =
MapVector<EqClassReducedKey,
SmallPtrSet<std::tuple_element_t<0, EqClassKey>, 4>>;

// Form a map from the reduced key (without the underlying object) to the
// underlying objects: 1 reduced key to many underlying objects, to form
// groups of potentially merge-able equivalence classes.
ECReducedKeyToUnderlyingObjectMap RedKeyToUOMap;
bool FoundPotentiallyOptimizableEC = false;
for (const auto &EC : EQClasses) {
const auto &Key = EC.first;
EqClassReducedKey RedKey{std::get<1>(Key), std::get<2>(Key),
std::get<3>(Key)};
RedKeyToUOMap[RedKey].insert(std::get<0>(Key));
if (RedKeyToUOMap[RedKey].size() > 1)
FoundPotentiallyOptimizableEC = true;
}
if (!FoundPotentiallyOptimizableEC)
return;

LLVM_DEBUG({
dbgs() << "LSV: mergeEquivalenceClasses: before merging:\n";
for (const auto &EC : EQClasses) {
dbgs() << " Key: {" << EC.first << "}\n";
for (const auto &Inst : EC.second)
dbgs() << " Inst: " << *Inst << '\n';
}
});
LLVM_DEBUG({
dbgs() << "LSV: mergeEquivalenceClasses: RedKeyToUOMap:\n";
for (const auto &RedKeyToUO : RedKeyToUOMap) {
dbgs() << " Reduced key: {" << std::get<0>(RedKeyToUO.first) << ", "
<< std::get<1>(RedKeyToUO.first) << ", "
<< static_cast<int>(std::get<2>(RedKeyToUO.first)) << "} --> "
<< RedKeyToUO.second.size() << " underlying objects:\n";
for (auto UObject : RedKeyToUO.second)
dbgs() << " " << *UObject << '\n';
}
});

using UObjectToUObjectMap = DenseMap<const Value *, const Value *>;

// Compute the ultimate targets for a set of underlying objects.
auto GetUltimateTargets =
[](SmallPtrSetImpl<const Value *> &UObjects) -> UObjectToUObjectMap {
UObjectToUObjectMap IndirectionMap;
for (const auto *UObject : UObjects) {
const unsigned MaxLookupDepth = 1; // look for 1-level indirections only
const auto *UltimateTarget = getUnderlyingObject(UObject, MaxLookupDepth);
if (UltimateTarget != UObject)
IndirectionMap[UObject] = UltimateTarget;
}
UObjectToUObjectMap UltimateTargetsMap;
for (const auto *UObject : UObjects) {
auto Target = UObject;
auto It = IndirectionMap.find(Target);
for (; It != IndirectionMap.end(); It = IndirectionMap.find(Target))
Target = It->second;
UltimateTargetsMap[UObject] = Target;
}
return UltimateTargetsMap;
};

// For each item in RedKeyToUOMap, if it has more than one underlying object,
// try to merge the equivalence classes.
for (auto &[RedKey, UObjects] : RedKeyToUOMap) {
if (UObjects.size() < 2)
continue;
auto UTMap = GetUltimateTargets(UObjects);
for (const auto &[UObject, UltimateTarget] : UTMap) {
if (UObject == UltimateTarget)
continue;

EqClassKey KeyFrom{UObject, std::get<0>(RedKey), std::get<1>(RedKey),
std::get<2>(RedKey)};
EqClassKey KeyTo{UltimateTarget, std::get<0>(RedKey), std::get<1>(RedKey),
std::get<2>(RedKey)};
// The entry for KeyFrom is guarantted to exist, unlike KeyTo. Thus,
// request the reference to the instructions vector for KeyTo first.
const auto &VecTo = EQClasses[KeyTo];
const auto &VecFrom = EQClasses[KeyFrom];
SmallVector<Instruction *, 8> MergedVec;
std::merge(VecFrom.begin(), VecFrom.end(), VecTo.begin(), VecTo.end(),
std::back_inserter(MergedVec),
[](Instruction *A, Instruction *B) {
return A && B && A->comesBefore(B);
});
EQClasses[KeyTo] = std::move(MergedVec);
EQClasses.erase(KeyFrom);
}
}
LLVM_DEBUG({
dbgs() << "LSV: mergeEquivalenceClasses: after merging:\n";
for (const auto &EC : EQClasses) {
dbgs() << " Key: {" << EC.first << "}\n";
for (const auto &Inst : EC.second)
dbgs() << " Inst: " << *Inst << '\n';
}
});
}

EquivalenceClassMap
Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
BasicBlock::iterator End) {
Expand Down Expand Up @@ -1377,6 +1495,7 @@ Vectorizer::collectEquivalenceClasses(BasicBlock::iterator Begin,
.emplace_back(&I);
}

mergeEquivalenceClasses(Ret);
return Ret;
}

Expand Down
180 changes: 180 additions & 0 deletions llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,180 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o - | FileCheck %s

; This test verifies that the vectorizer can handle an extended sequence of
; getelementptr instructions and generate longer vectors. With special handling,
; some elements can still be vectorized even if they require looking up the
; common underlying object deeper than 6 levels from the original pointer.

; The test below is the simplified version of actual performance oriented
; workload; the offsets in getelementptr instructions are similar or same for
; the test simplicity.

define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) {
; CHECK-LABEL: define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(
; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
; CHECK-NEXT: [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
; CHECK-NEXT: [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
; CHECK-NEXT: [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
; CHECK-NEXT: [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
; CHECK-NEXT: [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
; CHECK-NEXT: store <8 x half> zeroinitializer, ptr [[A6]], align 16
; CHECK-NEXT: ret void
;

%level1 = getelementptr i8, ptr %arg1, i32 917504
%level2 = getelementptr i8, ptr %level1, i32 %arg0
%level3 = getelementptr i8, ptr %level2, i32 32768
%level4 = getelementptr i8, ptr %level3, i32 %arg0
%level5 = getelementptr i8, ptr %level4, i32 %arg0

%a6 = getelementptr i8, ptr %level5, i32 %arg0
%b7 = getelementptr i8, ptr %a6, i32 2
%c8 = getelementptr i8, ptr %b7, i32 8
%d8 = getelementptr i8, ptr %b7, i32 12

store half 0xH0000, ptr %a6, align 16
store <4 x half> zeroinitializer, ptr %b7, align 2
store <2 x half> zeroinitializer, ptr %c8, align 2
store half 0xH0000, ptr %d8, align 2
ret void
}

define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) {
; CHECK-LABEL: define void @v1x8_levels_6_7_8_9_10_11_12_13(
; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504
; CHECK-NEXT: [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]]
; CHECK-NEXT: [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768
; CHECK-NEXT: [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]]
; CHECK-NEXT: [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]]
; CHECK-NEXT: [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]]
; CHECK-NEXT: store <8 x half> zeroinitializer, ptr [[A6]], align 16
; CHECK-NEXT: ret void
;

%level1 = getelementptr i8, ptr %arg1, i32 917504
%level2 = getelementptr i8, ptr %level1, i32 %arg0
%level3 = getelementptr i8, ptr %level2, i32 32768
%level4 = getelementptr i8, ptr %level3, i32 %arg0
%level5 = getelementptr i8, ptr %level4, i32 %arg0

%a6 = getelementptr i8, ptr %level5, i32 %arg0
%b7 = getelementptr i8, ptr %a6, i32 2
%c8 = getelementptr i8, ptr %b7, i32 2
%d9 = getelementptr i8, ptr %c8, i32 2
%e10 = getelementptr i8, ptr %d9, i32 2
%f11 = getelementptr i8, ptr %e10, i32 2
%g12 = getelementptr i8, ptr %f11, i32 2
%h13 = getelementptr i8, ptr %g12, i32 2

store half 0xH0000, ptr %a6, align 16
store half 0xH0000, ptr %b7, align 2
store half 0xH0000, ptr %c8, align 2
store half 0xH0000, ptr %d9, align 2
store half 0xH0000, ptr %e10, align 8
store half 0xH0000, ptr %f11, align 2
store half 0xH0000, ptr %g12, align 2
store half 0xH0000, ptr %h13, align 2
ret void
}

define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(i32 %arg0, ptr addrspace(3) align 16 %arg1_ptr, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, half %arg6_half, half %arg7_half, <2 x half> %arg8_2xhalf) {
; CHECK-LABEL: define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(
; CHECK-SAME: i32 [[ARG0:%.*]], ptr addrspace(3) align 16 [[ARG1_PTR:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]], i32 [[ARG4:%.*]], i32 [[ARG5:%.*]], half [[ARG6_HALF:%.*]], half [[ARG7_HALF:%.*]], <2 x half> [[ARG8_2XHALF:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[ARG1_PTR]], i32 458752
; CHECK-NEXT: br [[DOTPREHEADER11_PREHEADER:label %.*]]
; CHECK: [[_PREHEADER11_PREHEADER:.*:]]
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[ARG0]], 6
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP1]], i32 [[TMP2]]
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[ARG2]]
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[ARG3]]
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ARG0]], 2
; CHECK-NEXT: br i1 [[CMP]], [[DOTLR_PH:label %.*]], [[DOTEXIT_POINT:label %.*]]
; CHECK: [[_LR_PH:.*:]]
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP5]], i32 [[ARG4]]
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[GEP]], i32 [[ARG5]]
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> poison, half [[ARG6_HALF]], i32 0
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half 0xH0000, i32 1
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half 0xH0000, i32 2
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half 0xH0000, i32 3
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half 0xH0000, i32 4
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 1
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x half> [[TMP15]], half [[ARG7_HALF]], i32 7
; CHECK-NEXT: store <8 x half> [[TMP16]], ptr addrspace(3) [[TMP6]], align 2
; CHECK-NEXT: br [[DOTEXIT_POINT]]
; CHECK: [[_EXIT_POINT:.*:]]
; CHECK-NEXT: ret void
;
%base1 = getelementptr inbounds i8, ptr addrspace(3) %arg1_ptr, i32 458752
br label %.preheader11.preheader

.preheader11.preheader:
%base2 = shl nuw nsw i32 %arg0, 6
%base3 = getelementptr inbounds i8, ptr addrspace(3) %base1, i32 %base2

%base4 = getelementptr inbounds i8, ptr addrspace(3) %base3, i32 %arg2
%base5 = getelementptr inbounds i8, ptr addrspace(3) %base4, i32 %arg3

%cmp = icmp sgt i32 %arg0, 2
br i1 %cmp, label %.lr.ph, label %.exit_point

.lr.ph:
%gep = getelementptr inbounds i8, ptr addrspace(3) %base5, i32 %arg4

%dst = getelementptr inbounds i8, ptr addrspace(3) %gep, i32 %arg5
%dst_off2 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 2
%dst_off10 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 10
%dst_off14 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 14

store half %arg6_half, ptr addrspace(3) %dst, align 2
store <4 x half> zeroinitializer, ptr addrspace(3) %dst_off2, align 2
store <2 x half> %arg8_2xhalf, ptr addrspace(3) %dst_off10, align 2
store half %arg7_half, ptr addrspace(3) %dst_off14, align 2
br label %.exit_point

.exit_point:
ret void
}

; The regression test for merging equivalence classes. It is reduced and adapted
; for LSV from llvm/test/CodeGen/NVPTX/variadics-backend.ll, which failed at
; post-commit checks with memory sanitizer on the initial attempt to implement
; the merging of the equivalence classes.
define void @variadics1(ptr %vlist) {
; CHECK-LABEL: define void @variadics1(
; CHECK-SAME: ptr [[VLIST:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: [[ARGP_CUR7_ALIGNED2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[VLIST]], i64 0)
; CHECK-NEXT: [[ARGP_NEXT8:%.*]] = getelementptr i8, ptr [[ARGP_CUR7_ALIGNED2]], i64 8
; CHECK-NEXT: [[X0:%.*]] = getelementptr i8, ptr [[ARGP_NEXT8]], i32 7
; CHECK-NEXT: [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X0]], i64 0)
; CHECK-NEXT: [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8
; CHECK-NEXT: [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7
; CHECK-NEXT: [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0)
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296
; CHECK-NEXT: [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT: [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT: [[X5:%.*]] = fadd double [[X42]], [[X31]]
; CHECK-NEXT: store double [[X5]], ptr null, align 8
; CHECK-NEXT: ret void
;
%argp.cur7.aligned2 = call ptr @llvm.ptrmask.p0.i64(ptr %vlist, i64 0)
%argp.next8 = getelementptr i8, ptr %argp.cur7.aligned2, i64 8
%x0 = getelementptr i8, ptr %argp.next8, i32 7
%argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x0, i64 0)
%argp.next12 = getelementptr i8, ptr %argp.cur11.aligned, i64 8
%x2 = getelementptr i8, ptr %argp.next12, i32 7
%argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x2, i64 0)
%x3 = load double, ptr %argp.cur16.aligned, align 8
%argp.cur16.aligned_off8 = getelementptr i8, ptr %argp.cur16.aligned, i32 8
%x4 = load double, ptr %argp.cur16.aligned_off8, align 8
%x5 = fadd double %x4, %x3
store double %x5, ptr null, align 8
ret void
}

declare ptr @llvm.ptrmask.p0.i64(ptr, i64)

0 comments on commit 9184c42

Please sign in to comment.