-
Notifications
You must be signed in to change notification settings - Fork 12.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[LoadStoreVectorizer] Postprocess and merge equivalence classes (#121861)
This patch introduces a new method: void Vectorizer::mergeEquivalenceClasses(EquivalenceClassMap &EQClasses) const; The method is called at the end of Vectorizer::collectEquivalenceClasses() and is needed to merge equivalence classes that differ only by their underlying objects (UO1 and UO2), where UO1 is the 1-level-indirection underlying base of UO2. This situation arises because of the limited lookup depth used when searching for underlying bases with llvm::getUnderlyingObject(ptr). Using any fixed lookup depth can result in the creation of multiple equivalence classes that differ only by 1-level-indirection bases. The new approach merges equivalence classes if they have adjacent bases (1 level of indirection apart). If a series of equivalence classes forms a ladder of 1-level indirection steps, they are all merged into a single equivalence class. This provides more opportunities for the load-store vectorizer to generate better vectors. --------- Signed-off-by: Klochkov, Vyacheslav N <[email protected]>
- Loading branch information
1 parent
c438758
commit 9184c42
Showing
2 changed files
with
299 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
180 changes: 180 additions & 0 deletions
180
llvm/test/Transforms/LoadStoreVectorizer/X86/massive_indirection.ll
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: opt %s -mtriple=x86_64-unknown-linux-gnu -passes=load-store-vectorizer -mcpu=skx -S -o - | FileCheck %s | ||
|
||
; This test verifies that the vectorizer can handle an extended sequence of | ||
; getelementptr instructions and generate longer vectors. With special handling, | ||
; some elements can still be vectorized even if they require looking up the | ||
; common underlying object deeper than 6 levels from the original pointer. | ||
|
||
; The test below is a simplified version of an actual performance-oriented | ||
; workload; the offsets in the getelementptr instructions are similar or | ||
; identical to keep the test simple. | ||
|
||
; Four zero stores (half, <4 x half>, <2 x half>, half) covering the 16 | ||
; contiguous bytes that start at %a6. The store pointers sit 6, 7, 8 and 8 | ||
; getelementptr steps away from %arg1. The CHECK lines expect the four stores | ||
; to be replaced by a single <8 x half> store through %a6. | ||
define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8(i32 %arg0, ptr align 16 %arg1) { | ||
; CHECK-LABEL: define void @v1_v2_v4_v1_to_v8_levels_6_7_8_8( | ||
; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0:[0-9]+]] { | ||
; CHECK-NEXT: [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504 | ||
; CHECK-NEXT: [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]] | ||
; CHECK-NEXT: [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768 | ||
; CHECK-NEXT: [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]] | ||
; CHECK-NEXT: [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]] | ||
; CHECK-NEXT: [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]] | ||
; CHECK-NEXT: store <8 x half> zeroinitializer, ptr [[A6]], align 16 | ||
; CHECK-NEXT: ret void | ||
; | ||
|
||
; Five chained byte-offset GEPs (constant and %arg0-based) between %arg1 and | ||
; the store pointers below. | ||
%level1 = getelementptr i8, ptr %arg1, i32 917504 | ||
%level2 = getelementptr i8, ptr %level1, i32 %arg0 | ||
%level3 = getelementptr i8, ptr %level2, i32 32768 | ||
%level4 = getelementptr i8, ptr %level3, i32 %arg0 | ||
%level5 = getelementptr i8, ptr %level4, i32 %arg0 | ||
|
||
; Store pointers: %a6 is the 6th GEP; %b7 = %a6 + 2; %c8 = %b7 + 8 and | ||
; %d8 = %b7 + 12, i.e. byte offsets 0, 2, 10 and 14 relative to %a6. | ||
%a6 = getelementptr i8, ptr %level5, i32 %arg0 | ||
%b7 = getelementptr i8, ptr %a6, i32 2 | ||
%c8 = getelementptr i8, ptr %b7, i32 8 | ||
%d8 = getelementptr i8, ptr %b7, i32 12 | ||
|
||
; 2 + 8 + 4 + 2 bytes of zeros, laid out back to back from %a6. | ||
store half 0xH0000, ptr %a6, align 16 | ||
store <4 x half> zeroinitializer, ptr %b7, align 2 | ||
store <2 x half> zeroinitializer, ptr %c8, align 2 | ||
store half 0xH0000, ptr %d8, align 2 | ||
ret void | ||
} | ||
|
||
; Eight scalar half stores of zero at byte offsets 0, 2, ..., 14 from %a6. | ||
; Each store pointer is derived from the previous one, so the pointers sit | ||
; 6 through 13 getelementptr steps away from %arg1. The CHECK lines expect a | ||
; single <8 x half> store through %a6. | ||
define void @v1x8_levels_6_7_8_9_10_11_12_13(i32 %arg0, ptr align 16 %arg1) { | ||
; CHECK-LABEL: define void @v1x8_levels_6_7_8_9_10_11_12_13( | ||
; CHECK-SAME: i32 [[ARG0:%.*]], ptr align 16 [[ARG1:%.*]]) #[[ATTR0]] { | ||
; CHECK-NEXT: [[LEVEL1:%.*]] = getelementptr i8, ptr [[ARG1]], i32 917504 | ||
; CHECK-NEXT: [[LEVEL2:%.*]] = getelementptr i8, ptr [[LEVEL1]], i32 [[ARG0]] | ||
; CHECK-NEXT: [[LEVEL3:%.*]] = getelementptr i8, ptr [[LEVEL2]], i32 32768 | ||
; CHECK-NEXT: [[LEVEL4:%.*]] = getelementptr i8, ptr [[LEVEL3]], i32 [[ARG0]] | ||
; CHECK-NEXT: [[LEVEL5:%.*]] = getelementptr i8, ptr [[LEVEL4]], i32 [[ARG0]] | ||
; CHECK-NEXT: [[A6:%.*]] = getelementptr i8, ptr [[LEVEL5]], i32 [[ARG0]] | ||
; CHECK-NEXT: store <8 x half> zeroinitializer, ptr [[A6]], align 16 | ||
; CHECK-NEXT: ret void | ||
; | ||
|
||
; Same five-step GEP prefix as in the previous test function. | ||
%level1 = getelementptr i8, ptr %arg1, i32 917504 | ||
%level2 = getelementptr i8, ptr %level1, i32 %arg0 | ||
%level3 = getelementptr i8, ptr %level2, i32 32768 | ||
%level4 = getelementptr i8, ptr %level3, i32 %arg0 | ||
%level5 = getelementptr i8, ptr %level4, i32 %arg0 | ||
|
||
; A ladder of GEPs: every pointer after %a6 is the previous pointer plus | ||
; 2 bytes, so each one is one indirection level deeper than the last. | ||
%a6 = getelementptr i8, ptr %level5, i32 %arg0 | ||
%b7 = getelementptr i8, ptr %a6, i32 2 | ||
%c8 = getelementptr i8, ptr %b7, i32 2 | ||
%d9 = getelementptr i8, ptr %c8, i32 2 | ||
%e10 = getelementptr i8, ptr %d9, i32 2 | ||
%f11 = getelementptr i8, ptr %e10, i32 2 | ||
%g12 = getelementptr i8, ptr %f11, i32 2 | ||
%h13 = getelementptr i8, ptr %g12, i32 2 | ||
|
||
; One half store per pointer; %a6 carries align 16 and %e10 (= %a6 + 8) | ||
; carries align 8, the rest use align 2. | ||
store half 0xH0000, ptr %a6, align 16 | ||
store half 0xH0000, ptr %b7, align 2 | ||
store half 0xH0000, ptr %c8, align 2 | ||
store half 0xH0000, ptr %d9, align 2 | ||
store half 0xH0000, ptr %e10, align 8 | ||
store half 0xH0000, ptr %f11, align 2 | ||
store half 0xH0000, ptr %g12, align 2 | ||
store half 0xH0000, ptr %h13, align 2 | ||
ret void | ||
} | ||
|
||
; Multi-block variant on an addrspace(3) pointer with non-constant stored | ||
; values. The GEP chain from %arg1_ptr is spread over three basic blocks; the | ||
; four stores in %.lr.ph cover 16 contiguous bytes starting at %dst. The CHECK | ||
; lines expect the stored values to be assembled with insertelement / | ||
; extractelement into one <8 x half> that is written with a single store. | ||
define void @v1_4_4_4_2_1_to_v8_8_levels_6_7(i32 %arg0, ptr addrspace(3) align 16 %arg1_ptr, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, half %arg6_half, half %arg7_half, <2 x half> %arg8_2xhalf) { | ||
; CHECK-LABEL: define void @v1_4_4_4_2_1_to_v8_8_levels_6_7( | ||
; CHECK-SAME: i32 [[ARG0:%.*]], ptr addrspace(3) align 16 [[ARG1_PTR:%.*]], i32 [[ARG2:%.*]], i32 [[ARG3:%.*]], i32 [[ARG4:%.*]], i32 [[ARG5:%.*]], half [[ARG6_HALF:%.*]], half [[ARG7_HALF:%.*]], <2 x half> [[ARG8_2XHALF:%.*]]) #[[ATTR0]] { | ||
; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[ARG1_PTR]], i32 458752 | ||
; CHECK-NEXT: br [[DOTPREHEADER11_PREHEADER:label %.*]] | ||
; CHECK: [[_PREHEADER11_PREHEADER:.*:]] | ||
; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[ARG0]], 6 | ||
; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP1]], i32 [[TMP2]] | ||
; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[ARG2]] | ||
; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[ARG3]] | ||
; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[ARG0]], 2 | ||
; CHECK-NEXT: br i1 [[CMP]], [[DOTLR_PH:label %.*]], [[DOTEXIT_POINT:label %.*]] | ||
; CHECK: [[_LR_PH:.*:]] | ||
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP5]], i32 [[ARG4]] | ||
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[GEP]], i32 [[ARG5]] | ||
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x half> poison, half [[ARG6_HALF]], i32 0 | ||
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x half> [[TMP7]], half 0xH0000, i32 1 | ||
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x half> [[TMP8]], half 0xH0000, i32 2 | ||
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x half> [[TMP9]], half 0xH0000, i32 3 | ||
; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x half> [[TMP10]], half 0xH0000, i32 4 | ||
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 0 | ||
; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x half> [[TMP11]], half [[TMP12]], i32 5 | ||
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x half> [[ARG8_2XHALF]], i32 1 | ||
; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x half> [[TMP13]], half [[TMP14]], i32 6 | ||
; CHECK-NEXT: [[TMP16:%.*]] = insertelement <8 x half> [[TMP15]], half [[ARG7_HALF]], i32 7 | ||
; CHECK-NEXT: store <8 x half> [[TMP16]], ptr addrspace(3) [[TMP6]], align 2 | ||
; CHECK-NEXT: br [[DOTEXIT_POINT]] | ||
; CHECK: [[_EXIT_POINT:.*:]] | ||
; CHECK-NEXT: ret void | ||
; | ||
; Entry block: first GEP (constant byte offset) off the incoming pointer. | ||
%base1 = getelementptr inbounds i8, ptr addrspace(3) %arg1_ptr, i32 458752 | ||
br label %.preheader11.preheader | ||
|
||
.preheader11.preheader: | ||
; %base2 is an integer offset (%arg0 << 6), not a pointer; it feeds %base3. | ||
%base2 = shl nuw nsw i32 %arg0, 6 | ||
%base3 = getelementptr inbounds i8, ptr addrspace(3) %base1, i32 %base2 | ||
|
||
%base4 = getelementptr inbounds i8, ptr addrspace(3) %base3, i32 %arg2 | ||
%base5 = getelementptr inbounds i8, ptr addrspace(3) %base4, i32 %arg3 | ||
|
||
; The stores are only executed when %arg0 > 2 (signed). | ||
%cmp = icmp sgt i32 %arg0, 2 | ||
br i1 %cmp, label %.lr.ph, label %.exit_point | ||
|
||
.lr.ph: | ||
%gep = getelementptr inbounds i8, ptr addrspace(3) %base5, i32 %arg4 | ||
|
||
; %dst is the 6th GEP from %arg1_ptr; the three offset pointers are each one | ||
; GEP further, at byte offsets 2, 10 and 14 from %dst. | ||
%dst = getelementptr inbounds i8, ptr addrspace(3) %gep, i32 %arg5 | ||
%dst_off2 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 2 | ||
%dst_off10 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 10 | ||
%dst_off14 = getelementptr inbounds i8, ptr addrspace(3) %dst, i32 14 | ||
|
||
; 2 + 8 + 4 + 2 bytes written back to back from %dst. | ||
store half %arg6_half, ptr addrspace(3) %dst, align 2 | ||
store <4 x half> zeroinitializer, ptr addrspace(3) %dst_off2, align 2 | ||
store <2 x half> %arg8_2xhalf, ptr addrspace(3) %dst_off10, align 2 | ||
store half %arg7_half, ptr addrspace(3) %dst_off14, align 2 | ||
br label %.exit_point | ||
|
||
.exit_point: | ||
ret void | ||
} | ||
|
||
; A regression test for merging equivalence classes. It is reduced and adapted | ||
; for LSV from llvm/test/CodeGen/NVPTX/variadics-backend.ll, which failed in | ||
; post-commit checks under MemorySanitizer on the initial attempt to implement | ||
; the merging of equivalence classes. | ||
; Two adjacent double loads whose address is reached through an alternating | ||
; chain of llvm.ptrmask calls and getelementptr instructions starting at | ||
; %vlist. The CHECK lines expect the two loads to be combined into one | ||
; <2 x double> load followed by two extractelement instructions. | ||
define void @variadics1(ptr %vlist) { | ||
; CHECK-LABEL: define void @variadics1( | ||
; CHECK-SAME: ptr [[VLIST:%.*]]) #[[ATTR0]] { | ||
; CHECK-NEXT: [[ARGP_CUR7_ALIGNED2:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[VLIST]], i64 0) | ||
; CHECK-NEXT: [[ARGP_NEXT8:%.*]] = getelementptr i8, ptr [[ARGP_CUR7_ALIGNED2]], i64 8 | ||
; CHECK-NEXT: [[X0:%.*]] = getelementptr i8, ptr [[ARGP_NEXT8]], i32 7 | ||
; CHECK-NEXT: [[ARGP_CUR11_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X0]], i64 0) | ||
; CHECK-NEXT: [[ARGP_NEXT12:%.*]] = getelementptr i8, ptr [[ARGP_CUR11_ALIGNED]], i64 8 | ||
; CHECK-NEXT: [[X2:%.*]] = getelementptr i8, ptr [[ARGP_NEXT12]], i32 7 | ||
; CHECK-NEXT: [[ARGP_CUR16_ALIGNED:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[X2]], i64 0) | ||
; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[ARGP_CUR16_ALIGNED]], align 4294967296 | ||
; CHECK-NEXT: [[X31:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 | ||
; CHECK-NEXT: [[X42:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 | ||
; CHECK-NEXT: [[X5:%.*]] = fadd double [[X42]], [[X31]] | ||
; CHECK-NEXT: store double [[X5]], ptr null, align 8 | ||
; CHECK-NEXT: ret void | ||
; | ||
; Three rounds of "ptrmask, then advance": each ptrmask call uses mask 0, and | ||
; the first two rounds are followed by GEPs of +8 and +7 bytes. | ||
%argp.cur7.aligned2 = call ptr @llvm.ptrmask.p0.i64(ptr %vlist, i64 0) | ||
%argp.next8 = getelementptr i8, ptr %argp.cur7.aligned2, i64 8 | ||
%x0 = getelementptr i8, ptr %argp.next8, i32 7 | ||
%argp.cur11.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x0, i64 0) | ||
%argp.next12 = getelementptr i8, ptr %argp.cur11.aligned, i64 8 | ||
%x2 = getelementptr i8, ptr %argp.next12, i32 7 | ||
%argp.cur16.aligned = call ptr @llvm.ptrmask.p0.i64(ptr %x2, i64 0) | ||
; The two loads read 8 bytes each at offsets 0 and 8 from the last ptrmask | ||
; result. | ||
%x3 = load double, ptr %argp.cur16.aligned, align 8 | ||
%argp.cur16.aligned_off8 = getelementptr i8, ptr %argp.cur16.aligned, i32 8 | ||
%x4 = load double, ptr %argp.cur16.aligned_off8, align 8 | ||
; Both loaded values are consumed (summed and stored through a null pointer). | ||
%x5 = fadd double %x4, %x3 | ||
store double %x5, ptr null, align 8 | ||
ret void | ||
} | ||
|
||
declare ptr @llvm.ptrmask.p0.i64(ptr, i64) |