diff --git a/README.txt b/README.txt
index b9b71a3b6daf..ebbd50fc0b71 100644
--- a/README.txt
+++ b/README.txt
@@ -15,3 +15,4 @@ documentation setup.
 
 If you are writing a package for LLVM, see docs/Packaging.rst for our
 suggestions.
+
diff --git a/docs/FAQ.rst b/docs/FAQ.rst
index 2c69abfdd0bc..1afba7557bd7 100644
--- a/docs/FAQ.rst
+++ b/docs/FAQ.rst
@@ -9,17 +9,10 @@ Frequently Asked Questions (FAQ)
 License
 =======
 
-Does the University of Illinois Open Source License really qualify as an "open source" license?
-------------------------------------------------------------------------------------------------
-Yes, the license is `certified
-`_ by the Open Source
-Initiative (OSI).
-
-
 Can I modify LLVM source code and redistribute the modified source?
 --------------------------------------------------------------------
 Yes. The modified source distribution must retain the copyright notice and
-follow the three bulleted conditions listed in the `LLVM license
+follow the conditions listed in the `LLVM license
 `_.
 
 
@@ -41,10 +34,12 @@ the STL.
 
 How portable is the LLVM source code?
 -------------------------------------
 The LLVM source code should be portable to most modern Unix-like operating
-systems. Most of the code is written in standard C++ with operating system
+systems. LLVM also has excellent support on Windows systems.
+Most of the code is written in standard C++ with operating system
 services abstracted to a support library. The tools required to build and
 test LLVM have been ported to a plethora of platforms.
 
+
 What API do I use to store a value to one of the virtual registers in LLVM IR's SSA representation?
 ---------------------------------------------------------------------------------------------------
diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h
index 94e76a75e8da..069d0aa45095 100644
--- a/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/include/llvm/CodeGen/MIRYamlMapping.h
@@ -314,6 +314,7 @@ struct ScalarEnumerationTraits {
   static void enumeration(yaml::IO &IO, TargetStackID::Value &ID) {
     IO.enumCase(ID, "default", TargetStackID::Default);
     IO.enumCase(ID, "sgpr-spill", TargetStackID::SGPRSpill);
+    IO.enumCase(ID, "sve-vec", TargetStackID::SVEVector);
     IO.enumCase(ID, "noalloc", TargetStackID::NoAlloc);
   }
 };
diff --git a/include/llvm/CodeGen/TargetFrameLowering.h b/include/llvm/CodeGen/TargetFrameLowering.h
index 284f7ba64dba..6e4a723b426f 100644
--- a/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/include/llvm/CodeGen/TargetFrameLowering.h
@@ -28,6 +28,7 @@ namespace TargetStackID {
 enum Value {
   Default = 0,
   SGPRSpill = 1,
+  SVEVector = 2,
   NoAlloc = 255
 };
 }
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index fa053fa70291..c69815066249 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -1649,7 +1649,7 @@ class IRBuilder : public IRBuilderBase, public Inserter {
   StoreInst *CreateAlignedStore(Value *Val, Value *Ptr, unsigned Align,
                                 bool isVolatile = false) {
     StoreInst *SI = CreateStore(Val, Ptr, isVolatile);
-    SI->setAlignment(Align);
+    SI->setAlignment(MaybeAlign(Align));
     return SI;
   }
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index 71cf9fc38d83..eaaf50646462 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -374,8 +374,6 @@ class StoreInst : public Instruction {
     return 0;
   }
 
-  // FIXME: Remove once migration to Align is over.
- void setAlignment(unsigned Align); void setAlignment(MaybeAlign Align); /// Returns the ordering constraint of this store instruction. diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td index 896170b4be57..ab6ee7f92dd1 100644 --- a/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/include/llvm/IR/IntrinsicsAMDGPU.td @@ -899,7 +899,10 @@ class AMDGPURawBufferLoad : Intrinsic < [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<3>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad; @@ -911,7 +914,10 @@ class AMDGPUStructBufferLoad : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<4>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad; @@ -923,7 +929,10 @@ class AMDGPURawBufferStore : Intrinsic < llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<4>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore; @@ -936,7 +945,10 @@ class AMDGPUStructBufferStore : Intrinsic < llvm_i32_ty, // vindex(VGPR) llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore; @@ -1050,7 +1062,10 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<3>, ImmArg<4>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1061,7 +1076,10 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds 
checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1072,7 +1090,10 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrReadMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1084,7 +1105,10 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic < llvm_i32_ty, // offset(VGPR/imm, included in bounds checking and swizzling) llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+) + llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, + // bit 1 = slc, + // bit 2 = dlc on gfx10+), + // swizzled buffer (bit 3 = swz)) [IntrWriteMem, ImmArg<5>, ImmArg<6>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; diff --git a/include/llvm/ObjectYAML/ELFYAML.h b/include/llvm/ObjectYAML/ELFYAML.h index 1662d06bf91e..ef2b4fba031d 100644 --- a/include/llvm/ObjectYAML/ELFYAML.h +++ b/include/llvm/ObjectYAML/ELFYAML.h @@ -137,7 +137,8 @@ struct Section { StackSizes, SymtabShndxSection, Symver, - MipsABIFlags + MipsABIFlags, + Addrsig }; SectionKind Kind; StringRef Name; @@ -256,6 +257,26 @@ struct VerneedSection : Section { } }; +struct AddrsigSymbol { + AddrsigSymbol(StringRef N) : Name(N), Index(None) {} + AddrsigSymbol(llvm::yaml::Hex32 Ndx) : Name(None), Index(Ndx) {} + AddrsigSymbol() : Name(None), Index(None) {} + + Optional Name; + Optional Index; +}; + +struct AddrsigSection : Section { + Optional Content; + Optional Size; + Optional> Symbols; + + AddrsigSection() : Section(SectionKind::Addrsig) {} + static bool classof(const Section *S) { + return S->Kind == SectionKind::Addrsig; + } +}; + struct SymverSection : Section { std::vector Entries; @@ -362,6 +383,7 @@ struct Object { } // end namespace ELFYAML } // end namespace llvm +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::AddrsigSymbol) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::StackSizeEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::DynamicEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::ProgramHeader) @@ -518,6 +540,10 @@ template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::VernauxEntry &E); }; +template <> struct MappingTraits { + static void mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym); +}; + template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::Relocation &Rel); }; diff --git a/include/llvm/Target/GlobalISel/Combine.td b/include/llvm/Target/GlobalISel/Combine.td new file mode 100644 index 000000000000..065e28eca8a6 --- /dev/null +++ b/include/llvm/Target/GlobalISel/Combine.td @@ -0,0 +1,17 @@ +//===- Combine.td - Combine 
rule definitions ---------------*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declare GlobalISel combine rules and provide mechanisms to opt-out. +// +//===----------------------------------------------------------------------===// + +// Declares a combiner helper class +class GICombinerHelper { + // The class name to use in the generated output. + string Classname = classname; +} diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h index b722c47c1cab..88c2ef787ad8 100644 --- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -181,6 +181,7 @@ class LibCallSimplifier { Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B); Value *optimizeRealloc(CallInst *CI, IRBuilder<> &B); Value *optimizeWcslen(CallInst *CI, IRBuilder<> &B); + Value *optimizeBCopy(CallInst *CI, IRBuilder<> &B); // Wrapper for all String/Memory Library Call Optimizations Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B); diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp index 7d6429a0fec1..129944743c5e 100644 --- a/lib/Analysis/AssumptionCache.cpp +++ b/lib/Analysis/AssumptionCache.cpp @@ -130,7 +130,10 @@ void AssumptionCache::unregisterAssumption(CallInst *CI) { if (AVI != AffectedValues.end()) AffectedValues.erase(AVI); } - remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; }); + + AssumeHandles.erase( + remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; }), + AssumeHandles.end()); } void AssumptionCache::AffectedValueCallbackVH::deleted() { diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp index 6018968c199d..d103c3a8b831 100644 --- a/lib/Analysis/MemorySSAUpdater.cpp +++ b/lib/Analysis/MemorySSAUpdater.cpp @@ -347,51 +347,54 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) { // If this is the first def in the block and this insert is in an arbitrary // place, compute IDF and place phis. + SmallPtrSet DefiningBlocks; + + // If this is the last Def in the block, also compute IDF based on MD, since + // this may a new Def added, and we may need additional Phis. auto Iter = MD->getDefsIterator(); ++Iter; auto IterEnd = MSSA->getBlockDefs(MD->getBlock())->end(); - if (Iter == IterEnd) { - SmallPtrSet DefiningBlocks; + if (Iter == IterEnd) DefiningBlocks.insert(MD->getBlock()); - for (const auto &VH : InsertedPHIs) - if (const auto *RealPHI = cast_or_null(VH)) - DefiningBlocks.insert(RealPHI->getBlock()); - ForwardIDFCalculator IDFs(*MSSA->DT); - SmallVector IDFBlocks; - IDFs.setDefiningBlocks(DefiningBlocks); - IDFs.calculate(IDFBlocks); - SmallVector, 4> NewInsertedPHIs; - for (auto *BBIDF : IDFBlocks) { - auto *MPhi = MSSA->getMemoryAccess(BBIDF); - if (!MPhi) { - MPhi = MSSA->createMemoryPhi(BBIDF); - NewInsertedPHIs.push_back(MPhi); - } - // Add the phis created into the IDF blocks to NonOptPhis, so they are - // not optimized out as trivial by the call to getPreviousDefFromEnd - // below. Once they are complete, all these Phis are added to the - // FixupList, and removed from NonOptPhis inside fixupDefs(). 
Existing - // Phis in IDF may need fixing as well, and potentially be trivial - // before this insertion, hence add all IDF Phis. See PR43044. - NonOptPhis.insert(MPhi); + + for (const auto &VH : InsertedPHIs) + if (const auto *RealPHI = cast_or_null(VH)) + DefiningBlocks.insert(RealPHI->getBlock()); + ForwardIDFCalculator IDFs(*MSSA->DT); + SmallVector IDFBlocks; + IDFs.setDefiningBlocks(DefiningBlocks); + IDFs.calculate(IDFBlocks); + SmallVector, 4> NewInsertedPHIs; + for (auto *BBIDF : IDFBlocks) { + auto *MPhi = MSSA->getMemoryAccess(BBIDF); + if (!MPhi) { + MPhi = MSSA->createMemoryPhi(BBIDF); + NewInsertedPHIs.push_back(MPhi); } - for (auto &MPhi : NewInsertedPHIs) { - auto *BBIDF = MPhi->getBlock(); - for (auto *Pred : predecessors(BBIDF)) { - DenseMap> CachedPreviousDef; - MPhi->addIncoming(getPreviousDefFromEnd(Pred, CachedPreviousDef), - Pred); - } + // Add the phis created into the IDF blocks to NonOptPhis, so they are not + // optimized out as trivial by the call to getPreviousDefFromEnd below. + // Once they are complete, all these Phis are added to the FixupList, and + // removed from NonOptPhis inside fixupDefs(). Existing Phis in IDF may + // need fixing as well, and potentially be trivial before this insertion, + // hence add all IDF Phis. See PR43044. + NonOptPhis.insert(MPhi); + } + for (auto &MPhi : NewInsertedPHIs) { + auto *BBIDF = MPhi->getBlock(); + for (auto *Pred : predecessors(BBIDF)) { + DenseMap> CachedPreviousDef; + MPhi->addIncoming(getPreviousDefFromEnd(Pred, CachedPreviousDef), Pred); } + } - // Re-take the index where we're adding the new phis, because the above - // call to getPreviousDefFromEnd, may have inserted into InsertedPHIs. - NewPhiIndex = InsertedPHIs.size(); - for (auto &MPhi : NewInsertedPHIs) { - InsertedPHIs.push_back(&*MPhi); - FixupList.push_back(&*MPhi); - } + // Re-take the index where we're adding the new phis, because the above call + // to getPreviousDefFromEnd, may have inserted into InsertedPHIs. + NewPhiIndex = InsertedPHIs.size(); + for (auto &MPhi : NewInsertedPHIs) { + InsertedPHIs.push_back(&*MPhi); + FixupList.push_back(&*MPhi); } + FixupList.push_back(MD); } diff --git a/lib/CodeGen/AsmPrinter/DebugLocStream.h b/lib/CodeGen/AsmPrinter/DebugLocStream.h index 789291771b5a..a062baf7698a 100644 --- a/lib/CodeGen/AsmPrinter/DebugLocStream.h +++ b/lib/CodeGen/AsmPrinter/DebugLocStream.h @@ -38,14 +38,10 @@ class DebugLocStream { : CU(CU), EntryOffset(EntryOffset) {} }; struct Entry { - const MCSymbol *BeginSym; - const MCSymbol *EndSym; + const MCSymbol *Begin; + const MCSymbol *End; size_t ByteOffset; size_t CommentOffset; - Entry(const MCSymbol *BeginSym, const MCSymbol *EndSym, size_t ByteOffset, - size_t CommentOffset) - : BeginSym(BeginSym), EndSym(EndSym), ByteOffset(ByteOffset), - CommentOffset(CommentOffset) {} }; private: @@ -93,7 +89,7 @@ class DebugLocStream { /// Until the next call, bytes added to the stream will be added to this /// entry. void startEntry(const MCSymbol *BeginSym, const MCSymbol *EndSym) { - Entries.emplace_back(BeginSym, EndSym, DWARFBytes.size(), Comments.size()); + Entries.push_back({BeginSym, EndSym, DWARFBytes.size(), Comments.size()}); } /// Finalize a .debug_loc entry, deleting if it's empty. 
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index dcd443aca0f6..e0978c074756 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -337,13 +337,13 @@ void DwarfCompileUnit::addRange(RangeSpan Range) { // emitted into and the subprogram was contained within. If these are the // same then extend our current range, otherwise add this as a new range. if (CURanges.empty() || !SameAsPrevCU || - (&CURanges.back().getEnd()->getSection() != - &Range.getEnd()->getSection())) { + (&CURanges.back().End->getSection() != + &Range.End->getSection())) { CURanges.push_back(Range); return; } - CURanges.back().setEnd(Range.getEnd()); + CURanges.back().End = Range.End; } void DwarfCompileUnit::initStmtList() { @@ -517,7 +517,7 @@ void DwarfCompileUnit::attachRangesOrLowHighPC( if (Ranges.size() == 1 || !DD->useRangesSection()) { const RangeSpan &Front = Ranges.front(); const RangeSpan &Back = Ranges.back(); - attachLowHighPC(Die, Front.getStart(), Back.getEnd()); + attachLowHighPC(Die, Front.Begin, Back.End); } else addScopeRangeList(Die, std::move(Ranges)); } @@ -527,8 +527,8 @@ void DwarfCompileUnit::attachRangesOrLowHighPC( SmallVector List; List.reserve(Ranges.size()); for (const InsnRange &R : Ranges) - List.push_back(RangeSpan(DD->getLabelBeforeInsn(R.first), - DD->getLabelAfterInsn(R.second))); + List.push_back( + {DD->getLabelBeforeInsn(R.first), DD->getLabelAfterInsn(R.second)}); attachRangesOrLowHighPC(Die, std::move(List)); } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 8237404bf8e9..aeca172cfdb4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1098,7 +1098,7 @@ void DwarfDebug::finalizeModuleInfo() { // 2.17.3). U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0); else - U.setBaseAddress(TheCU.getRanges().front().getStart()); + U.setBaseAddress(TheCU.getRanges().front().Begin); U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges()); } @@ -1807,7 +1807,7 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { collectEntityInfo(TheCU, SP, Processed); // Add the range of this function to the list of ranges for the CU. - TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd())); + TheCU.addRange({Asm->getFunctionBegin(), Asm->getFunctionEnd()}); // Under -gmlt, skip building the subprogram if there are no inlined // subroutines inside it. 
But with -fdebug-info-for-profiling, the subprogram @@ -2325,12 +2325,12 @@ void DwarfDebug::emitDebugLoc() { Asm->OutStreamer->AddComment("DW_LLE_offset_pair"); Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_offset_pair, 1); Asm->OutStreamer->AddComment(" starting offset"); - Asm->EmitLabelDifferenceAsULEB128(Entry.BeginSym, Base); + Asm->EmitLabelDifferenceAsULEB128(Entry.Begin, Base); Asm->OutStreamer->AddComment(" ending offset"); - Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Base); + Asm->EmitLabelDifferenceAsULEB128(Entry.End, Base); } else { - Asm->EmitLabelDifference(Entry.BeginSym, Base, Size); - Asm->EmitLabelDifference(Entry.EndSym, Base, Size); + Asm->EmitLabelDifference(Entry.Begin, Base, Size); + Asm->EmitLabelDifference(Entry.End, Base, Size); } emitDebugLocEntryLocation(Entry, CU); @@ -2346,12 +2346,12 @@ void DwarfDebug::emitDebugLoc() { Asm->OutStreamer->AddComment("DW_LLE_startx_length"); Asm->emitInt8(dwarf::DW_LLE_startx_length); Asm->OutStreamer->AddComment(" start idx"); - Asm->EmitULEB128(AddrPool.getIndex(Entry.BeginSym)); + Asm->EmitULEB128(AddrPool.getIndex(Entry.Begin)); Asm->OutStreamer->AddComment(" length"); - Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Entry.BeginSym); + Asm->EmitLabelDifferenceAsULEB128(Entry.End, Entry.Begin); } else { - Asm->OutStreamer->EmitSymbolValue(Entry.BeginSym, Size); - Asm->OutStreamer->EmitSymbolValue(Entry.EndSym, Size); + Asm->OutStreamer->EmitSymbolValue(Entry.Begin, Size); + Asm->OutStreamer->EmitSymbolValue(Entry.End, Size); } emitDebugLocEntryLocation(Entry, CU); @@ -2386,9 +2386,9 @@ void DwarfDebug::emitDebugLocDWO() { // Ideally/in v5, this could use SectionLabels to reuse existing addresses // in the address pool to minimize object size/relocations. Asm->emitInt8(dwarf::DW_LLE_startx_length); - unsigned idx = AddrPool.getIndex(Entry.BeginSym); + unsigned idx = AddrPool.getIndex(Entry.Begin); Asm->EmitULEB128(idx); - Asm->EmitLabelDifference(Entry.EndSym, Entry.BeginSym, 4); + Asm->EmitLabelDifference(Entry.End, Entry.Begin, 4); emitDebugLocEntryLocation(Entry, List.CU); } @@ -2570,7 +2570,7 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, auto Size = Asm->MAI->getCodePointerSize(); for (const RangeSpan &Range : List.getRanges()) - SectionRanges[&Range.getStart()->getSection()].push_back(&Range); + SectionRanges[&Range.Begin->getSection()].push_back(&Range); const DwarfCompileUnit &CU = List.getCU(); const MCSymbol *CUBase = CU.getBaseAddress(); @@ -2586,7 +2586,7 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, if (!Base && (P.second.size() > 1 || DwarfVersion < 5) && (CU.getCUNode()->getRangesBaseAddress() || DwarfVersion >= 5)) { BaseIsSet = true; - Base = DD.getSectionLabel(&P.second.front()->getStart()->getSection()); + Base = DD.getSectionLabel(&P.second.front()->Begin->getSection()); if (DwarfVersion >= 5) { Asm->OutStreamer->AddComment("DW_RLE_base_addressx"); Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_addressx, 1); @@ -2605,8 +2605,8 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, } for (const auto *RS : P.second) { - const MCSymbol *Begin = RS->getStart(); - const MCSymbol *End = RS->getEnd(); + const MCSymbol *Begin = RS->Begin; + const MCSymbol *End = RS->End; assert(Begin && "Range without a begin symbol?"); assert(End && "Range without an end symbol?"); if (Base) { diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h index 244678ce9dc1..25ed8da970a4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.h +++ 
b/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -32,15 +32,9 @@ class LexicalScope; class MCSection; // Data structure to hold a range for range lists. -class RangeSpan { -public: - RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {} - const MCSymbol *getStart() const { return Start; } - const MCSymbol *getEnd() const { return End; } - void setEnd(const MCSymbol *E) { End = E; } - -private: - const MCSymbol *Start, *End; +struct RangeSpan { + const MCSymbol *Begin; + const MCSymbol *End; }; class RangeSpanList { diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index cc3379f13b4d..27b298dcf6af 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -469,7 +469,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { Value *NewAddr = Builder.CreateBitCast(Addr, PT); StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); - NewSI->setAlignment(SI->getAlignment()); + NewSI->setAlignment(MaybeAlign(SI->getAlignment())); NewSI->setVolatile(SI->isVolatile()); NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n"); diff --git a/lib/CodeGen/ModuloSchedule.cpp b/lib/CodeGen/ModuloSchedule.cpp index 30aa81487c8b..d891d644664f 100644 --- a/lib/CodeGen/ModuloSchedule.cpp +++ b/lib/CodeGen/ModuloSchedule.cpp @@ -1772,12 +1772,12 @@ void PeelingModuloScheduleExpander::fixupBranches() { MachineBasicBlock *Fallthrough = *Prolog->succ_begin(); MachineBasicBlock *Epilog = *EI; SmallVector Cond; + TII->removeBranch(*Prolog); Optional StaticallyGreater = Info->createTripCountGreaterCondition(TC, *Prolog, Cond); if (!StaticallyGreater.hasValue()) { LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n"); // Dynamically branch based on Cond. - TII->removeBranch(*Prolog); TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc()); } else if (*StaticallyGreater == false) { LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n"); @@ -1788,7 +1788,6 @@ void PeelingModuloScheduleExpander::fixupBranches() { P.RemoveOperand(2); P.RemoveOperand(1); } - TII->removeBranch(*Prolog); TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc()); KernelDisposed = true; } else { diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index eadc388fc9d5..560b5729e3de 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -514,15 +514,15 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) { if (Cond.getValueType().isVector()) { if (SDValue Res = WidenVSELECTAndMask(N)) std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl); - // It seems to improve code to generate two narrow SETCCs as opposed to - // splitting a wide result vector. - else if (Cond.getOpcode() == ISD::SETCC) - SplitVecRes_SETCC(Cond.getNode(), CL, CH); // Check if there are already splitted versions of the vector available and // use those instead of splitting the mask operand again. else if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector) GetSplitVector(Cond, CL, CH); + // It seems to improve code to generate two narrow SETCCs as opposed to + // splitting a wide result vector. 
+ else if (Cond.getOpcode() == ISD::SETCC) + SplitVecRes_SETCC(Cond.getNode(), CL, CH); else std::tie(CL, CH) = DAG.SplitVector(Cond, dl); } diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index a599a0899876..c548c56211ae 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -2014,7 +2014,7 @@ void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) { else if (LoadInst *LI = dyn_cast(P)) LI->setAlignment(MaybeAlign(Bytes)); else if (StoreInst *SI = dyn_cast(P)) - SI->setAlignment(Bytes); + SI->setAlignment(MaybeAlign(Bytes)); else llvm_unreachable( "only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment"); diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index 0f000623bdb5..de1317ea9d3f 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -1397,7 +1397,7 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); - setAlignment(Align); + setAlignment(MaybeAlign(Align)); setAtomic(Order, SSID); AssertOK(); } @@ -1413,15 +1413,11 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); - setAlignment(Align); + setAlignment(MaybeAlign(Align)); setAtomic(Order, SSID); AssertOK(); } -void StoreInst::setAlignment(unsigned Align) { - setAlignment(llvm::MaybeAlign(Align)); -} - void StoreInst::setAlignment(MaybeAlign Align) { assert((!Align || *Align <= MaximumAlignment) && "Alignment is greater than MaximumAlignment!"); diff --git a/lib/ObjectYAML/ELFEmitter.cpp b/lib/ObjectYAML/ELFEmitter.cpp index c85cf4c924f0..f9c31f335f11 100644 --- a/lib/ObjectYAML/ELFEmitter.cpp +++ b/lib/ObjectYAML/ELFEmitter.cpp @@ -174,6 +174,10 @@ template class ELFState { void writeSectionContent(Elf_Shdr &SHeader, const ELFYAML::HashSection &Section, ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA); + ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH); public: @@ -423,6 +427,8 @@ void ELFState::initSectionHeaders(std::vector &SHeaders, writeSectionContent(SHeader, *S, CBA); } else if (auto S = dyn_cast(Sec)) { writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); } else { llvm_unreachable("Unknown section type"); } @@ -990,6 +996,30 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, Section.Content->writeAsBinary(OS); } +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + unsigned Link = 0; + if (Section.Link.empty() && SN2I.lookup(".symtab", Link)) + SHeader.sh_link = Link; + + if (Section.Content || Section.Size) { + SHeader.sh_size = writeContent(OS, Section.Content, Section.Size); + return; + } + + for (const ELFYAML::AddrsigSymbol &Sym : *Section.Symbols) { + uint64_t Val = + Sym.Name ? 
toSymbolIndex(*Sym.Name, Section.Name, /*IsDynamic=*/false) + : (uint32_t)*Sym.Index; + SHeader.sh_size += encodeULEB128(Val, OS); + } +} + template void ELFState::buildSectionIndex() { for (unsigned I = 0, E = Doc.Sections.size(); I != E; ++I) { StringRef Name = Doc.Sections[I]->Name; diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index 0dd6854cfee0..29585abe6e80 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -1071,6 +1071,13 @@ static void sectionMapping(IO &IO, ELFYAML::SymtabShndxSection &Section) { IO.mapRequired("Entries", Section.Entries); } +static void sectionMapping(IO &IO, ELFYAML::AddrsigSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Content", Section.Content); + IO.mapOptional("Size", Section.Size); + IO.mapOptional("Symbols", Section.Symbols); +} + void MappingTraits::mapping( IO &IO, ELFYAML::SectionOrType §ionOrType) { IO.mapRequired("SectionOrType", sectionOrType.sectionNameOrType); @@ -1161,6 +1168,11 @@ void MappingTraits>::mapping( Section.reset(new ELFYAML::SymtabShndxSection()); sectionMapping(IO, *cast(Section.get())); break; + case ELF::SHT_LLVM_ADDRSIG: + if (!IO.outputting()) + Section.reset(new ELFYAML::AddrsigSection()); + sectionMapping(IO, *cast(Section.get())); + break; default: if (!IO.outputting()) { StringRef Name; @@ -1233,6 +1245,31 @@ StringRef MappingTraits>::validate( return {}; } + if (const auto *Sec = dyn_cast(Section.get())) { + if (!Sec->Symbols && !Sec->Content && !Sec->Size) + return "one of \"Content\", \"Size\" or \"Symbols\" must be specified"; + + if (Sec->Content || Sec->Size) { + if (Sec->Size && Sec->Content && + (uint64_t)*Sec->Size < Sec->Content->binary_size()) + return "\"Size\" must be greater than or equal to the content " + "size"; + + if (Sec->Symbols) + return "\"Symbols\" cannot be used with \"Content\" or \"Size\""; + return {}; + } + + if (!Sec->Symbols) + return {}; + + for (const ELFYAML::AddrsigSymbol &AS : *Sec->Symbols) + if (AS.Index && AS.Name) + return "\"Index\" and \"Name\" cannot be used together when defining a " + "symbol"; + return {}; + } + return {}; } @@ -1340,6 +1377,12 @@ void MappingTraits::mapping(IO &IO, ELFYAML::Object &Object) { IO.setContext(nullptr); } +void MappingTraits::mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("Name", Sym.Name); + IO.mapOptional("Index", Sym.Index); +} + LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_AFL_REG) LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_ABI_FP) LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_EXT) diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 6689ee48200e..51bf35d4a161 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -406,6 +406,7 @@ include "AArch64Schedule.td" include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; diff --git a/lib/Target/AArch64/AArch64Combine.td b/lib/Target/AArch64/AArch64Combine.td new file mode 100644 index 000000000000..c4658f73b8dd --- /dev/null +++ b/lib/Target/AArch64/AArch64Combine.td @@ -0,0 +1,15 @@ +//=- AArch64.td - Define AArch64 Combine Rules ---------------*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +def AArch64PreLegalizerCombinerHelper: GICombinerHelper< + "AArch64GenPreLegalizerCombinerHelper">; diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index 8357b763179d..c42c16bc1aad 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -55,6 +55,10 @@ // | callee-saved fp/simd/SVE regs | // | | // |-----------------------------------| +// | | +// | SVE stack objects | +// | | +// |-----------------------------------| // |.empty.space.to.make.part.below....| // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at // |.the.standard.16-byte.alignment....| compile time; if present) @@ -202,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { return DefaultSafeSPDisplacement; } +/// Returns the size of the entire SVE stackframe (calleesaves + spills). +static StackOffset getSVEStackSize(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo(); + return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8}; +} + bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { if (!EnableRedZone) return false; @@ -214,7 +224,8 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const AArch64FunctionInfo *AFI = MF.getInfo(); unsigned NumBytes = AFI->getLocalStackSize(); - return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128); + return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 || + getSVEStackSize(MF)); } /// hasFP - Return true if the specified function should have a dedicated frame @@ -456,6 +467,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( if (canUseRedZone(MF)) return false; + // When there is an SVE area on the stack, always allocate the + // callee-saves and spills/locals separately. + if (getSVEStackSize(MF)) + return false; + return true; } @@ -870,6 +886,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // Ideally it should match SP value after prologue. AFI->setTaggedBasePointerOffset(MFI.getStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // getStackSize() includes all the locals in its size calculation. We don't // include these locals when computing the stack size of a funclet, as they // are allocated in the parent's stack frame and accessed via the frame @@ -880,6 +898,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, : (int)MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); + assert(!SVEStackSize && + "unexpected function without stack frame but with SVE objects"); // All of the stack allocation is for locals. 
AFI->setLocalStackSize(NumBytes); if (!NumBytes) @@ -926,6 +946,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, AFI->setLocalStackSize(NumBytes - PrologueSaveSize); bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); @@ -1083,6 +1104,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = 0; } + emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII, + MachineInstr::FrameSetup); + // Allocate space for the rest of the frame. if (NumBytes) { const bool NeedsRealignment = RegInfo->needsStackRealignment(MF); @@ -1431,8 +1455,11 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, .setMIFlag(MachineInstr::FrameDestroy); } + const StackOffset &SVEStackSize = getSVEStackSize(MF); + // If there is a single SP update, insert it before the ret and we're done. if (CombineSPBump) { + assert(!SVEStackSize && "Cannot combine SP bump with SVE"); emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP, {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII, MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI); @@ -1446,6 +1473,12 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); + // Deallocate the SVE area. + if (SVEStackSize) + if (!AFI->isStackRealigned()) + emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize, + TII, MachineInstr::FrameDestroy); + if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); // If this was a redzone leaf function, we don't need to restore the @@ -1595,6 +1628,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool isCSR = !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); + const StackOffset &SVEStackSize = getSVEStackSize(MF); + if (SVEStackSize) + llvm_unreachable("Accessing frame indices in presence of SVE " + "not yet supported"); + // Use frame pointer to reference fixed objects. Use it for locals if // there are VLAs or a dynamically realigned SP (and thus the SP isn't // reliable as a base). Make sure useFPForScavengingIndex() does the @@ -2175,8 +2213,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, << ' ' << printReg(Reg, RegInfo); dbgs() << "\n";); + bool HasSVEStackObjects = [&MFI]() { + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) + if (MFI.getStackID(I) == TargetStackID::SVEVector && + MFI.getObjectOffset(I) < 0) + return true; + // Note: We don't take allocatable stack objects into + // account yet, because allocation for those is not yet + // implemented. + return false; + }(); + // If any callee-saved registers are used, the frame cannot be eliminated. - bool CanEliminateFrame = SavedRegs.count() == 0; + bool CanEliminateFrame = (SavedRegs.count() == 0) && !HasSVEStackObjects; // The CSR spill slots have not been allocated yet, so estimateStackSize // won't include them. 
@@ -2239,12 +2288,34 @@ bool AArch64FrameLowering::enableStackSlotScavenging( void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { + MachineFrameInfo &MFI = MF.getFrameInfo(); + + assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && + "Upwards growing stack unsupported"); + + // Process all fixed stack SVE objects. + int64_t Offset = 0; + for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) { + unsigned StackID = MFI.getStackID(I); + if (StackID == TargetStackID::SVEVector) { + int64_t FixedOffset = -MFI.getObjectOffset(I); + if (FixedOffset > Offset) + Offset = FixedOffset; + } + } + + unsigned MaxAlign = getStackAlignment(); + uint64_t SVEStackSize = alignTo(Offset, MaxAlign); + + AArch64FunctionInfo *AFI = MF.getInfo(); + AFI->setStackSizeSVE(SVEStackSize); + assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 7ed20d24607f..99d868a95a70 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -87,6 +87,17 @@ class AArch64FrameLowering : public TargetFrameLowering { int FI) const override; int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; + bool isSupportedStackID(TargetStackID::Value ID) const override { + switch (ID) { + default: + return false; + case TargetStackID::Default: + case TargetStackID::SVEVector: + case TargetStackID::NoAlloc: + return true; + } + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 097a8ba0ae19..1cc3177b26a7 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3046,6 +3046,16 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MaxEncoding = 0xfff; ShiftSize = 12; break; + case AArch64::ADDVL_XXI: + case AArch64::ADDPL_XXI: + MaxEncoding = 31; + ShiftSize = 0; + if (Offset < 0) { + MaxEncoding = 32; + Sign = -1; + Offset = -Offset; + } + break; default: llvm_unreachable("Unsupported opcode"); } @@ -3117,8 +3127,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV, bool NeedsWinCFI, bool *HasWinCFI) { - int64_t Bytes; - Offset.getForFrameOffset(Bytes); + int64_t Bytes, NumPredicateVectors, NumDataVectors; + Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); // First emit non-scalable frame offsets, or a simple 'mov'. 
if (Bytes || (!Offset && SrcReg != DestReg)) { @@ -3133,6 +3143,23 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, NeedsWinCFI, HasWinCFI); SrcReg = DestReg; } + + assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && + "SetNZCV not supported with SVE vectors"); + assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && + "WinCFI not supported with SVE vectors"); + + if (NumDataVectors) { + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, + AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); + SrcReg = DestReg; + } + + if (NumPredicateVectors) { + assert(DestReg != AArch64::SP && "Unaligned access to SP"); + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, + AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); + } } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 0efeeb272ec1..a7d0a742573d 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -95,6 +95,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// returned struct in a register. This field holds the virtual register into /// which the sret argument is passed. unsigned SRetReturnReg = 0; + /// SVE stack size (for predicates and data vectors) are maintained here + /// rather than in FrameInfo, as the placement and Stack IDs are target + /// specific. + uint64_t StackSizeSVE = 0; + + /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a /// redzone, and no value otherwise. @@ -131,6 +138,15 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { ArgumentStackToRestore = bytes; } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + void setStackSizeSVE(uint64_t S) { + HasCalculatedStackSizeSVE = true; + StackSizeSVE = S; + } + + uint64_t getStackSizeSVE() const { return StackSizeSVE; } + bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 6df3f944f8c3..bea75b83517b 100644 --- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -27,12 +27,22 @@ using namespace llvm; using namespace MIPatternMatch; +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + namespace { +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + class AArch64PreLegalizerCombinerInfo : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; public: + AArch64GenPreLegalizerCombinerHelper Generated; + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, @@ -81,9 +91,16 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, } } + if (Generated.tryCombineAll(Observer, MI, B)) + return true; + return false; } +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include 
"AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + // Pass boilerplate // ================ diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td index d46e905d0fe7..1657a76a685c 100644 --- a/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -138,12 +138,12 @@ let Predicates = [HasSVE] in { defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">; defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">; - defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd">; - defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub">; - defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul">; - defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul">; - defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps">; - defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">; + defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; + defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>; + defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>; + defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>; + defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>; + defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>; defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">; diff --git a/lib/Target/AArch64/AArch64StackOffset.h b/lib/Target/AArch64/AArch64StackOffset.h index 5f5cdfa2fad1..13f12a6c9c30 100644 --- a/lib/Target/AArch64/AArch64StackOffset.h +++ b/lib/Target/AArch64/AArch64StackOffset.h @@ -35,32 +35,38 @@ namespace llvm { /// vector and a 64bit GPR. class StackOffset { int64_t Bytes; + int64_t ScalableBytes; explicit operator int() const; public: using Part = std::pair; - StackOffset() : Bytes(0) {} + StackOffset() : Bytes(0), ScalableBytes(0) {} StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() { - assert(!MVT(T).isScalableVector() && "Scalable types not supported"); + assert(MVT(T).getSizeInBits() % 8 == 0 && + "Offset type is not a multiple of bytes"); *this += Part(Offset, T); } - StackOffset(const StackOffset &Other) : Bytes(Other.Bytes) {} + StackOffset(const StackOffset &Other) + : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {} StackOffset &operator=(const StackOffset &) = default; StackOffset &operator+=(const StackOffset::Part &Other) { - assert(Other.second.getSizeInBits() % 8 == 0 && - "Offset type is not a multiple of bytes"); - Bytes += Other.first * (Other.second.getSizeInBits() / 8); + int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8); + if (Other.second.isScalableVector()) + ScalableBytes += OffsetInBytes; + else + Bytes += OffsetInBytes; return *this; } StackOffset &operator+=(const StackOffset &Other) { Bytes += Other.Bytes; + ScalableBytes += Other.ScalableBytes; return *this; } @@ -72,6 +78,7 @@ class StackOffset { StackOffset &operator-=(const StackOffset &Other) { Bytes -= Other.Bytes; + ScalableBytes -= Other.ScalableBytes; return *this; } @@ -88,16 +95,42 @@ class StackOffset { return Res; } + /// Returns the scalable part of the offset in bytes. + int64_t getScalableBytes() const { return ScalableBytes; } + /// Returns the non-scalable part of the offset in bytes. int64_t getBytes() const { return Bytes; } /// Returns the offset in parts to which this frame offset can be /// decomposed for the purpose of describing a frame offset. /// For non-scalable offsets this is simply its byte size. 
- void getForFrameOffset(int64_t &ByteSized) const { ByteSized = Bytes; } + void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors, + int64_t &NumDataVectors) const { + assert(isValid() && "Invalid frame offset"); + + NumBytes = Bytes; + NumDataVectors = 0; + NumPredicateVectors = ScalableBytes / 2; + // This method is used to get the offsets to adjust the frame offset. + // If the function requires ADDPL to be used and needs more than two ADDPL + // instructions, part of the offset is folded into NumDataVectors so that it + // uses ADDVL for part of it, reducing the number of ADDPL instructions. + if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || + NumPredicateVectors > 62) { + NumDataVectors = NumPredicateVectors / 8; + NumPredicateVectors -= NumDataVectors * 8; + } + } /// Returns whether the offset is known zero. - explicit operator bool() const { return Bytes; } + explicit operator bool() const { return Bytes || ScalableBytes; } + + bool isValid() const { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. + return ScalableBytes % 2 == 0; + } }; } // end namespace llvm diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 7af0de7f647e..4fb409f020d9 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -5276,7 +5276,7 @@ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) { auto parseOp = [&]() -> bool { SMLoc L = getLoc(); - const MCExpr *Expr; + const MCExpr *Expr = nullptr; if (check(getParser().parseExpression(Expr), L, "expected expression")) return true; const MCConstantExpr *Value = dyn_cast_or_null(Expr); diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 0da057ea9973..103925d45d51 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -8,6 +8,8 @@ tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel) +tablegen(LLVM AArch64GenGICombiner.inc -gen-global-isel-combiner + -combiners="AArch64PreLegalizerCombinerHelper") tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td index 1a9784065d5b..e2bd47ee6ae3 100644 --- a/lib/Target/AArch64/SVEInstrFormats.td +++ b/lib/Target/AArch64/SVEInstrFormats.td @@ -1219,10 +1219,12 @@ multiclass sve_fp_ftmad { //===----------------------------------------------------------------------===// class sve_fp_3op_u_zd sz, bits<3> opc, string asm, - ZPRRegOp zprty> + ZPRRegOp zprty, + ValueType vt, ValueType vt2, SDPatternOperator op> : I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm), asm, "\t$Zd, $Zn, $Zm", - "", []>, Sched<[]> { + "", + [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> { bits<5> Zd; bits<5> Zm; bits<5> Zn; @@ -1236,10 +1238,10 @@ class sve_fp_3op_u_zd sz, bits<3> opc, string asm, let Inst{4-0} = Zd; } -multiclass sve_fp_3op_u_zd opc, string asm> { - def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>; - def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>; - def 
_D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>; +multiclass sve_fp_3op_u_zd opc, string asm, SDPatternOperator op> { + def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>; + def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>; + def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b2491ebc6f48..c74a361b2c71 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -186,10 +186,11 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, SDValue &DLC) const; + SDValue &SLC, SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; @@ -202,7 +203,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, @@ -1313,7 +1314,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { // Subtarget prefers to use flat instruction if (Subtarget->useFlatForGlobal()) return false; @@ -1326,6 +1328,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1405,7 +1408,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, - SDValue &DLC) const { + SDValue &DLC, SDValue &SWZ) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. 
@@ -1413,7 +1416,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; ConstantSDNode *C = cast(Addr64); @@ -1435,9 +1438,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1562,13 +1565,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; if (!cast(Offen)->getSExtValue() && @@ -1590,16 +1594,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE, DLC; + SDValue GLC, SLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } template diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5480eb5595a5..c5e60ed77be6 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -762,16 +762,20 @@ static bool isZero(Register Reg, MachineRegisterInfo &MRI) { return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; } -static unsigned extractGLC(unsigned CachePolicy) { - return CachePolicy & 1; +static unsigned extractGLC(unsigned AuxiliaryData) { + return AuxiliaryData & 1; } -static unsigned extractSLC(unsigned CachePolicy) { - return (CachePolicy >> 1) & 1; +static unsigned extractSLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 1) & 1; } -static unsigned extractDLC(unsigned CachePolicy) { - return (CachePolicy >> 2) & 1; +static unsigned extractDLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 2) & 1; +} + +static unsigned extractSWZ(unsigned AuxiliaryData) { + return (AuxiliaryData >> 3) & 1; } // Returns Base register, constant offset, and offset def point. 
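For reference, the buffer-intrinsic comments earlier in the patch describe the auxiliary-data immediate as one flag per bit (bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+, bit 3 = swz), and the extract* helpers above decode it the same way. A small self-contained sketch of that packing, illustrative only and not part of the patch:

#include <cstdint>

// Pack the buffer-intrinsic auxiliary data: bit 0 = glc, bit 1 = slc,
// bit 2 = dlc (gfx10+), bit 3 = swz.
constexpr std::uint32_t packAuxData(bool GLC, bool SLC, bool DLC, bool SWZ) {
  return (GLC ? 1u : 0u) | (SLC ? 2u : 0u) | (DLC ? 4u : 0u) | (SWZ ? 8u : 0u);
}

constexpr bool extractGLC(std::uint32_t Aux) { return Aux & 1; }
constexpr bool extractSLC(std::uint32_t Aux) { return (Aux >> 1) & 1; }
constexpr bool extractDLC(std::uint32_t Aux) { return (Aux >> 2) & 1; }
constexpr bool extractSWZ(std::uint32_t Aux) { return (Aux >> 3) & 1; }

static_assert(packAuxData(true, false, false, true) == 0b1001, "glc|swz");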
@@ -970,7 +974,7 @@ bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, Register RSrc = MI.getOperand(2).getReg(); Register VOffset = MI.getOperand(3).getReg(); Register SOffset = MI.getOperand(4).getReg(); - unsigned CachePolicy = MI.getOperand(5).getImm(); + unsigned AuxiliaryData = MI.getOperand(5).getImm(); unsigned ImmOffset; unsigned TotalOffset; @@ -994,10 +998,11 @@ bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, MIB.addUse(RSrc) .addUse(SOffset) .addImm(ImmOffset) - .addImm(extractGLC(CachePolicy)) - .addImm(extractSLC(CachePolicy)) + .addImm(extractGLC(AuxiliaryData)) + .addImm(extractSLC(AuxiliaryData)) .addImm(0) // tfe: FIXME: Remove from inst - .addImm(extractDLC(CachePolicy)) + .addImm(extractDLC(AuxiliaryData)) + .addImm(extractSWZ(AuxiliaryData)) .addMemOperand(MMO); MI.eraseFromParent(); diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b8b54a2ef1a5..cf4275c23e8c 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -53,7 +53,8 @@ static LegalityPredicate isSmallOddVector(unsigned TypeIdx) { const LLT Ty = Query.Types[TypeIdx]; return Ty.isVector() && Ty.getNumElements() % 2 != 0 && - Ty.getElementType().getSizeInBits() < 32; + Ty.getElementType().getSizeInBits() < 32 && + Ty.getSizeInBits() % 32 != 0; }; } @@ -268,7 +269,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16}) .clampScalar(0, S32, S64) .moreElementsIf(isSmallOddVector(0), oneMoreElement(0)) - .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0)) + .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0)) .widenScalarToNextPow2(0) .scalarize(0); @@ -279,11 +280,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); // TODO: Implement. getActionDefinitionsBuilder(G_BITCAST) - .legalForCartesianProduct({S32, V2S16}) - .legalForCartesianProduct({S64, V2S32, V4S16}) - .legalForCartesianProduct({V2S64, V4S32}) // Don't worry about the size constraint. - .legalIf(all(isPointer(0), isPointer(1))) + .legalIf(all(isRegisterType(0), isRegisterType(1))) // FIXME: Testing hack .legalForCartesianProduct({S16, LLT::vector(2, 8), }); diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index badcd77aaef1..35040af98121 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -17,7 +17,6 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -659,45 +658,28 @@ static LLT getHalfSizedType(LLT Ty) { /// unique values used. bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineIRBuilder &B, - MachineInstr &MI, - MachineRegisterInfo &MRI, - ArrayRef OpIndices) const { - MachineFunction *MF = MI.getParent()->getParent(); - const GCNSubtarget &ST = MF->getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - MachineBasicBlock::iterator I(MI); - - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - // Use a set to avoid extra readfirstlanes in the case where multiple operands - // are the same register. 
- SmallSet SGPROperandRegs; - for (unsigned Op : OpIndices) { - assert(MI.getOperand(Op).isUse()); - Register Reg = MI.getOperand(Op).getReg(); - const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); - if (OpBank->getID() == AMDGPU::VGPRRegBankID) - SGPROperandRegs.insert(Reg); - } - - // No operands need to be replaced, so no need to loop. - if (SGPROperandRegs.empty()) - return false; - + iterator_range Range, + SmallSet &SGPROperandRegs, + MachineRegisterInfo &MRI) const { SmallVector ResultRegs; SmallVector InitResultRegs; SmallVector PhiRegs; - for (MachineOperand &Def : MI.defs()) { - LLT ResTy = MRI.getType(Def.getReg()); - const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); - ResultRegs.push_back(Def.getReg()); - Register InitReg = B.buildUndef(ResTy).getReg(0); - Register PhiReg = MRI.createGenericVirtualRegister(ResTy); - InitResultRegs.push_back(InitReg); - PhiRegs.push_back(PhiReg); - MRI.setRegBank(PhiReg, *DefBank); - MRI.setRegBank(InitReg, *DefBank); + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction *MF = &B.getMF(); + + for (MachineInstr &MI : Range) { + for (MachineOperand &Def : MI.defs()) { + LLT ResTy = MRI.getType(Def.getReg()); + const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); + ResultRegs.push_back(Def.getReg()); + Register InitReg = B.buildUndef(ResTy).getReg(0); + Register PhiReg = MRI.createGenericVirtualRegister(ResTy); + InitResultRegs.push_back(InitReg); + PhiRegs.push_back(PhiReg); + MRI.setRegBank(PhiReg, *DefBank); + MRI.setRegBank(InitReg, *DefBank); + } } Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); @@ -726,7 +708,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // Move the rest of the block into a new block. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); MBB.addSuccessor(LoopBB); RestoreExecBB->addSuccessor(RemainderBB); @@ -749,139 +731,56 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); } - // Move the instruction into the loop. - LoopBB->splice(LoopBB->end(), &MBB, I); - I = std::prev(LoopBB->end()); + const DebugLoc &DL = B.getDL(); - B.setInstr(*I); + // Figure out the iterator range after splicing the instructions. + auto NewBegin = std::prev(LoopBB->end()); - Register CondReg; + // Move the instruction into the loop. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); - for (MachineOperand &Op : MI.uses()) { - if (!Op.isReg()) - continue; - - assert(!Op.isDef()); - if (SGPROperandRegs.count(Op.getReg())) { - LLT OpTy = MRI.getType(Op.getReg()); - unsigned OpSize = OpTy.getSizeInBits(); - - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); - - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg) - .addReg(Op.getReg()); - - Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(Op.getReg()); - Op.setReg(CurrentLaneOpReg); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } else { - LLT S32 = LLT::scalar(32); - SmallVector ReadlanePieces; - - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. - - bool Is64 = OpSize % 64 == 0; - - LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); - unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 - : AMDGPU::V_CMP_EQ_U32_e64; - - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. - - // Insert the unmerge before the loop. - - B.setMBB(MBB); - auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); - B.setInstr(*I); + auto NewEnd = LoopBB->end(); - unsigned NumPieces = Unmerge->getNumOperands() - 1; - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { - Register UnmergePiece = Unmerge.getReg(PieceIdx); + MachineBasicBlock::iterator I = Range.begin(); + B.setInsertPt(*LoopBB, I); - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); - - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); + Register CondReg; - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.uses()) { + if (!Op.isReg() || Op.isDef()) + continue; - CurrentLaneOpReg = - B.buildMerge(LLT::scalar(64), - {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); + if (SGPROperandRegs.count(Op.getReg())) { + LLT OpTy = MRI.getType(Op.getReg()); + unsigned OpSize = OpTy.getSizeInBits(); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + // Can only do a readlane of 32-bit pieces. + if (OpSize == 32) { + // Avoid extra copies in the simple case of one 32-bit register. + Register CurrentLaneOpReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + MRI.setType(CurrentLaneOpReg, OpTy); - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); - } else { - // 32-bit element type. 
- ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); - } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(Op.getReg()); Register NewCondReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); bool First = CondReg == AMDGPU::NoRegister; if (First) CondReg = NewCondReg; - B.buildInstr(CmpOp) + // Compare the just read M0 value to all possible Idx values. + B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) .addDef(NewCondReg) .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); + .addReg(Op.getReg()); + Op.setReg(CurrentLaneOpReg); if (!First) { Register AndReg @@ -894,19 +793,115 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addReg(CondReg); CondReg = AndReg; } - } - - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); } else { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - } + LLT S32 = LLT::scalar(32); + SmallVector ReadlanePieces; + + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. + + bool Is64 = OpSize % 64 == 0; + + LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); + unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 + : AMDGPU::V_CMP_EQ_U32_e64; + + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. + + // Insert the unmerge before the loop. + + B.setMBB(MBB); + auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); + B.setInstr(*I); + + unsigned NumPieces = Unmerge->getNumOperands() - 1; + for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { + Register UnmergePiece = Unmerge.getReg(PieceIdx); + + Register CurrentLaneOpReg; + if (Is64) { + Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); + Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + + MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); + MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); + MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegLo) + .addReg(UnmergePiece, 0, AMDGPU::sub0); + + // Read the next variant <- also loop target. 
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegHi) + .addReg(UnmergePiece, 0, AMDGPU::sub1); + + CurrentLaneOpReg = + B.buildMerge(LLT::scalar(64), + {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) + .getReg(0); + + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + + if (OpTy.getScalarSizeInBits() == 64) { + // If we need to produce a 64-bit element vector, so use the + // merged pieces + ReadlanePieces.push_back(CurrentLaneOpReg); + } else { + // 32-bit element type. + ReadlanePieces.push_back(CurrentLaneOpRegLo); + ReadlanePieces.push_back(CurrentLaneOpRegHi); + } + } else { + CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(UnmergePiece); + ReadlanePieces.push_back(CurrentLaneOpReg); + } + + Register NewCondReg + = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; + + B.buildInstr(CmpOp) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(UnmergePiece); + + if (!First) { + Register AndReg + = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + // If there are multiple operands to consider, and the conditions. + B.buildInstr(AMDGPU::S_AND_B64) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } - MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); + // FIXME: Build merge seems to switch to CONCAT_VECTORS but not + // BUILD_VECTOR + if (OpTy.isVector()) { + auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } else { + auto Merge = B.buildMerge(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } + + MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); + } } } } @@ -949,6 +944,40 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( return true; } +// Return any unique registers used by \p MI at \p OpIndices that need to be +// handled in a waterfall loop. Returns these registers in \p +// SGPROperandRegs. Returns true if there are any operands to handle and a +// waterfall loop is necessary. +bool AMDGPURegisterBankInfo::collectWaterfallOperands( + SmallSet &SGPROperandRegs, MachineInstr &MI, + MachineRegisterInfo &MRI, ArrayRef OpIndices) const { + for (unsigned Op : OpIndices) { + assert(MI.getOperand(Op).isUse()); + Register Reg = MI.getOperand(Op).getReg(); + const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); + if (OpBank->getID() == AMDGPU::VGPRRegBankID) + SGPROperandRegs.insert(Reg); + } + + // No operands need to be replaced, so no need to loop. + return !SGPROperandRegs.empty(); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef OpIndices) const { + // Use a set to avoid extra readfirstlanes in the case where multiple operands + // are the same register.
+ SmallSet SGPROperandRegs; + + if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) + return false; + + MachineBasicBlock::iterator I = MI.getIterator(); + return executeInWaterfallLoop(B, make_range(I, std::next(I)), + SGPROperandRegs, MRI); +} + bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef OpIndices) const { @@ -1604,10 +1633,140 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } - case AMDGPU::G_EXTRACT_VECTOR_ELT: - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { 2 }); + case AMDGPU::G_EXTRACT_VECTOR_ELT: { + SmallVector DstRegs(OpdMapper.getVRegs(0)); + + assert(empty(OpdMapper.getVRegs(1)) && empty(OpdMapper.getVRegs(2))); + + if (DstRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register IdxReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + (void)DstTy; + + assert(DstTy.getSizeInBits() == 64); + + LLT SrcTy = MRI.getType(SrcReg); + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); + B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); + + const ValueMapping &DstMapping + = OpdMapper.getInstrMapping().getOperandMapping(0); + + // FIXME: Should be getting from mapping or not? + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + SmallSet OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { + MI.eraseFromParent(); + return; + } + + // Remove the original instruction to avoid potentially confusing the + // waterfall loop logic. 
+ B.setInstr(*Span.begin()); + MI.eraseFromParent(); + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); + return; + } + case AMDGPU::G_INSERT_VECTOR_ELT: { + SmallVector InsRegs(OpdMapper.getVRegs(2)); + + assert(empty(OpdMapper.getVRegs(0))); + assert(empty(OpdMapper.getVRegs(1))); + assert(empty(OpdMapper.getVRegs(3))); + + if (InsRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 3 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register InsReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT InsTy = MRI.getType(InsReg); + (void)InsTy; + + assert(InsTy.getSizeInBits() == 64); + + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + + auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); + auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); + B.buildBitcast(DstReg, InsHi); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI); + + MRI.setRegBank(InsReg, *InsSrcBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(InsLo.getReg(0), *DstBank); + MRI.setRegBank(InsHi.getReg(0), *DstBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + + SmallSet OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { + MI.eraseFromParent(); + return; + } + + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); return; + } case AMDGPU::G_INTRINSIC: { switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_s_buffer_load: { @@ -2126,8 +2285,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { return getDefaultMappingVOP(MI); case AMDGPU::G_UMULH: case AMDGPU::G_SMULH: { - if (MF.getSubtarget().hasScalarMulHiInsts() && - isSALUMapping(MI)) + if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI)) return getDefaultMappingSOP(MI); return getDefaultMappingVOP(MI); } @@ -2301,7 +2459,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { Op3Bank == AMDGPU::SGPRRegBankID && (Size == 32 || (Size == 64 && (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) && - MF.getSubtarget().hasScalarCompareEq64())); + Subtarget.hasScalarCompareEq64())); unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID; @@ -2312,14 +2470,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_EXTRACT_VECTOR_ELT: { - unsigned OutputBankID = isSALUMapping(MI) ? 
- AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + // VGPR index can be used for waterfall when indexing a SGPR vector. + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); // The index can be either if the source vector is VGPR. OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); @@ -2332,15 +2492,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); - unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), + MRI, *TRI); + unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); + OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, + InsertSize); // The index can be either if the source vector is VGPR. 
- OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); break; } case AMDGPU::G_UNMERGE_VALUES: { @@ -2760,4 +2923,3 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getOperandsMapping(OpdsMapping), MI.getNumOperands()); } - diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 584b23c0c220..a14b74961118 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -13,6 +13,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -42,6 +44,18 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const SIRegisterInfo *TRI; const SIInstrInfo *TII; + bool collectWaterfallOperands( + SmallSet &SGPROperandRegs, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef OpIndices) const; + + bool executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range Range, + SmallSet &SGPROperandRegs, + MachineRegisterInfo &MRI) const; + bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b946c308cf12..94d1d350dfd2 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -143,6 +143,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyDLC, ImmTyGLC, ImmTySLC, + ImmTySWZ, ImmTyTFE, ImmTyD16, ImmTyClampSI, @@ -328,6 +329,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isDLC() const { return isImmTy(ImmTyDLC); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } + bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); } @@ -820,6 +822,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyDLC: OS << "DLC"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; + case ImmTySWZ: OS << "SWZ"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -6037,6 +6040,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 40887a3c56eb..c9e8abad7c3c 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; def MUBUFAddr64Atomic : ComplexPattern; def MUBUFScratchOffen : ComplexPattern; def MUBUFScratchOffset : ComplexPattern; -def MUBUFOffset : ComplexPattern; 
+def MUBUFOffset : ComplexPattern; def MUBUFOffsetNoGLC : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; @@ -54,6 +54,17 @@ class MTBUFAddr64Table { // MTBUF classes //===----------------------------------------------------------------------===// +class MTBUFGetBaseOpcode { + string ret = !subst("FORMAT_XY", "FORMAT_X", + !subst("FORMAT_XYZ", "FORMAT_X", + !subst("FORMAT_XYZW", "FORMAT_X", Op))); +} + +class getMTBUFElements { + int ret = 1; +} + + class MTBUF_Pseudo pattern=[]> : InstSI, @@ -67,6 +78,9 @@ class MTBUF_Pseudo (NAME); + Instruction BaseOpcode = !cast(MTBUFGetBaseOpcode.ret); + let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; @@ -90,6 +104,7 @@ class MTBUF_Pseudo has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; + bits<4> elements = 0; } class MTBUF_Real : @@ -126,17 +141,17 @@ class getMTBUFInsDA vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc), + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc) + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -181,51 +196,54 @@ class MTBUF_SetupAddr { class MTBUF_Load_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo.ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 1; let mayStore = 0; + let elements = elems; } multiclass MTBUF_Pseudo_Loads { - def _OFFSET : MTBUF_Load_Pseudo , + i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Load_Pseudo , + i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Load_Pseudo ; - def _IDXEN : MTBUF_Load_Pseudo ; - def _BOTHEN : MTBUF_Load_Pseudo ; + def _OFFEN : MTBUF_Load_Pseudo ; + def _IDXEN : MTBUF_Load_Pseudo ; + def _BOTHEN : MTBUF_Load_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Load_Pseudo ; - def _OFFEN_exact : MTBUF_Load_Pseudo ; - def _IDXEN_exact : MTBUF_Load_Pseudo ; - def _BOTHEN_exact : MTBUF_Load_Pseudo ; + def _OFFSET_exact : MTBUF_Load_Pseudo ; + def _OFFEN_exact : MTBUF_Load_Pseudo ; + def _IDXEN_exact : MTBUF_Load_Pseudo ; + def _BOTHEN_exact : MTBUF_Load_Pseudo ; } } class MTBUF_Store_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, @@ -233,39 +251,40 @@ class MTBUF_Store_Pseudo .ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 0; let mayStore = 1; + let elements = elems; } multiclass MTBUF_Pseudo_Stores { - 
def _OFFSET : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Store_Pseudo ; - def _IDXEN : MTBUF_Store_Pseudo ; - def _BOTHEN : MTBUF_Store_Pseudo ; + def _OFFEN : MTBUF_Store_Pseudo ; + def _IDXEN : MTBUF_Store_Pseudo ; + def _BOTHEN : MTBUF_Store_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Store_Pseudo ; - def _OFFEN_exact : MTBUF_Store_Pseudo ; - def _IDXEN_exact : MTBUF_Store_Pseudo ; - def _BOTHEN_exact : MTBUF_Store_Pseudo ; + def _OFFSET_exact : MTBUF_Store_Pseudo ; + def _OFFEN_exact : MTBUF_Store_Pseudo ; + def _IDXEN_exact : MTBUF_Store_Pseudo ; + def _BOTHEN_exact : MTBUF_Store_Pseudo ; } } @@ -393,7 +412,7 @@ class getMUBUFInsDA vdataList, ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc)) + !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz)) ); } @@ -465,7 +484,7 @@ class MUBUF_Load_Pseudo .ret, !if(HasTiedDest, (ins getVregSrcForVT.ret:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe") # "$dlc", + !if(isLds, " lds", "$tfe") # "$dlc" # "$swz", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -483,15 +502,15 @@ class MUBUF_Load_Pseudo : Pat < - (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; class MUBUF_Addr64_Load_Pat : Pat < - (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; multiclass MUBUF_Pseudo_Load_Pats { @@ -542,7 +561,7 @@ class MUBUF_Store_Pseudo .ret]>.ret, - " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; @@ -558,12 +577,12 @@ multiclass MUBUF_Pseudo_Stores, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo , + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo ; @@ -581,8 +600,8 @@ multiclass MUBUF_Pseudo_Stores : MUBUF_Pseudo { + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz), + " $srsrc, $soffset$offset lds$glc$slc$swz"> { let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; @@ -1065,35 +1084,35 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < // MTBUF Instructions //===----------------------------------------------------------------------===// -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads 
<"tbuffer_load_format_xy", VReg_64>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; -defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; - defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>; - defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>; + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>; } // End HasUnpackedD16VMem. 
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>; } // End HasPackedD16VMem. let SubtargetPredicate = isGFX7Plus in { @@ -1128,6 +1147,10 @@ def extract_dlc : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); }]>; +def extract_swz : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1136,32 +1159,36 @@ multiclass MUBUF_LoadIntrinsicPat { def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0)), + timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0)), + timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm)), + timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm)), + 
timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1211,35 +1238,39 @@ multiclass MUBUF_StoreIntrinsicPat { def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1441,8 +1472,8 @@ def : GCNPat< class MUBUFLoad_PatternADDR64 : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) >; multiclass MUBUFLoad_Atomic_Pattern ; def : GCNPat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } @@ -1476,8 +1507,8 @@ multiclass MUBUFLoad_Pattern ; } @@ -1500,12 +1531,12 @@ multiclass MUBUFScratchLoadPat ; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1515,12 +1546,12 @@ multiclass MUBUFScratchLoadPat_D16 { def : GCNPat < (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 
0, 0, 0, $in) >; def : GCNPat < (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) >; } @@ -1560,16 +1591,16 @@ defm : MUBUFScratchLoadPat_D16 { - // Store follows atomic op convention so address is forst + // Store follows atomic op convention so address is first def : GCNPat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) >; def : GCNPat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } let SubtargetPredicate = isGFX6GFX7 in { @@ -1583,8 +1614,8 @@ multiclass MUBUFStore_Pattern ; } @@ -1598,13 +1629,13 @@ multiclass MUBUFScratchStorePat ; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1643,36 +1674,40 @@ multiclass MTBUF_LoadIntrinsicPat { def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0)), + timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm)), + timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0)), + timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm)), + timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1701,36 +1736,40 @@ multiclass MTBUF_StoreIntrinsicPat { def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0), + timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm 
$format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm), + timm:$format, timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0), + timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, - timm:$offset, timm:$format, timm:$cachepolicy, timm), + timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -2397,3 +2436,22 @@ def getMUBUFInfoFromBaseOpcodeAndElements : SearchIndex { let Table = MUBUFInfoTable; let Key = ["BaseOpcode", "elements"]; } + +def MTBUFInfoTable : GenericTable { + let FilterClass = "MTBUF_Pseudo"; + let CppTypeName = "MTBUFInfo"; + let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getMTBUFOpcodeHelper"; +} + +def getMTBUFInfoFromOpcode : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["Opcode"]; +} + +def getMTBUFInfoFromBaseOpcodeAndElements : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["BaseOpcode", "elements"]; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index a45162543975..d2ea94548dfe 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -196,6 +196,10 @@ void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "slc"); } +void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { +} + void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "tfe"); diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 0f62f039763e..66b70831ff9e 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -72,6 +72,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSWZ(const MCInst *MI, unsigned OpNo, 
const MCSubtargetInfo &STI, + raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 45c06ebb547a..ed07ed100a19 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -112,6 +112,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -132,6 +133,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -157,6 +159,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -177,6 +180,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -669,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: return true; + case TargetStackID::SVEVector: + return false; } llvm_unreachable("Invalid TargetStackID::Value"); } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 88dec95177c7..1883b28f657a 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6271,7 +6271,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(4), // soffset Offsets.second, // offset - Op.getOperand(5), // cachepolicy + Op.getOperand(5), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; @@ -6289,7 +6289,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy + Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; @@ -6338,7 +6338,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // soffset Offsets.second, // offset Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy + Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; @@ -6362,7 +6362,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; @@ -6832,7 +6832,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy + Op.getOperand(8), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : @@ -6857,7 +6857,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : @@ -6931,7 +6931,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy + Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = @@ -6975,7 +6975,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(6), // soffset Offsets.second, // offset - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index b6a90241d4de..7a6bb0e20b79 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4693,6 +4693,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, MIB.addImm(TFE->getImm()); } + MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); + MIB.cloneMemRefs(MI); Addr64 = MIB; } else { @@ -5663,7 +5665,16 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( if (RI.hasAGPRs(NewDstRC)) return nullptr; - NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + switch (Inst.getOpcode()) { + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + break; + default: + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + } + if (!NewDstRC) return nullptr; } else { diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index e1b32c4964c4..7473a0c64b2f 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -84,7 +84,7 @@ def SDTtbuffer_load : SDTypeProfile<1, 8, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -102,7 +102,7 @@ def SDTtbuffer_store : SDTypeProfile<0, 9, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -119,7 +119,7 @@ def SDTBufferLoad : SDTypeProfile<1, 7, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, @@ -145,7 +145,7 @@ def SDTBufferStore : SDTypeProfile<0, 8, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, @@ -1035,6 +1035,7 @@ def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; 
def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 302b299765ee..f9bce4cc9c1c 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -161,6 +161,31 @@ class SILoadStoreOptimizer : public MachineFunctionPass { return true; } + bool hasMergeableAddress(const MachineRegisterInfo &MRI) { + for (unsigned i = 0; i < NumAddresses; ++i) { + const MachineOperand *AddrOp = AddrReg[i]; + // Immediates are always OK. + if (AddrOp->isImm()) + continue; + + // Don't try to merge addresses that aren't either immediates or registers. + // TODO: Should be possible to merge FrameIndexes and maybe some other + // non-register + if (!AddrOp->isReg()) + return false; + + // TODO: We should be able to merge physical reg addreses. + if (Register::isPhysicalRegister(AddrOp->getReg())) + return false; + + // If an address has only one use then there will be on other + // instructions with the same address, so we can't merge this one. + if (MRI.hasOneNonDBGUse(AddrOp->getReg())) + return false; + } + return true; + } + void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII, const GCNSubtarget &STM); void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII); @@ -220,6 +245,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass { bool promoteConstantOffsetToImm(MachineInstr &CI, MemInfoMap &Visited, SmallPtrSet &Promoted) const; + void addInstToMergeableList(const CombineInfo &CI, + std::list > &MergeableInsts) const; + bool collectMergeableInsts(MachineBasicBlock &MBB, + std::list > &MergeableInsts) const; public: static char ID; @@ -228,7 +257,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass { initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry()); } - bool optimizeBlock(MachineBasicBlock &MBB); + void removeCombinedInst(std::list &MergeList, + const MachineInstr &MI); + bool optimizeInstsWithSameBaseAddr(std::list &MergeList, + bool &OptimizeListAgain); + bool optimizeBlock(std::list > &MergeableInsts); bool runOnMachineFunction(MachineFunction &MF) override; @@ -424,6 +457,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI, AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]); AddrReg[i] = &I->getOperand(AddrIdx[i]); } + + InstsToMove.clear(); } void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI, @@ -640,14 +675,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { return false; } - for (unsigned i = 0; i < CI.NumAddresses; i++) { - // We only ever merge operations with the same base address register, so - // don't bother scanning forward if there are no other uses. - if (CI.AddrReg[i]->isReg() && - (Register::isPhysicalRegister(CI.AddrReg[i]->getReg()) || - MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg()))) - return false; - } + // Do not merge VMEM buffer instructions with "swizzled" bit set. 
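The per-operand early-out that used to sit here in findMatchingInst() is now performed up front by the new CombineInfo::hasMergeableAddress() helper added earlier in this patch, and the guard that follows simply refuses to merge VMEM instructions whose swz bit is set. A condensed, illustrative sketch of the address filter; the helper below is a simplification written for this note, not the literal code in the patch:

    // Simplified model of the mergeability filter on a single address operand.
    static bool isMergeableAddrOperand(const MachineOperand &AddrOp,
                                       const MachineRegisterInfo &MRI) {
      if (AddrOp.isImm())
        return true;                      // immediates always merge
      if (!AddrOp.isReg())
        return false;                     // e.g. frame indexes: not handled yet
      if (Register::isPhysicalRegister(AddrOp.getReg()))
        return false;                     // physical registers: not handled yet
      // A base register with a single non-debug use cannot appear in any other
      // candidate instruction, so there is nothing to pair it with.
      return !MRI.hasOneNonDBGUse(AddrOp.getReg());
    }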
+ int Swizzled = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); + if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) + return false; ++MBBI; @@ -821,12 +853,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Next; + return Read2; } unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { @@ -905,12 +936,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Next; + return Write2; } MachineBasicBlock::iterator @@ -932,12 +962,13 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.DLC0) // dlc - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -957,10 +988,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineBasicBlock::iterator @@ -991,14 +1021,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1018,10 +1050,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { @@ -1184,21 +1215,22 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { const MachineMemOperand *MMOa = 
*CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(std::min(CI.Offset0, CI.Offset1)) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(std::min(CI.Offset0, CI.Offset1)) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineOperand @@ -1511,32 +1543,105 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm( return false; } -// Scan through looking for adjacent LDS operations with constant offsets from -// the same base register. We rely on the scheduler to do the hard work of -// clustering nearby loads, and assume these are all adjacent. -bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { - bool Modified = false; +void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI, + std::list > &MergeableInsts) const { + for (std::list &AddrList : MergeableInsts) { + if (AddrList.front().hasSameBaseAddress(*CI.I) && + AddrList.front().InstClass == CI.InstClass) { + AddrList.emplace_back(CI); + return; + } + } + // Base address not found, so add a new list. + MergeableInsts.emplace_back(1, CI); +} + +bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB, + std::list > &MergeableInsts) const { + bool Modified = false; // Contain the list MemInfoMap Visited; // Contains the list of instructions for which constant offsets are being // promoted to the IMM. SmallPtrSet AnchorList; - for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) { - MachineInstr &MI = *I; - + // Sort potential mergeable instructions into lists. One list per base address. + for (MachineInstr &MI : MBB.instrs()) { + // We run this before checking if an address is mergeable, because it can produce + // better code even if the instructions aren't mergeable. if (promoteConstantOffsetToImm(MI, Visited, AnchorList)) Modified = true; + const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII); + if (InstClass == UNKNOWN) + continue; + // Don't combine if volatile. - if (MI.hasOrderedMemoryRef()) { - ++I; + if (MI.hasOrderedMemoryRef()) continue; - } CombineInfo CI; - CI.setMI(I, *TII, *STM); + CI.setMI(MI, *TII, *STM); + + if (!CI.hasMergeableAddress(*MRI)) + continue; + + addInstToMergeableList(CI, MergeableInsts); + } + return Modified; +} + +// Scan through looking for adjacent LDS operations with constant offsets from +// the same base register. We rely on the scheduler to do the hard work of +// clustering nearby loads, and assume these are all adjacent. 
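Before the rewritten optimizeBlock() below, a rough summary of the pass's new shape may help: candidate instructions are first bucketed into one list per base address, and each list is then optimized repeatedly until no further pairs can be formed. The following is a condensed model of that flow, not the literal code (OptimizeAgain and Modified are members of the real pass, and details are omitted):

    // Condensed model of the reworked driver loop.
    for (MachineBasicBlock &MBB : MF) {
      std::list<std::list<CombineInfo>> MergeableInsts;
      // One pass over the block: promote constant offsets and bucket
      // mergeable loads/stores by base address and instruction class.
      Modified |= collectMergeableInsts(MBB, MergeableInsts);
      do {
        OptimizeAgain = false;
        // Merge pairs within each list; a merged pair is replaced by the new,
        // wider instruction so it can be considered for merging again.
        Modified |= optimizeBlock(MergeableInsts);
      } while (OptimizeAgain);
    }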
+bool SILoadStoreOptimizer::optimizeBlock( + std::list > &MergeableInsts) { + bool Modified = false; + + for (std::list &MergeList : MergeableInsts) { + if (MergeList.size() < 2) + continue; + + bool OptimizeListAgain = false; + if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) { + // We weren't able to make any changes, so clear the list so we don't + // process the same instructions the next time we try to optimize this + // block. + MergeList.clear(); + continue; + } + + // We made changes, but also determined that there were no more optimization + // opportunities, so we don't need to reprocess the list + if (!OptimizeListAgain) + MergeList.clear(); + + OptimizeAgain |= OptimizeListAgain; + Modified = true; + } + return Modified; +} + +void +SILoadStoreOptimizer::removeCombinedInst(std::list &MergeList, + const MachineInstr &MI) { + + for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) { + if (&*CI->I == &MI) { + MergeList.erase(CI); + return; + } + } +} + +bool +SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr( + std::list &MergeList, + bool &OptimizeListAgain) { + bool Modified = false; + for (auto I = MergeList.begin(); I != MergeList.end(); ++I) { + CombineInfo &CI = *I; switch (CI.InstClass) { default: @@ -1544,55 +1649,57 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) { case DS_READ: if (findMatchingInst(CI)) { Modified = true; - I = mergeRead2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case DS_WRITE: if (findMatchingInst(CI)) { Modified = true; - I = mergeWrite2Pair(CI); - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI); + CI.setMI(NewMI, *TII, *STM); } - continue; + break; case S_BUFFER_LOAD_IMM: if (findMatchingInst(CI)) { Modified = true; - I = mergeSBufferLoadImmPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 16; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16; } - continue; + break; case BUFFER_LOAD_OFFEN: case BUFFER_LOAD_OFFSET: case BUFFER_LOAD_OFFEN_exact: case BUFFER_LOAD_OFFSET_exact: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferLoadPair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; case BUFFER_STORE_OFFEN: case BUFFER_STORE_OFFSET: case BUFFER_STORE_OFFEN_exact: case BUFFER_STORE_OFFSET_exact: if (findMatchingInst(CI)) { Modified = true; - I = mergeBufferStorePair(CI); - OptimizeAgain |= (CI.Width0 + CI.Width1) < 4; - } else { - ++I; + removeCombinedInst(MergeList, *CI.Paired); + MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI); + CI.setMI(NewMI, *TII, *STM); + OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4; } - continue; + break; } - - ++I; + // Clear the InstsToMove after we have finished searching so we don't have + // stale values left over if we search for this CI again in another pass + // over the block. 
+ CI.InstsToMove.clear(); } return Modified; @@ -1618,10 +1725,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) { bool Modified = false; + for (MachineBasicBlock &MBB : MF) { + std::list > MergeableInsts; + // First pass: Collect list of all instructions we know how to merge. + Modified |= collectMergeableInsts(MBB, MergeableInsts); do { OptimizeAgain = false; - Modified |= optimizeBlock(MBB); + Modified |= optimizeBlock(MergeableInsts); } while (OptimizeAgain); } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 23c357dadde5..f4dd995316dd 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -617,6 +617,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -737,6 +738,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index bb4169788f46..afb2fd987afd 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -137,10 +137,51 @@ struct MUBUFInfo { bool has_soffset; }; +struct MTBUFInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t elements; + bool has_vaddr; + bool has_srsrc; + bool has_soffset; +}; + +#define GET_MTBUFInfoTable_DECL +#define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" +int getMTBUFBaseOpcode(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc); + return Info ? Info->BaseOpcode : -1; +} + +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) { + const MTBUFInfo *Info = getMTBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements); + return Info ? Info->Opcode : -1; +} + +int getMTBUFElements(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->elements : 0; +} + +bool getMTBUFHasVAddr(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_vaddr : false; +} + +bool getMTBUFHasSrsrc(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_srsrc : false; +} + +bool getMTBUFHasSoffset(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_soffset : false; +} + int getMUBUFBaseOpcode(unsigned Opc) { const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc); return Info ? 
Info->BaseOpcode : -1; diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index a578fd2bb6a9..f78dadd447ff 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -263,6 +263,24 @@ struct MIMGInfo { LLVM_READONLY const MIMGInfo *getMIMGInfo(unsigned Opc); +LLVM_READONLY +int getMTBUFBaseOpcode(unsigned Opc); + +LLVM_READONLY +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements); + +LLVM_READONLY +int getMTBUFElements(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasVAddr(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasSrsrc(unsigned Opc); + +LLVM_READONLY +bool getMTBUFHasSoffset(unsigned Opc); + LLVM_READONLY int getMUBUFBaseOpcode(unsigned Opc); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index b3138312150c..45bf67633822 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -13122,7 +13122,8 @@ static SDValue PerformLOADCombine(SDNode *N, // Optimize trunc store (of multiple scalars) to shuffle and store. First, // pack all of the elements in one place. Next, store to memory in fewer // chunks. -SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG) { +static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, + SelectionDAG &DAG) { SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); if (!St->isTruncatingStore() || !VT.isVector()) @@ -13206,7 +13207,8 @@ SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG) { // Try taking a single vector store from an truncate (which would otherwise turn // into an expensive buildvector) and splitting it into a series of narrowing // stores. -SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG) { +static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, + SelectionDAG &DAG) { if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) return SDValue(); SDValue Trunc = St->getValue(); @@ -13696,7 +13698,7 @@ static SDValue PerformShiftCombine(SDNode *N, // Look for a sign/zero extend of a larger than legal load. This can be split // into two extending loads, which are simpler to deal with than an arbitrary // sign extend. 
-SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { +static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); if (N0.getOpcode() != ISD::LOAD) return SDValue(); diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index dd2d4ec118aa..4d2c9dad7099 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -11339,7 +11339,7 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) { SmallVector Opcodes; auto parseOne = [&]() -> bool { - const MCExpr *OE; + const MCExpr *OE = nullptr; SMLoc OpcodeLoc = getLexer().getLoc(); if (check(getLexer().is(AsmToken::EndOfStatement) || Parser.parseExpression(OE), diff --git a/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/lib/Target/BPF/BPFAbstractMemberAccess.cpp index c682c46fe316..870300ab2b25 100644 --- a/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -90,6 +90,13 @@ class BPFAbstractMemberAccess final : public ModulePass { static char ID; BPFAbstractMemberAccess() : ModulePass(ID) {} + struct CallInfo { + uint32_t Kind; + uint32_t AccessIndex; + MDNode *Metadata; + Value *Base; + }; + private: enum : uint32_t { BPFPreserveArrayAI = 1, @@ -99,34 +106,32 @@ class BPFAbstractMemberAccess final : public ModulePass { std::map GEPGlobals; // A map to link preserve_*_access_index instrinsic calls. - std::map> AIChain; + std::map> AIChain; // A map to hold all the base preserve_*_access_index instrinsic calls. // The base call is not an input of any other preserve_*_access_index // intrinsics. - std::map BaseAICalls; + std::map BaseAICalls; bool doTransformation(Module &M); - void traceAICall(CallInst *Call, uint32_t Kind, const MDNode *ParentMeta, - uint32_t ParentAI); - void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind, - const MDNode *ParentMeta, uint32_t ParentAI); - void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind, - const MDNode *ParentMeta, uint32_t ParentAI); + void traceAICall(CallInst *Call, CallInfo &ParentInfo); + void traceBitCast(BitCastInst *BitCast, CallInst *Parent, + CallInfo &ParentInfo); + void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, + CallInfo &ParentInfo); void collectAICallChains(Module &M, Function &F); - bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind, - const MDNode *&TypeMeta, uint32_t &AccessIndex); + bool IsPreserveDIAccessIndexCall(const CallInst *Call, CallInfo &Cinfo); bool IsValidAIChain(const MDNode *ParentMeta, uint32_t ParentAI, const MDNode *ChildMeta); bool removePreserveAccessIndexIntrinsic(Module &M); void replaceWithGEP(std::vector &CallList, uint32_t NumOfZerosIndex, uint32_t DIIndex); - Value *computeBaseAndAccessKey(CallInst *Call, std::string &AccessKey, - uint32_t Kind, MDNode *&BaseMeta); - bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex); - bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind); + Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo, + std::string &AccessKey, MDNode *&BaseMeta); + uint64_t getConstant(const Value *IndexValue); + bool transformGEPChain(Module &M, CallInst *Call, CallInfo &CInfo); }; } // End anonymous namespace @@ -192,9 +197,7 @@ static uint32_t calcArraySize(const DICompositeType *CTy, uint32_t StartDim) { /// Check whether a call is a preserve_*_access_index intrinsic call or not. 
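The BPF changes that follow bundle the values previously threaded through these helpers as loose out-parameters (kind, access index, type metadata, plus the base pointer operand) into the single CallInfo struct declared earlier in this diff. A minimal sketch of how a caller consumes it; the fragment is taken out of context, the ParentCall/ParentInfo names are assumptions for illustration, and the field names are the ones introduced by this patch:

    // Illustrative use of the CallInfo bundle.
    CallInfo CInfo;
    if (IsPreserveDIAccessIndexCall(Call, CInfo)) {
      // CInfo.Kind        : BPFPreserveArrayAI / BPFPreserveUnionAI / BPFPreserveStructAI
      // CInfo.AccessIndex : the constant access-index operand, read via getConstant()
      // CInfo.Metadata    : the llvm.preserve.access.index type metadata node
      // CInfo.Base        : the base pointer operand of the intrinsic call
      AIChain[Call] = std::make_pair(ParentCall, ParentInfo); // as in traceAICall()
    }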
bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, - uint32_t &Kind, - const MDNode *&TypeMeta, - uint32_t &AccessIndex) { + CallInfo &CInfo) { if (!Call) return false; @@ -202,30 +205,30 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call, if (!GV) return false; if (GV->getName().startswith("llvm.preserve.array.access.index")) { - Kind = BPFPreserveArrayAI; - TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index); - if (!TypeMeta) + CInfo.Kind = BPFPreserveArrayAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.array.access.index intrinsic"); - AccessIndex = cast(Call->getArgOperand(2)) - ->getZExtValue(); + CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); + CInfo.Base = Call->getArgOperand(0); return true; } if (GV->getName().startswith("llvm.preserve.union.access.index")) { - Kind = BPFPreserveUnionAI; - TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index); - if (!TypeMeta) + CInfo.Kind = BPFPreserveUnionAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic"); - AccessIndex = cast(Call->getArgOperand(1)) - ->getZExtValue(); + CInfo.AccessIndex = getConstant(Call->getArgOperand(1)); + CInfo.Base = Call->getArgOperand(0); return true; } if (GV->getName().startswith("llvm.preserve.struct.access.index")) { - Kind = BPFPreserveStructAI; - TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index); - if (!TypeMeta) + CInfo.Kind = BPFPreserveStructAI; + CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index); + if (!CInfo.Metadata) report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic"); - AccessIndex = cast(Call->getArgOperand(2)) - ->getZExtValue(); + CInfo.AccessIndex = getConstant(Call->getArgOperand(2)); + CInfo.Base = Call->getArgOperand(0); return true; } @@ -238,8 +241,7 @@ void BPFAbstractMemberAccess::replaceWithGEP(std::vector &CallList, for (auto Call : CallList) { uint32_t Dimension = 1; if (DimensionIndex > 0) - Dimension = cast(Call->getArgOperand(DimensionIndex)) - ->getZExtValue(); + Dimension = getConstant(Call->getArgOperand(DimensionIndex)); Constant *Zero = ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0); @@ -265,16 +267,14 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) { for (auto &BB : F) for (auto &I : BB) { auto *Call = dyn_cast(&I); - uint32_t Kind; - const MDNode *TypeMeta; - uint32_t AccessIndex; - if (!IsPreserveDIAccessIndexCall(Call, Kind, TypeMeta, AccessIndex)) + CallInfo CInfo; + if (!IsPreserveDIAccessIndexCall(Call, CInfo)) continue; Found = true; - if (Kind == BPFPreserveArrayAI) + if (CInfo.Kind == BPFPreserveArrayAI) PreserveArrayIndexCalls.push_back(Call); - else if (Kind == BPFPreserveUnionAI) + else if (CInfo.Kind == BPFPreserveUnionAI) PreserveUnionIndexCalls.push_back(Call); else PreserveStructIndexCalls.push_back(Call); @@ -349,99 +349,94 @@ bool BPFAbstractMemberAccess::IsValidAIChain(const MDNode *ParentType, return dyn_cast(stripQualifiers(Ty)) == CTy; } -void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind, - const MDNode *ParentMeta, - uint32_t ParentAI) { +void BPFAbstractMemberAccess::traceAICall(CallInst *Call, + CallInfo &ParentInfo) { for (User *U : Call->users()) { Instruction 
*Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Call, Kind, ParentMeta, ParentAI); + traceBitCast(BI, Call, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - uint32_t CIKind; - const MDNode *ChildMeta; - uint32_t ChildAI; - if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) && - IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) { - AIChain[CI] = std::make_pair(Call, Kind); - traceAICall(CI, CIKind, ChildMeta, ChildAI); + CallInfo ChildInfo; + + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Call, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Call, Kind, ParentMeta, ParentAI); + traceGEP(GI, Call, ParentInfo); else - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; } else { - BaseAICalls[Call] = Kind; + BaseAICalls[Call] = ParentInfo; } } } void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast, - CallInst *Parent, uint32_t Kind, - const MDNode *ParentMeta, - uint32_t ParentAI) { + CallInst *Parent, + CallInfo &ParentInfo) { for (User *U : BitCast->users()) { Instruction *Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Parent, Kind, ParentMeta, ParentAI); + traceBitCast(BI, Parent, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - uint32_t CIKind; - const MDNode *ChildMeta; - uint32_t ChildAI; - if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) && - IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) { - AIChain[CI] = std::make_pair(Parent, Kind); - traceAICall(CI, CIKind, ChildMeta, ChildAI); + CallInfo ChildInfo; + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Parent, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if (GI->hasAllZeroIndices()) - traceGEP(GI, Parent, Kind, ParentMeta, ParentAI); + traceGEP(GI, Parent, ParentInfo); else - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } } void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent, - uint32_t Kind, const MDNode *ParentMeta, - uint32_t ParentAI) { + CallInfo &ParentInfo) { for (User *U : GEP->users()) { Instruction *Inst = dyn_cast(U); if (!Inst) continue; if (auto *BI = dyn_cast(Inst)) { - traceBitCast(BI, Parent, Kind, ParentMeta, ParentAI); + traceBitCast(BI, Parent, ParentInfo); } else if (auto *CI = dyn_cast(Inst)) { - uint32_t CIKind; - const MDNode *ChildMeta; - uint32_t ChildAI; - if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) && - IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) { - AIChain[CI] = std::make_pair(Parent, Kind); - traceAICall(CI, CIKind, ChildMeta, ChildAI); + CallInfo ChildInfo; + if (IsPreserveDIAccessIndexCall(CI, ChildInfo) && + IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex, + ChildInfo.Metadata)) { + AIChain[CI] = std::make_pair(Parent, ParentInfo); + traceAICall(CI, ChildInfo); } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } else if (auto *GI = dyn_cast(Inst)) { if 
(GI->hasAllZeroIndices()) - traceGEP(GI, Parent, Kind, ParentMeta, ParentAI); + traceGEP(GI, Parent, ParentInfo); else - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } else { - BaseAICalls[Parent] = Kind; + BaseAICalls[Parent] = ParentInfo; } } } @@ -452,44 +447,37 @@ void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) { for (auto &BB : F) for (auto &I : BB) { - uint32_t Kind; - const MDNode *TypeMeta; - uint32_t AccessIndex; + CallInfo CInfo; auto *Call = dyn_cast(&I); - if (!IsPreserveDIAccessIndexCall(Call, Kind, TypeMeta, AccessIndex) || + if (!IsPreserveDIAccessIndexCall(Call, CInfo) || AIChain.find(Call) != AIChain.end()) continue; - traceAICall(Call, Kind, TypeMeta, AccessIndex); + traceAICall(Call, CInfo); } } -/// Get access index from the preserve_*_access_index intrinsic calls. -bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue, - uint64_t &AccessIndex) { +uint64_t BPFAbstractMemberAccess::getConstant(const Value *IndexValue) { const ConstantInt *CV = dyn_cast(IndexValue); - if (!CV) - return false; - - AccessIndex = CV->getValue().getZExtValue(); - return true; + assert(CV); + return CV->getValue().getZExtValue(); } /// Compute the base of the whole preserve_*_access_index chains, i.e., the base /// pointer of the first preserve_*_access_index call, and construct the access /// string, which will be the name of a global variable. Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, + CallInfo &CInfo, std::string &AccessKey, - uint32_t Kind, MDNode *&TypeMeta) { Value *Base = nullptr; std::string TypeName; - std::stack> CallStack; + std::stack> CallStack; // Put the access chain into a stack with the top as the head of the chain. while (Call) { - CallStack.push(std::make_pair(Call, Kind)); - Kind = AIChain[Call].second; + CallStack.push(std::make_pair(Call, CInfo)); + CInfo = AIChain[Call].second; Call = AIChain[Call].first; } @@ -508,14 +496,14 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, while (CallStack.size()) { auto StackElem = CallStack.top(); Call = StackElem.first; - Kind = StackElem.second; + CInfo = StackElem.second; if (!Base) - Base = Call->getArgOperand(0); + Base = CInfo.Base; - MDNode *MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index); - DIType *Ty = stripQualifiers(cast(MDN)); - if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) { + DIType *Ty = stripQualifiers(cast(CInfo.Metadata)); + if (CInfo.Kind == BPFPreserveUnionAI || + CInfo.Kind == BPFPreserveStructAI) { // struct or union type TypeName = Ty->getName(); TypeMeta = Ty; @@ -527,9 +515,7 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, CallStack.pop(); // BPFPreserveArrayAI - uint64_t AccessIndex; - if (!getAccessIndex(Call->getArgOperand(2), AccessIndex)) - return nullptr; + uint64_t AccessIndex = CInfo.AccessIndex; DIType *BaseTy = nullptr; bool CheckElemType = false; @@ -580,18 +566,14 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, // and access key construction. while (CallStack.size()) { auto StackElem = CallStack.top(); - Call = StackElem.first; - Kind = StackElem.second; + CInfo = StackElem.second; CallStack.pop(); // Access Index - uint64_t AccessIndex; - uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ? 
1 : 2; - if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex)) - return nullptr; + uint64_t AccessIndex = CInfo.AccessIndex; AccessKey += ":" + std::to_string(AccessIndex); - MDNode *MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index); + MDNode *MDN = CInfo.Metadata; // At this stage, it cannot be pointer type. auto *CTy = cast(stripQualifiers(cast(MDN))); uint32_t Tag = CTy->getTag(); @@ -615,11 +597,11 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, /// Call/Kind is the base preserve_*_access_index() call. Attempts to do /// transformation to a chain of relocable GEPs. bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call, - uint32_t Kind) { + CallInfo &CInfo) { std::string AccessKey; MDNode *TypeMeta; Value *Base = - computeBaseAndAccessKey(Call, AccessKey, Kind, TypeMeta); + computeBaseAndAccessKey(Call, CInfo, AccessKey, TypeMeta); if (!Base) return false; diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 759a7fdb32b8..142e9cebb79e 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -485,8 +485,11 @@ getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo, assert(MO.isExpr() && "getJumpOffset16OpValue expects only expressions or an immediate"); - // TODO: Push fixup. - return 0; + const MCExpr *Expr = MO.getExpr(); + Mips::Fixups FixupKind = + isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16 : Mips::fixup_Mips_LO16; + Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind))); + return 0; } /// getJumpTargetOpValue - Return binary encoding of the jump diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 2b413d0b97ab..dc013c8ff9a0 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -3571,16 +3571,20 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, } else { // The 32 bit and 64 bit instructions are quite different. if (SpecialShift32) { - // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31). - uint64_t SH = RightShift ? 32 - ShAmt : ShAmt; + // Left shifts use (N, 0, 31-N). + // Right shifts use (32-N, N, 31) if 0 < N < 32. + // use (0, 0, 31) if N == 0. + uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt; uint64_t MB = RightShift ? ShAmt : 0; uint64_t ME = RightShift ? 31 : 31 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB) .addImm(ME); } else { - // Left shifts use (N, 63-N), right shifts use (64-N, N). - uint64_t SH = RightShift ? 64 - ShAmt : ShAmt; + // Left shifts use (N, 63-N). + // Right shifts use (64-N, N) if 0 < N < 64. + // use (0, 0) if N == 0. + uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 64 - ShAmt : ShAmt; uint64_t ME = RightShift ? 
ShAmt : 63 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME); diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td index 032642942f2b..3b73c865ea17 100644 --- a/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/lib/Target/RISCV/RISCVInstrInfoF.td @@ -227,6 +227,12 @@ def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>; def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>; def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>; +// frsr, fssr are obsolete aliases replaced by frcsr, fscsr, so give them +// zero weight. +def : InstAlias<"frsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 0>; +def : InstAlias<"fssr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs), 0>; +def : InstAlias<"fssr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 0>; + def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>; def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>; def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>; diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index b0c03c13fe60..f83a8a984ae0 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -54,6 +54,12 @@ class WebAssemblyDAGToDAGISel final : public SelectionDAGISel { ForCodeSize = MF.getFunction().hasOptSize(); Subtarget = &MF.getSubtarget(); + + // Wasm64 is not fully supported right now (and is not specified) + if (Subtarget->hasAddr64()) + report_fatal_error( + "64-bit WebAssembly (wasm64) is not currently supported"); + return SelectionDAGISel::runOnMachineFunction(MF); } diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 8c837dfb6af5..407c4c8137d3 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -5769,23 +5769,35 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, // Widen the vector if needed. Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - // Move the current value of the bit to be replace to the lsbs. - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getTargetConstant(IdxVal, dl, MVT::i8)); - // Xor with the new bit. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); - // Shift to MSB, filling bottom bits with 0. + + // Clear the upper bits of the subvector and move it to its insert position. unsigned ShiftLeft = NumElems - SubVecNumElems; - Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, - DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); - // Shift to the final position, filling upper bits with 0. + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftLeft, dl, MVT::i8)); unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, - DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); - // Xor with original vector leaving the new value. - Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getTargetConstant(ShiftRight, dl, MVT::i8)); + + // Isolate the bits below the insertion point. 
+ unsigned LowShift = NumElems - IdxVal; + SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low, + DAG.getTargetConstant(LowShift, dl, MVT::i8)); + + // Isolate the bits after the last inserted bit. + unsigned HighShift = IdxVal + SubVecNumElems; + SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High, + DAG.getTargetConstant(HighShift, dl, MVT::i8)); + + // Now OR all 3 pieces together. + Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); + SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); + // Reduce to original width if needed. - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, @@ -10288,6 +10300,7 @@ static bool isTargetShuffleEquivalent(ArrayRef Mask, // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle // mask. +// TODO: Do we need this? It might be better to use Mask+Zeroable directly. static SmallVector createTargetShuffleMask(ArrayRef Mask, const APInt &Zeroable) { int NumElts = Mask.size(); @@ -15440,7 +15453,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute( static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool &ForceV1Zero, bool &ForceV2Zero, - unsigned &ShuffleImm, ArrayRef Mask) { + unsigned &ShuffleImm, ArrayRef Mask, + const APInt &Zeroable) { int NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() == 64 && (NumElts == 2 || NumElts == 4 || NumElts == 8) && @@ -15450,7 +15464,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, bool ZeroLane[2] = { true, true }; for (int i = 0; i < NumElts; ++i) - ZeroLane[i & 1] &= isUndefOrZero(Mask[i]); + ZeroLane[i & 1] &= Zeroable[i]; // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, .. // Mask for V4F64; 0/1, 4/5, 2/3, 6/7.. @@ -15483,19 +15497,17 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2, } static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef Original, + SDValue V2, ArrayRef Mask, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) && "Unexpected data type for VSHUFPD"); - SmallVector Mask = createTargetShuffleMask(Original, Zeroable); - unsigned Immediate = 0; bool ForceV1Zero = false, ForceV2Zero = false; if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate, - Mask)) + Mask, Zeroable)) return SDValue(); // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs. @@ -15508,6 +15520,42 @@ static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1, DAG.getTargetConstant(Immediate, DL, MVT::i8)); } +// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed +// by zeroable elements in the remaining 24 elements. Turn this into two +// vmovqb instructions shuffled together. +static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT, + SDValue V1, SDValue V2, + ArrayRef Mask, + const APInt &Zeroable, + SelectionDAG &DAG) { + assert(VT == MVT::v32i8 && "Unexpected type!"); + + // The first 8 indices should be every 8th element. 
+ if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8)) + return SDValue(); + + // Remaining elements need to be zeroable. + if (Zeroable.countLeadingOnes() < (Mask.size() - 8)) + return SDValue(); + + V1 = DAG.getBitcast(MVT::v4i64, V1); + V2 = DAG.getBitcast(MVT::v4i64, V2); + + V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1); + V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2); + + // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in + // the upper bits of the result using an unpckldq. + SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, + { 0, 1, 2, 3, 16, 17, 18, 19, + 4, 5, 6, 7, 20, 21, 22, 23 }); + // Insert the unpckldq into a zero vector to widen to v32i8. + return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8, + DAG.getConstant(0, DL, MVT::v32i8), Unpack, + DAG.getIntPtrConstant(0, DL)); +} + + /// Handle lowering of 4-lane 64-bit floating point shuffles. /// /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2 @@ -16108,6 +16156,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef Mask, DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget)) return V; + // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed + // by zeroable elements in the remaining 24 elements. Turn this into two + // vmovqb instructions shuffled together. + if (Subtarget.hasVLX()) + if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2, + Mask, Zeroable, DAG)) + return V; + // Otherwise fall back on generic lowering. return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); @@ -32091,7 +32147,7 @@ static bool matchBinaryPermuteShuffle( (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) { bool ForceV1Zero = false, ForceV2Zero = false; if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero, - PermuteImm, Mask)) { + PermuteImm, Mask, Zeroable)) { V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1; V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2; Shuffle = X86ISD::SHUFP; @@ -35403,6 +35459,21 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, DAG.getBitcast(MVT::i16, N0.getOperand(0))); + // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT + // determines // the number of bits loaded. Remaining bits are zero. + if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() && + VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) { + auto *BCast = cast(N0); + SDVTList Tys = DAG.getVTList(VT, MVT::Other); + SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() }; + SDValue ResNode = + DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops, + VT.getVectorElementType(), + BCast->getMemOperand()); + DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1)); + return ResNode; + } + // Since MMX types are special and don't usually play with other vector types, // it's better to handle them early to be sure we emit efficient code by // avoiding store-load conversions. 
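For the insert1BitVector change a few hunks above: instead of XOR-splicing the subvector into the destination mask, the new code builds three disjoint pieces (the bits below the insertion point, the shifted subvector, and the bits above it) and ORs them together, which is what the two ISD::OR nodes express. The same idea on an ordinary 64-bit integer, as a rough model of the KSHIFTL/KSHIFTR pairs rather than the actual DAG code:

    #include <cstdint>

    // Rough scalar model of the new k-mask splice (illustrative only).
    // Assumes the mask occupies a full 64-bit word (NumElems == 64) and that
    // 0 < SubBits and Idx + SubBits <= 64.
    uint64_t spliceMaskBits(uint64_t Vec, uint64_t Sub, unsigned SubBits,
                            unsigned Idx) {
      // KSHIFTL/KSHIFTR pair: clear Sub's high garbage, land it at bit Idx.
      Sub = (Sub << (64 - SubBits)) >> (64 - SubBits - Idx);
      // Bits strictly below the insertion point.
      uint64_t Low = Idx ? (Vec << (64 - Idx)) >> (64 - Idx) : 0;
      // Bits at and above the end of the inserted range.
      unsigned Hi = Idx + SubBits;
      uint64_t High = Hi < 64 ? (Vec >> Hi) << Hi : 0;
      // OR the three disjoint pieces, mirroring the two ISD::OR nodes.
      return Low | High | Sub;
    }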
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 2d3b8a556816..490d0c3048ff 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -5060,22 +5060,11 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; - - def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), (EXTRACT_SUBREG - (VPMULLQZrr - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), - sub_xmm)>; -} - -// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. -let Predicates = [HasDQI, NoVLX] in { - def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), - (EXTRACT_SUBREG - (VPMULLQZrr + (VPMULLQZrmb (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + addr:$src2), sub_ymm)>; def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), @@ -5084,29 +5073,47 @@ let Predicates = [HasDQI, NoVLX] in { (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (VPMULLQZrmb + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } -multiclass avx512_min_max_lowering { +multiclass avx512_min_max_lowering { def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), sub_ymm)>; + def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + addr:$src2), + sub_ymm)>; def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)), (EXTRACT_SUBREG - (Instr + (!cast(Instr#"rr") (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), sub_xmm)>; + def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))), + (EXTRACT_SUBREG + (!cast(Instr#"rmb") + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + addr:$src2), + sub_xmm)>; } let Predicates = [HasAVX512, NoVLX] in { - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; - defm : avx512_min_max_lowering; + defm : avx512_min_max_lowering<"VPMAXUQZ", umax>; + defm : avx512_min_max_lowering<"VPMINUQZ", umin>; + defm : avx512_min_max_lowering<"VPMAXSQZ", smax>; + defm : avx512_min_max_lowering<"VPMINSQZ", smin>; } //===----------------------------------------------------------------------===// @@ -5282,22 +5289,17 @@ multiclass avx512_logical_lowering_bcast { // Register-broadcast logical operations. 
- def : Pat<(IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT (_.BroadcastLdFrag addr:$src2))))), - (!cast(InstrStr#rmb) _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (_.BroadcastLdFrag addr:$src2)))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.RC:$src0)), (!cast(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (bitconvert (IntInfo.VT (OpNode _.RC:$src1, - (bitconvert (_.VT - (_.BroadcastLdFrag addr:$src2)))))), + (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))), _.ImmAllZerosV)), (!cast(InstrStr#rmbkz) _.KRCWM:$mask, _.RC:$src1, addr:$src2)>; @@ -11488,102 +11490,6 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU, defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU, avx512vl_i64_info>, VEX_W; -// Patterns to fold bitcasted FP broadcasts. -// FIXME: Need better DAG canonicalization. -let Predicates = [HasVLX] in { - def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - VR128X:$src2, (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - VR128X:$src2, (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - VR256X:$src2, (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - VR256X:$src2, (i8 
timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; -} - -let Predicates = [HasAVX512] in { - def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - VR512:$src2, VR512:$src1, (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - VR512:$src2, (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - VR512:$src2, VR512:$src1, (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - VR512:$src2, (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; -} - // Patterns to use VPTERNLOG for vXi16/vXi8 vectors. let Predicates = [HasVLX] in { def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, diff --git a/lib/Transforms/IPO/Attributor.cpp b/lib/Transforms/IPO/Attributor.cpp index 1455a906103a..58ce91c807dd 100644 --- a/lib/Transforms/IPO/Attributor.cpp +++ b/lib/Transforms/IPO/Attributor.cpp @@ -2467,7 +2467,7 @@ struct AAAlignImpl : AAAlign { if (SI->getAlignment() < getAssumedAlign()) { STATS_DECLTRACK(AAAlign, Store, "Number of times alignemnt added to a store"); - SI->setAlignment(getAssumedAlign()); + SI->setAlignment(Align(getAssumedAlign())); Changed = ChangeStatus::CHANGED; } } else if (auto *LI = dyn_cast(U.getUser())) { diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 9c7fd5e1a813..feac1b608848 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -2285,14 +2285,10 @@ OptimizeFunctions(Module &M, // So, remove unreachable blocks from the function, because a) there's // no point in analyzing them and b) GlobalOpt should otherwise grow // some more complicated logic to break these cycles. - // Removing unreachable blocks might invalidate the dominator so we - // recalculate it. 
if (!F->isDeclaration()) { - if (removeUnreachableBlocks(*F)) { - auto &DT = LookupDomTree(*F); - DT.recalculate(*F); - Changed = true; - } + auto &DT = LookupDomTree(*F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + Changed |= removeUnreachableBlocks(*F, &DTU); } Changed |= processGlobal(*F, GetTLI, LookupDomTree); diff --git a/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index 2cfd3f5bb17f..825f4b468b0a 100644 --- a/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -124,7 +124,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { auto *SI = new StoreInst(RMWI.getValOperand(), RMWI.getPointerOperand(), &RMWI); SI->setAtomic(Ordering, RMWI.getSyncScopeID()); - SI->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + SI->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return eraseInstFromFunction(RMWI); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index bc458ebf6521..8d4b0dc0a7a7 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -185,7 +185,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); LoadInst *L = Builder.CreateLoad(IntType, Src); // Alignment from the mem intrinsic will be better, so use it. - L->setAlignment(MaybeAlign(CopySrcAlign)); + L->setAlignment( + MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) L->setMetadata(LLVMContext::MD_tbaa, CopyMD); MDNode *LoopMemParallelMD = @@ -198,7 +199,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. - S->setAlignment(CopyDstAlign); + S->setAlignment( + MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) S->setMetadata(LLVMContext::MD_tbaa, CopyMD); if (LoopMemParallelMD) @@ -223,9 +225,10 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { } Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); - if (MI->getDestAlignment() < Alignment) { - MI->setDestAlignment(Alignment); + const unsigned KnownAlignment = + getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); + if (MI->getDestAlignment() < KnownAlignment) { + MI->setDestAlignment(KnownAlignment); return MI; } @@ -243,13 +246,9 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { ConstantInt *FillC = dyn_cast(MI->getValue()); if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) return nullptr; - uint64_t Len = LenC->getLimitedValue(); - Alignment = MI->getDestAlignment(); + const uint64_t Len = LenC->getLimitedValue(); assert(Len && "0-sized memory setting should be removed already."); - - // Alignment 0 is identity for alignment 1 for memset, but not store. 
- if (Alignment == 0) - Alignment = 1; + const Align Alignment = assumeAligned(MI->getDestAlignment()); // If it is an atomic and alignment is less than the size then we will // introduce the unaligned memory access which will be later transformed diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 673099436b79..dcdbee15fe56 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -351,6 +351,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); + Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr); Instruction *visitAShr(BinaryOperator &I); Instruction *visitLShr(BinaryOperator &I); Instruction *commonShiftTransforms(BinaryOperator &I); diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index eb01b4b7d7d1..4c5e1cc43760 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1356,15 +1356,15 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return eraseInstFromFunction(SI); // Attempt to improve the alignment. - unsigned KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT); - unsigned StoreAlign = SI.getAlignment(); - unsigned EffectiveStoreAlign = - StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType()); + const Align KnownAlign = Align(getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT)); + const MaybeAlign StoreAlign = MaybeAlign(SI.getAlignment()); + const Align EffectiveStoreAlign = + StoreAlign ? *StoreAlign : Align(DL.getABITypeAlignment(Val->getType())); if (KnownAlign > EffectiveStoreAlign) SI.setAlignment(KnownAlign); - else if (StoreAlign == 0) + else if (!StoreAlign) SI.setAlignment(EffectiveStoreAlign); // Try to canonicalize the stored type. diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index bc4affbecdfa..9d96ddc4040d 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -1039,6 +1039,75 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr) { + assert(OldAShr.getOpcode() == Instruction::AShr && + "Must be called with arithmetic right-shift instruction only."); + + // Check that constant C is a splat of the element-wise bitwidth of V. 
+ auto BitWidthSplat = [](Constant *C, Value *V) { + return match( + C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + V->getType()->getScalarSizeInBits()))); + }; + + // It should look like variable-length sign-extension on the outside: + // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits) + Value *NBits; + Instruction *MaybeTrunc; + Constant *C1, *C2; + if (!match(&OldAShr, + m_AShr(m_Shl(m_Instruction(MaybeTrunc), + m_ZExtOrSelf(m_Sub(m_Constant(C1), + m_ZExtOrSelf(m_Value(NBits))))), + m_ZExtOrSelf(m_Sub(m_Constant(C2), + m_ZExtOrSelf(m_Deferred(NBits)))))) || + !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr)) + return nullptr; + + // There may or may not be a truncation after outer two shifts. + Instruction *HighBitExtract; + match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract))); + bool HadTrunc = MaybeTrunc != HighBitExtract; + + // And finally, the innermost part of the pattern must be a right-shift. + Value *X, *NumLowBitsToSkip; + if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip)))) + return nullptr; + + // Said right-shift must extract high NBits bits - C0 must be it's bitwidth. + Constant *C0; + if (!match(NumLowBitsToSkip, + m_ZExtOrSelf( + m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) || + !BitWidthSplat(C0, HighBitExtract)) + return nullptr; + + // Since the NBits is identical for all shifts, if the outermost and + // innermost shifts are identical, then outermost shifts are redundant. + // If we had truncation, do keep it though. + if (HighBitExtract->getOpcode() == OldAShr.getOpcode()) + return replaceInstUsesWith(OldAShr, MaybeTrunc); + + // Else, if there was a truncation, then we need to ensure that one + // instruction will go away. + if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Finally, bypass two innermost shifts, and perform the outermost shift on + // the operands of the innermost shift. + Instruction *NewAShr = + BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip); + NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); +} + Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) @@ -1113,6 +1182,9 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { } } + if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I)) + return R; + // See if we can turn a signed shr into an unsigned shr. 
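For reference, the fold registered above targets a variable-width sign-extension wrapped around a variable high-bit extract. A hedged scalar illustration in C++ (assuming a 32-bit two's-complement int, 1 <= n <= 31, and arithmetic right shift for signed values; this is only the source-level shape, not the IR pattern matcher itself):

    #include <cstdint>

    // Naive form: extract the top n bits of x, then sign-extend them with
    // an outer shl + ashr pair.
    static int32_t signExtendTopBitsNaive(int32_t x, unsigned n) {
      int32_t hi = x >> (32 - n);                  // inner ashr: keep top n bits
      uint32_t widened = (uint32_t)hi << (32 - n); // outer shl
      return (int32_t)widened >> (32 - n);         // outer ashr
    }

    // Folded form: the inner ashr already produced a sign-extended value,
    // so when the inner and outer shifts are both ashr the outer pair is
    // redundant.
    static int32_t signExtendTopBitsFolded(int32_t x, unsigned n) {
      return x >> (32 - n);
    }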
if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I)) return BinaryOperator::CreateLShr(Op0, Op1); diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 78b697f7f940..22190ad7a0ae 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -14,6 +14,7 @@ add_llvm_library(LLVMInstrumentation PGOMemOPSizeOpt.cpp PoisonChecking.cpp SanitizerCoverage.cpp + ValueProfileCollector.cpp ThreadSanitizer.cpp HWAddressSanitizer.cpp diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 81bd2f3c18ac..f9354069da32 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2562,6 +2562,11 @@ struct MemorySanitizerVisitor : public InstVisitor { return false; } + void handleInvariantGroup(IntrinsicInst &I) { + setShadow(&I, getShadow(&I, 0)); + setOrigin(&I, getOrigin(&I, 0)); + } + void handleLifetimeStart(IntrinsicInst &I) { if (!PoisonStack) return; @@ -2993,6 +2998,10 @@ struct MemorySanitizerVisitor : public InstVisitor { case Intrinsic::lifetime_start: handleLifetimeStart(I); break; + case Intrinsic::launder_invariant_group: + case Intrinsic::strip_invariant_group: + handleInvariantGroup(I); + break; case Intrinsic::bswap: handleBswap(I); break; diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index e776d59cccb5..3862f19ab7ab 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -48,6 +48,7 @@ //===----------------------------------------------------------------------===// #include "CFGMST.h" +#include "ValueProfileCollector.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -61,7 +62,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/IndirectCallVisitor.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -121,6 +121,7 @@ using namespace llvm; using ProfileCount = Function::ProfileCount; +using VPCandidateInfo = ValueProfileCollector::CandidateInfo; #define DEBUG_TYPE "pgo-instrumentation" @@ -287,6 +288,11 @@ static std::string getBranchCondString(Instruction *TI) { return result; } +static const char *ValueProfKindDescr[] = { +#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, +#include "llvm/ProfileData/InstrProfData.inc" +}; + namespace { /// The select instruction visitor plays three roles specified @@ -349,50 +355,6 @@ struct SelectInstVisitor : public InstVisitor { unsigned getNumOfSelectInsts() const { return NSIs; } }; -/// Instruction Visitor class to visit memory intrinsic calls. -struct MemIntrinsicVisitor : public InstVisitor { - Function &F; - unsigned NMemIs = 0; // Number of memIntrinsics instrumented. - VisitMode Mode = VM_counting; // Visiting mode. - unsigned CurCtrId = 0; // Current counter index. 
- unsigned TotalNumCtrs = 0; // Total number of counters - GlobalVariable *FuncNameVar = nullptr; - uint64_t FuncHash = 0; - PGOUseFunc *UseFunc = nullptr; - std::vector Candidates; - - MemIntrinsicVisitor(Function &Func) : F(Func) {} - - void countMemIntrinsics(Function &Func) { - NMemIs = 0; - Mode = VM_counting; - visit(Func); - } - - void instrumentMemIntrinsics(Function &Func, unsigned TotalNC, - GlobalVariable *FNV, uint64_t FHash) { - Mode = VM_instrument; - TotalNumCtrs = TotalNC; - FuncHash = FHash; - FuncNameVar = FNV; - visit(Func); - } - - std::vector findMemIntrinsics(Function &Func) { - Candidates.clear(); - Mode = VM_annotate; - visit(Func); - return Candidates; - } - - // Visit the IR stream and annotate all mem intrinsic call instructions. - void instrumentOneMemIntrinsic(MemIntrinsic &MI); - - // Visit \p MI instruction and perform tasks according to visit mode. - void visitMemIntrinsic(MemIntrinsic &SI); - - unsigned getNumOfMemIntrinsics() const { return NMemIs; } -}; class PGOInstrumentationGenLegacyPass : public ModulePass { public: @@ -564,13 +526,14 @@ template class FuncPGOInstrumentation { // A map that stores the Comdat group in function F. std::unordered_multimap &ComdatMembers; + ValueProfileCollector VPC; + void computeCFGHash(); void renameComdatFunction(); public: - std::vector> ValueSites; + std::vector> ValueSites; SelectInstVisitor SIVisitor; - MemIntrinsicVisitor MIVisitor; std::string FuncName; GlobalVariable *FuncNameVar; @@ -605,23 +568,21 @@ template class FuncPGOInstrumentation { std::unordered_multimap &ComdatMembers, bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr, bool IsCS = false) - : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), - ValueSites(IPVK_Last + 1), SIVisitor(Func), MIVisitor(Func), - MST(F, BPI, BFI) { + : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func), + ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) { // This should be done before CFG hash computation. 
SIVisitor.countSelects(Func); - MIVisitor.countMemIntrinsics(Func); + ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize); if (!IsCS) { NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfPGOBB += MST.BBInfos.size(); - ValueSites[IPVK_IndirectCallTarget] = findIndirectCalls(Func); + ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget); } else { NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfCSPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfCSPGOBB += MST.BBInfos.size(); } - ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func); FuncName = getPGOFuncName(F); computeCFGHash(); @@ -875,28 +836,36 @@ static void instrumentOneFunc( if (DisableValueProfiling) return; - unsigned NumIndirectCalls = 0; - for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) { - CallSite CS(I); - Value *Callee = CS.getCalledValue(); - LLVM_DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = " - << NumIndirectCalls << "\n"); - IRBuilder<> Builder(I); - assert(Builder.GetInsertPoint() != I->getParent()->end() && - "Cannot get the Instrumentation point"); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), - Builder.getInt64(FuncInfo.FunctionHash), - Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()), - Builder.getInt32(IPVK_IndirectCallTarget), - Builder.getInt32(NumIndirectCalls++)}); - } - NumOfPGOICall += NumIndirectCalls; + NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size(); + + // For each VP Kind, walk the VP candidates and instrument each one. + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { + unsigned SiteIndex = 0; + if (Kind == IPVK_MemOPSize && !PGOInstrMemOP) + continue; - // Now instrument memop intrinsic calls. 
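With MemIntrinsicVisitor gone, both memop-size and indirect-call sites now come from the ValueProfileCollector added later in this patch. A hedged usage sketch of that interface (the helper name countMemOpSites is hypothetical):

    #include "ValueProfileCollector.h"
    using namespace llvm;

    // Ask the collector for candidates of one value-profiling kind instead
    // of running a dedicated instruction visitor.
    static size_t countMemOpSites(Function &F) {
      ValueProfileCollector VPC(F);
      std::vector<ValueProfileCollector::CandidateInfo> Sites =
          VPC.get(IPVK_MemOPSize);
      for (const auto &C : Sites) {
        (void)C.V;             // the value to be profiled (the memop length)
        (void)C.InsertPt;      // where the value-profile call is inserted
        (void)C.AnnotatedInst; // instruction that will carry the !prof data
      }
      return Sites.size();
    }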
- FuncInfo.MIVisitor.instrumentMemIntrinsics( - F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash); + for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) { + LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind] + << " site: CallSite Index = " << SiteIndex << "\n"); + + IRBuilder<> Builder(Cand.InsertPt); + assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() && + "Cannot get the Instrumentation point"); + + Value *ToProfile = nullptr; + if (Cand.V->getType()->isIntegerTy()) + ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty()); + else if (Cand.V->getType()->isPointerTy()) + ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); + assert(ToProfile && "value profiling Value is of unexpected type"); + + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), + {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), ToProfile, + Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}); + } + } // IPVK_First <= Kind <= IPVK_Last } namespace { @@ -1429,43 +1398,6 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) { llvm_unreachable("Unknown visiting mode"); } -void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) { - Module *M = F.getParent(); - IRBuilder<> Builder(&MI); - Type *Int64Ty = Builder.getInt64Ty(); - Type *I8PtrTy = Builder.getInt8PtrTy(); - Value *Length = MI.getLength(); - assert(!isa(Length)); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), - Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty), - Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)}); - ++CurCtrId; -} - -void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) { - if (!PGOInstrMemOP) - return; - Value *Length = MI.getLength(); - // Not instrument constant length calls. - if (dyn_cast(Length)) - return; - - switch (Mode) { - case VM_counting: - NMemIs++; - return; - case VM_instrument: - instrumentOneMemIntrinsic(MI); - return; - case VM_annotate: - Candidates.push_back(&MI); - return; - } - llvm_unreachable("Unknown visiting mode"); -} - // Traverse all valuesites and annotate the instructions for all value kind. void PGOUseFunc::annotateValueSites() { if (DisableValueProfiling) @@ -1478,11 +1410,6 @@ void PGOUseFunc::annotateValueSites() { annotateValueSites(Kind); } -static const char *ValueProfKindDescr[] = { -#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, -#include "llvm/ProfileData/InstrProfData.inc" -}; - // Annotate the instructions for a specific value kind. void PGOUseFunc::annotateValueSites(uint32_t Kind) { assert(Kind <= IPVK_Last); @@ -1501,11 +1428,11 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) { return; } - for (auto &I : ValueSites) { + for (VPCandidateInfo &I : ValueSites) { LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind << "): Index = " << ValueSiteIndex << " out of " << NumValueSites << "\n"); - annotateValueSite(*M, *I, ProfileRecord, + annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord, static_cast(Kind), ValueSiteIndex, Kind == IPVK_MemOPSize ? 
MaxNumMemOPAnnotations : MaxNumAnnotations); diff --git a/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/lib/Transforms/Instrumentation/ValueProfileCollector.cpp new file mode 100644 index 000000000000..604726d4f40f --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -0,0 +1,78 @@ +//===- ValueProfileCollector.cpp - determine what to value profile --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl +// +//===----------------------------------------------------------------------===// + +#include "ValueProfilePlugins.inc" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" + +#include + +using namespace llvm; + +namespace { + +/// A plugin-based class that takes an arbitrary number of Plugin types. +/// Each plugin type must satisfy the following API: +/// 1) the constructor must take a `Function &f`. Typically, the plugin would +/// scan the function looking for candidates. +/// 2) contain a member function with the following signature and name: +/// void run(std::vector &Candidates); +/// such that the plugin would append its result into the vector parameter. +/// +/// Plugins are defined in ValueProfilePlugins.inc +template class PluginChain; + +/// The type PluginChainFinal is the final chain of plugins that will be used by +/// ValueProfileCollectorImpl. +using PluginChainFinal = PluginChain; + +template <> class PluginChain<> { +public: + PluginChain(Function &F) {} + void get(InstrProfValueKind K, std::vector &Candidates) {} +}; + +template +class PluginChain : public PluginChain { + PluginT Plugin; + using Base = PluginChain; + +public: + PluginChain(Function &F) : PluginChain(F), Plugin(F) {} + + void get(InstrProfValueKind K, std::vector &Candidates) { + if (K == PluginT::Kind) + Plugin.run(Candidates); + Base::get(K, Candidates); + } +}; + +} // end anonymous namespace + +/// ValueProfileCollectorImpl inherits the API of PluginChainFinal. +class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal { +public: + using PluginChainFinal::PluginChainFinal; +}; + +ValueProfileCollector::ValueProfileCollector(Function &F) + : PImpl(new ValueProfileCollectorImpl(F)) {} + +ValueProfileCollector::~ValueProfileCollector() = default; + +std::vector +ValueProfileCollector::get(InstrProfValueKind Kind) const { + std::vector Result; + PImpl->get(Kind, Result); + return Result; +} diff --git a/lib/Transforms/Instrumentation/ValueProfileCollector.h b/lib/Transforms/Instrumentation/ValueProfileCollector.h new file mode 100644 index 000000000000..ff883c8d0c77 --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfileCollector.h @@ -0,0 +1,79 @@ +//===- ValueProfileCollector.h - determine what to value profile ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a utility class, ValueProfileCollector, that is used to +// determine what kind of llvm::Value's are worth value-profiling, at which +// point in the program, and which instruction holds the Value Profile metadata. +// Currently, the only users of this utility is the PGOInstrumentation[Gen|Use] +// passes. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H +#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" + +namespace llvm { + +/// Utility analysis that determines what values are worth profiling. +/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to +/// populate the Candidates vector. +/// +/// Value profiling an expression means to track the values that this expression +/// takes at runtime and the frequency of each value. +/// It is important to distinguish between two sets of value profiles for a +/// particular expression: +/// 1) The set of values at the point of evaluation. +/// 2) The set of values at the point of use. +/// In some cases, the two sets are identical, but it's not unusual for the two +/// to differ. +/// +/// To elaborate more, consider this C code, and focus on the expression `nn`: +/// void foo(int nn, bool b) { +/// if (b) memcpy(x, y, nn); +/// } +/// The point of evaluation can be as early as the start of the function, and +/// let's say the value profile for `nn` is: +/// total=100; (value,freq) set = {(8,10), (32,50)} +/// The point of use is right before we call memcpy, and since we execute the +/// memcpy conditionally, the value profile of `nn` can be: +/// total=15; (value,freq) set = {(8,10), (4,5)} +/// +/// For this reason, a plugin is responsible for computing the insertion point +/// for each value to be profiled. The `CandidateInfo` structure encapsulates +/// all the information needed for each value profile site. +class ValueProfileCollector { +public: + struct CandidateInfo { + Value *V; // The value to profile. + Instruction *InsertPt; // Insert the VP lib call before this instr. + Instruction *AnnotatedInst; // Where metadata is attached. + }; + + ValueProfileCollector(Function &Fn); + ValueProfileCollector(ValueProfileCollector &&) = delete; + ValueProfileCollector &operator=(ValueProfileCollector &&) = delete; + + ValueProfileCollector(const ValueProfileCollector &) = delete; + ValueProfileCollector &operator=(const ValueProfileCollector &) = delete; + ~ValueProfileCollector(); + + /// returns a list of value profiling candidates of the given kind + std::vector get(InstrProfValueKind Kind) const; + +private: + class ValueProfileCollectorImpl; + std::unique_ptr PImpl; +}; + +} // namespace llvm + +#endif diff --git a/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/lib/Transforms/Instrumentation/ValueProfilePlugins.inc new file mode 100644 index 000000000000..4cc4c6c848c3 --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -0,0 +1,75 @@ +//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a set of plugin classes used in ValueProfileCollectorImpl. +// Each plugin is responsible for collecting Value Profiling candidates for a +// particular optimization. +// Each plugin must satisfy the interface described in ValueProfileCollector.cpp +// +//===----------------------------------------------------------------------===// + +#include "ValueProfileCollector.h" +#include "llvm/Analysis/IndirectCallVisitor.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; +using CandidateInfo = ValueProfileCollector::CandidateInfo; + +///--------------------------- MemIntrinsicPlugin ------------------------------ +class MemIntrinsicPlugin : public InstVisitor { + Function &F; + std::vector *Candidates; + +public: + static constexpr InstrProfValueKind Kind = IPVK_MemOPSize; + + MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {} + + void run(std::vector &Cs) { + Candidates = &Cs; + visit(F); + Candidates = nullptr; + } + void visitMemIntrinsic(MemIntrinsic &MI) { + Value *Length = MI.getLength(); + // Not instrument constant length calls. + if (dyn_cast(Length)) + return; + + Instruction *InsertPt = &MI; + Instruction *AnnotatedInst = &MI; + Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst}); + } +}; + +///------------------------ IndirectCallPromotionPlugin ------------------------ +class IndirectCallPromotionPlugin { + Function &F; + +public: + static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget; + + IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {} + + void run(std::vector &Candidates) { + std::vector Result = findIndirectCalls(F); + for (Instruction *I : Result) { + Value *Callee = CallSite(I).getCalledValue(); + Instruction *InsertPt = I; + Instruction *AnnotatedInst = I; + Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst}); + } + } +}; + +///----------------------- Registration of the plugins ------------------------- +/// For now, registering a plugin with the ValueProfileCollector is done by +/// adding the plugin type to the VP_PLUGIN_LIST macro. 
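The VP_PLUGIN_LIST definition follows. As an illustration of the interface a plugin must provide (a constructor taking Function&, a static Kind, and run(std::vector<CandidateInfo>&)), here is a hypothetical extra plugin; it is not part of the patch and reuses an existing value kind purely to stay self-contained, whereas a real plugin would come with its own InstrProfValueKind:

    #include "ValueProfileCollector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/InstIterator.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;
    using CandidateInfo = ValueProfileCollector::CandidateInfo;

    // Hypothetical plugin: report dynamic (non-constant) alloca sizes.
    class DynamicAllocaSizePlugin {
      Function &F;

    public:
      static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;

      DynamicAllocaSizePlugin(Function &Fn) : F(Fn) {}

      void run(std::vector<CandidateInfo> &Candidates) {
        for (Instruction &I : instructions(F))
          if (auto *AI = dyn_cast<AllocaInst>(&I))
            if (!isa<ConstantInt>(AI->getArraySize()))
              Candidates.emplace_back(
                  CandidateInfo{AI->getArraySize(), AI, AI});
      }
    };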
+#define VP_PLUGIN_LIST \ + MemIntrinsicPlugin, \ + IndirectCallPromotionPlugin diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 2d135b41279f..0e9f03a06061 100644 --- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -329,7 +329,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlignment()) { - SI->setAlignment(NewAlignment); + SI->setAlignment(MaybeAlign(NewAlignment)); ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast(J)) { diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp index 79b901ac0db8..1f01ba2fbfc6 100644 --- a/lib/Transforms/Scalar/GVNHoist.cpp +++ b/lib/Transforms/Scalar/GVNHoist.cpp @@ -894,8 +894,8 @@ class GVNHoist { ++NumLoadsRemoved; } else if (auto *ReplacementStore = dyn_cast(Repl)) { ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast(I)->getAlignment())); + MaybeAlign(std::min(ReplacementStore->getAlignment(), + cast(I)->getAlignment()))); ++NumStoresRemoved; } else if (auto *ReplacementAlloca = dyn_cast(Repl)) { ReplacementAlloca->setAlignment( diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 5c2176f873bc..2e13e8e4150d 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -1790,7 +1790,7 @@ class LoopPromoter : public LoadAndStorePromoter { StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); if (UnorderedAtomic) NewSI->setOrdering(AtomicOrdering::Unordered); - NewSI->setAlignment(Alignment); + NewSI->setAlignment(MaybeAlign(Alignment)); NewSI->setDebugLoc(DL); if (AATags) NewSI->setAAMetadata(AATags); diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index cec65ba76eda..c1e935fda7f8 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -3127,7 +3127,7 @@ class llvm::sroa::AllocaSliceRewriter Value *Op = SI->getOperand(0); StoreAlign = DL.getABITypeAlignment(Op->getType()); } - SI->setAlignment(std::min(StoreAlign, getSliceAlign())); + SI->setAlignment(MaybeAlign(std::min(StoreAlign, getSliceAlign()))); continue; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 3a6b7a6a6555..94339c2ba00f 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -2246,9 +2246,13 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, } BB->dropAllReferences(); if (DTU) { - // Remove the terminator of BB to clear the successor list of BB. - if (BB->getTerminator()) - BB->getInstList().pop_back(); + Instruction *TI = BB->getTerminator(); + assert(TI && "Basic block should have a terminator"); + // Terminators like invoke can have users. We have to replace their users, + // before removing them. 
+ if (!TI->use_empty()) + TI->replaceAllUsesWith(UndefValue::get(TI->getType())); + TI->eraseFromParent(); new UnreachableInst(BB->getContext(), BB); assert(succ_empty(BB) && "The successor list of BB isn't empty before " "applying corresponding DTU updates."); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 7352ce83adb4..279a844f9e44 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3087,15 +3087,15 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, // store that doesn't execute. if (MinAlignment != 0) { // Choose the minimum of all non-zero alignments. - SI->setAlignment(MinAlignment); + SI->setAlignment(Align(MinAlignment)); } else if (MaxAlignment != 0) { // Choose the minimal alignment between the non-zero alignment and the ABI // default alignment for the type of the stored value. - SI->setAlignment(std::min(MaxAlignment, TypeAlignment)); + SI->setAlignment(Align(std::min(MaxAlignment, TypeAlignment))); } else { // If both alignments are zero, use ABI default alignment for the type of // the stored value. - SI->setAlignment(TypeAlignment); + SI->setAlignment(Align(TypeAlignment)); } QStore->eraseFromParent(); diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 3af754a3eb03..1fb4f28f3364 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -2792,6 +2792,12 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) { + // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) + return B.CreateMemMove(CI->getArgOperand(1), 1, CI->getArgOperand(0), 1, + CI->getArgOperand(2)); +} + bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { LibFunc Func; SmallString<20> FloatFuncName = FuncName; @@ -2870,6 +2876,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeRealloc(CI, Builder); case LibFunc_wcslen: return optimizeWcslen(CI, Builder); + case LibFunc_bcopy: + return optimizeBCopy(CI, Builder); default: break; } diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index 196dbe12b876..99428c6c5dee 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4024,7 +4024,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { if (!Alignment) Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType()); - ST->setAlignment(Alignment); + ST->setAlignment(Align(Alignment)); Value *V = propagateMetadata(ST, E->Scalars); if (NeedToShuffleReuses) { V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), diff --git a/test/Analysis/MemorySSA/pr43427.ll b/test/Analysis/MemorySSA/pr43427.ll new file mode 100644 index 000000000000..f70887822171 --- /dev/null +++ b/test/Analysis/MemorySSA/pr43427.ll @@ -0,0 +1,42 @@ +; RUN: opt -disable-output -licm -print-memoryssa -enable-mssa-loop-dependency=true < %s 2>&1 | FileCheck %s + +; CHECK-LABEL: @f() +; CHECK: 8 = MemoryPhi( +; CHECK: 7 = MemoryPhi( +; CHECK: 9 = MemoryPhi( +define void @f() { +entry: + %e = alloca i16, align 1 + br label %lbl1 + +lbl1: ; preds = %if.else, %cleanup, %entry + store i16 undef, i16* %e, align 1 + call void @g() + br i1 undef, label %for.end, label %if.else + +for.end: ; preds = %lbl1 + br i1 undef, label %lbl3, label %lbl2 + +lbl2: ; preds = %lbl3, %for.end + br 
label %lbl3 + +lbl3: ; preds = %lbl2, %for.end + br i1 undef, label %lbl2, label %cleanup + +cleanup: ; preds = %lbl3 + %cleanup.dest = load i32, i32* undef, align 1 + %switch = icmp ult i32 %cleanup.dest, 1 + br i1 %switch, label %cleanup.cont, label %lbl1 + +cleanup.cont: ; preds = %cleanup + call void @llvm.lifetime.end.p0i8(i64 1, i8* null) + ret void + +if.else: ; preds = %lbl1 + br label %lbl1 +} + +declare void @g() + +; Function Attrs: argmemonly nounwind willreturn +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) diff --git a/test/CodeGen/AArch64/framelayout-sve.mir b/test/CodeGen/AArch64/framelayout-sve.mir new file mode 100644 index 000000000000..9009a6a29bf6 --- /dev/null +++ b/test/CodeGen/AArch64/framelayout-sve.mir @@ -0,0 +1,121 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# +# Test allocation and deallocation of SVE objects on the stack, +# as well as using a combination of scalable and non-scalable +# offsets to access the SVE on the stack. +# +# SVE objects are allocated below the (scalar) callee saves, +# and above spills/locals and the alignment gap, e.g. +# +# +-------------+ +# | stack arg | +# +-------------+ <- SP before call +# | Callee Saves| +# | Frame record| (if available) +# |-------------| <- FP (if available) +# | SVE area | +# +-------------+ +# |/////////////| alignment gap. +# | : | +# | Stack objs | +# | : | +# +-------------+ <- SP after call and frame-setup +# +--- | + + define void @test_allocate_sve() nounwind { entry: unreachable } + define void @test_allocate_sve_gpr_callee_saves() nounwind { entry: unreachable } + define void @test_allocate_sve_gpr_realigned() nounwind { entry: unreachable } + +... +# +----------+ +# | %fixed- | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | stack.0 | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) +# +----------+ +# | %stack.0 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_sve +# CHECK: stackSize: 16 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 + +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + RET_ReallyLR +--- +... +# +----------+ +# | x20, x21 | // callee saves +# +----------+ +# | %fixed- | // scalable objects +# | stack.0 | +# +----------+ +# | %stack.0 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_sve_gpr_callee_saves +# CHECK: stackSize: 32 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup STPXpre killed $x21, killed $x20, $sp, -2 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $x20 = IMPLICIT_DEF +# CHECK-NEXT: $x21 = IMPLICIT_DEF +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 +# CHECK-NEXT: $sp, $x21, $x20 = frame-destroy LDPXpost $sp, 2 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve_gpr_callee_saves +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + $x20 = IMPLICIT_DEF + $x21 = IMPLICIT_DEF + RET_ReallyLR +--- +... 
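The SVE frame-layout tests above and below check that the scalable part of the frame is adjusted with ADDVL while the ordinary part uses ADD/SUB Xri. As an illustration of the idea only (not the in-tree offset representation), a scalable stack offset can be thought of as two components, one fixed and one measured in vector-length units of 16 * vscale bytes:

    #include <cstdint>

    struct ScalableStackOffset {
      int64_t FixedBytes; // ordinary stack bytes, materialized with ADD/SUB Xri
      int64_t NumVLs;     // multiples of the scalable 16-byte granule (ADDVL)
    };

    // Only resolvable to a plain byte offset once vscale is known at run time.
    static int64_t resolveBytes(ScalableStackOffset Off, uint64_t VScale) {
      return Off.FixedBytes + Off.NumVLs * 16 * (int64_t)VScale;
    }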
+# +----------+ +# | lr, fp | // frame record +# +----------+ <- FP +# | %fixed- | // scalable objects +# | stack.0 | +# +----------+ +# |//////////| // alignment gap +# | %stack.0 | // not scalable +# +----------+ <- SP +# CHECK-LABEL: name: test_allocate_sve_gpr_realigned +# CHECK: stackSize: 32 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve_gpr_realigned +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 32 } +body: | + bb.0.entry: + RET_ReallyLR +--- diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir index 513f5b08c6a2..65a7fc7f4aa7 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir @@ -20,12 +20,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_4 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_4 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -51,12 +51,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_2 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) + ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_2 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) + ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]] %0:vgpr(p5) = 
COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 5) @@ -82,12 +82,12 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1 ; GFX6: liveins: $vgpr0 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -208,12 +208,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2047 @@ -243,14 +243,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec - ; GFX9: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2147483647 @@ -283,12 +283,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 2048 @@ -318,14 +318,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2047 @@ -355,14 +355,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: 
[[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -2048 @@ -392,12 +392,12 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4095 @@ -427,14 +427,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 4096 @@ -464,14 +464,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4095 @@ -501,14 +501,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -4096 @@ -538,14 +538,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = 
V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8191 @@ -575,14 +575,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 8192 @@ -612,14 +612,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: 
[[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8191 @@ -649,14 +649,14 @@ body: | ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192 ; GFX9: liveins: $vgpr0 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(s32) = G_CONSTANT i32 -8192 @@ -681,10 +681,10 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_4_constant_0 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_4_constant_0 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] %0:vgpr(p5) = G_CONSTANT i32 0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -707,10 +707,10 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16 - ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16 - ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] %0:sgpr(p5) = G_CONSTANT i32 16 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -733,10 +733,10 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095 - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]] ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]] %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -760,11 +760,11 @@ body: | ; GFX6-LABEL: name: load_private_s32_from_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5) @@ -789,10 +789,10 @@ body: | bb.0: ; GFX6-LABEL: name: load_private_s32_from_fi - ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_fi - ; GFX9: 
[[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5) @@ -820,10 +820,10 @@ body: | ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095 - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 @@ -853,13 +853,13 @@ body: | ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec - ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) + ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5) ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]] %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4096 diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir index 2d8634025c96..ee5ff53ca676 100644 --- 
a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir @@ -21,12 +21,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_4 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -52,12 +52,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_2 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) + ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 2, align 2, addrspace 5) @@ -83,12 +83,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_1 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 1, align 1, addrspace 5) @@ -114,12 +114,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: store_private_v2s16 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(<2 x s16>) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -145,12 +145,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: store_private_p3 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p3) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -176,12 +176,12 @@ body: | ; GFX6: liveins: $vgpr0, $vgpr1 ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) ; GFX9-LABEL: name: store_private_p5 ; GFX9: liveins: $vgpr0, $vgpr1 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) + ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5) %0:vgpr(p5) = COPY $vgpr0 %1:vgpr(p5) = COPY $vgpr1 G_STORE %0, %1 :: (store 4, align 4, addrspace 5) @@ -209,10 +209,10 @@ body: | ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_1_fi_offset_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 
implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_FRAME_INDEX %stack.0 %1:vgpr(s32) = G_CONSTANT i32 4095 %2:vgpr(p5) = G_GEP %0, %1 @@ -239,10 +239,10 @@ body: | ; GFX6-LABEL: name: store_private_s32_to_1_constant_4095 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_1_constant_4095 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4095 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) @@ -268,11 +268,11 @@ body: | ; GFX6-LABEL: name: store_private_s32_to_1_constant_4096 ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) ; GFX9-LABEL: name: store_private_s32_to_1_constant_4096 ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) + ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5) %0:vgpr(p5) = G_CONSTANT i32 4096 %1:vgpr(s32) = G_CONSTANT i32 0 G_STORE %1, %0 :: (store 1, align 1, addrspace 5) diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir index 5ea807166fe5..e4cc48d54dd6 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir @@ -522,3 +522,169 @@ body: | %3:_(<4 x s32>) = G_ANYEXT %2 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 ... 
+ +--- +name: test_and_v8s8 +body: | + bb.0: + + ; CHECK-LABEL: name: test_and_v8s8 + ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[DEF]](<8 x s32>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[DEF1]](<8 x s32>) + ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>) + ; CHECK: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8), [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[TRUNC1]](<8 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s8) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s8) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[ANYEXT3]] + ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s8) + ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s8) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[ANYEXT5]] + ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s8) + ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s8) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[ANYEXT7]] + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s8) + ; CHECK: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s8) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT8]], [[ANYEXT9]] + ; CHECK: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s8) + ; CHECK: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s8) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT10]], [[ANYEXT11]] + ; CHECK: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s8) + ; CHECK: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s8) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT12]], [[ANYEXT13]] + ; CHECK: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s8) + ; CHECK: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s8) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT14]], [[ANYEXT15]] + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND4]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[AND5]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND6]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[AND7]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY 
[[BUILD_VECTOR]](<8 x s32>) + %0:_(<8 x s8>) = G_IMPLICIT_DEF + %1:_(<8 x s8>) = G_IMPLICIT_DEF + %2:_(<8 x s8>) = G_AND %0, %1 + %3:_(<8 x s32>) = G_ANYEXT %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %3 +... + +--- +name: test_and_v16s8 +body: | + bb.0: + + ; CHECK-LABEL: name: test_and_v16s8 + ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[DEF]](<16 x s32>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[DEF1]](<16 x s32>) + ; CHECK: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<16 x s8>) + ; CHECK: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[TRUNC1]](<16 x s8>) + ; CHECK: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV]](<8 x s8>) + ; CHECK: [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8), [[UV16:%[0-9]+]]:_(s8), [[UV17:%[0-9]+]]:_(s8), [[UV18:%[0-9]+]]:_(s8), [[UV19:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV2]](<8 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s8) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s8) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s8) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[ANYEXT3]] + ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s8) + ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s8) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[ANYEXT5]] + ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s8) + ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s8) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[ANYEXT7]] + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s8) + ; CHECK: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s8) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT8]], [[ANYEXT9]] + ; CHECK: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s8) + ; CHECK: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s8) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT10]], [[ANYEXT11]] + ; CHECK: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s8) + ; CHECK: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s8) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT12]], [[ANYEXT13]] + ; CHECK: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s8) + ; CHECK: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s8) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT14]], [[ANYEXT15]] + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND4]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[AND5]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND6]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[AND7]](s32) + ; CHECK: [[UV20:%[0-9]+]]:_(s8), [[UV21:%[0-9]+]]:_(s8), [[UV22:%[0-9]+]]:_(s8), [[UV23:%[0-9]+]]:_(s8), [[UV24:%[0-9]+]]:_(s8), [[UV25:%[0-9]+]]:_(s8), [[UV26:%[0-9]+]]:_(s8), [[UV27:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV1]](<8 x s8>) + ; CHECK: [[UV28:%[0-9]+]]:_(s8), [[UV29:%[0-9]+]]:_(s8), 
[[UV30:%[0-9]+]]:_(s8), [[UV31:%[0-9]+]]:_(s8), [[UV32:%[0-9]+]]:_(s8), [[UV33:%[0-9]+]]:_(s8), [[UV34:%[0-9]+]]:_(s8), [[UV35:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV3]](<8 x s8>) + ; CHECK: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s8) + ; CHECK: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV28]](s8) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[ANYEXT16]], [[ANYEXT17]] + ; CHECK: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s8) + ; CHECK: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV29]](s8) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ANYEXT18]], [[ANYEXT19]] + ; CHECK: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s8) + ; CHECK: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV30]](s8) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[ANYEXT20]], [[ANYEXT21]] + ; CHECK: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s8) + ; CHECK: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV31]](s8) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[ANYEXT22]], [[ANYEXT23]] + ; CHECK: [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[UV24]](s8) + ; CHECK: [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[UV32]](s8) + ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[ANYEXT24]], [[ANYEXT25]] + ; CHECK: [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[UV25]](s8) + ; CHECK: [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[UV33]](s8) + ; CHECK: [[AND13:%[0-9]+]]:_(s32) = G_AND [[ANYEXT26]], [[ANYEXT27]] + ; CHECK: [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[UV26]](s8) + ; CHECK: [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[UV34]](s8) + ; CHECK: [[AND14:%[0-9]+]]:_(s32) = G_AND [[ANYEXT28]], [[ANYEXT29]] + ; CHECK: [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[UV27]](s8) + ; CHECK: [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[UV35]](s8) + ; CHECK: [[AND15:%[0-9]+]]:_(s32) = G_AND [[ANYEXT30]], [[ANYEXT31]] + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[AND8]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND9]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AND10]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND11]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[AND12]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[AND13]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[AND14]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[AND15]](s32) + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32) + ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32) + ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) + ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) + ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) + ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32) + ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) + ; CHECK: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BUILD_VECTOR]](<16 x s32>) + %0:_(<16 x s8>) = G_IMPLICIT_DEF + %1:_(<16 x s8>) = G_IMPLICIT_DEF + %2:_(<16 x s8>) = G_AND %0, %1 + %3:_(<16 x s32>) = G_ANYEXT %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir index 2c3995a49d1c..a7c62f74c216 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir @@ -121,6 +121,36 @@ body: | $vgpr0_vgpr1 = COPY %1 ... +--- +name: test_bitcast_v2s64_to_v8s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_bitcast_v2s64_to_v8s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<8 x s16>) + %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<8 x s16>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 +... + +--- +name: test_bitcast_v8s16_to_v2s64 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_bitcast_v8s16_to_v2s64 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[COPY]](<8 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x s64>) + %0:_(<8 x s16>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s64>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 +... + --- name: test_bitcast_p0_to_p1 body: | @@ -180,3 +210,75 @@ body: | %1:_(p999) = G_BITCAST %0 $vgpr0_vgpr1 = COPY %1 ... + +--- +name: test_bitcast_v4s64_to_v8s32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: test_bitcast_v4s64_to_v8s32 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s32>) = G_BITCAST [[COPY]](<4 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<8 x s32>) + %0:_(<4 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:_(<8 x s32>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 +... + +--- +name: test_bitcast_v8s32_to_v4s64 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: test_bitcast_v8s32_to_v4s64 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s64>) = G_BITCAST [[COPY]](<8 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x s64>) + %0:_(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:_(<4 x s64>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 +... 
+ +--- +name: test_bitcast_v8s64_to_v16s32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + + ; CHECK-LABEL: name: test_bitcast_v8s64_to_v16s32 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<16 x s32>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(<16 x s32>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %1 +... + +--- +name: test_bitcast_v16s32_to_v8s64 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + + ; CHECK-LABEL: name: test_bitcast_v16s32_to_v8s64 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s64>) = G_BITCAST [[COPY]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<8 x s64>) + %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(<8 x s64>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %1 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir index b36737de837d..066932ec4807 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir @@ -33,6 +33,24 @@ body: | %2:_(s64) = G_INSERT %0, %1, 32 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_insert_s64_s32_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s32_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[COPY1]](s32), 16 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s64) = G_INSERT %0, %1, 16 + $vgpr0_vgpr1 = COPY %2 +... + --- name: test_insert_s96_s32_offset0 body: | @@ -305,6 +323,83 @@ body: | %2:_(s128) = G_INSERT %0, %1, 64 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 ... 
+ +--- +name: test_insert_s128_s16_offset0 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: test_insert_s128_s16_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 16 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 16 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: test_insert_s128_s16_offset32 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset32 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 32 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 32 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: test_insert_s128_s16_offset112 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset112 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 112 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 112 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + --- name: test_insert_v2s32_s32_offset0 body: | @@ -1079,3 +1174,76 @@ body: | %2:_(<4 x s16>) = G_INSERT %0, %1, 32 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_insert_s64_s16_offset0 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 0 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 0 + $vgpr0_vgpr1 = COPY %3 +... 
+--- +name: test_insert_s64_s16_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 16 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 16 + $vgpr0_vgpr1 = COPY %3 +... +--- +name: test_insert_s64_s16_offset32 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 32 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 32 + $vgpr0_vgpr1 = COPY %3 +... +--- +name: test_insert_s64_s16_offset48 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset48 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 48 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 48 + $vgpr0_vgpr1 = COPY %3 +... diff --git a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index 25f5d873e9d5..0643505bf999 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable 
store 2 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE 
[[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; 
UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -312,7 +312,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -328,7 +328,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; PACKED: 
BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -354,7 +354,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -370,7 +370,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -396,7 +396,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -412,7 +412,7 @@ define amdgpu_ps void 
@raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -459,7 +459,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -500,7 +500,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY10]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll 
b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 7de9e4554425..5fa523f7b8da 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: 
BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: 
BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -202,7 +202,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -226,7 +226,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -250,7 +250,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: 
BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -294,7 +294,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 3ed040bdfffc..eea1750d63ee 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable store 4 into custom 
TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], 
[[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 
%voffset, i32 %soffset, i32 0) ret void @@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -537,7 +537,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, 
[[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -559,7 +559,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -581,7 +581,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -600,7 +600,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -618,7 +618,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: 
BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -639,7 +639,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -661,7 +661,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -683,7 +683,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -722,7 +722,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: 
[[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -765,7 +765,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir index accfaf08624b..b390e33e4ae6 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir @@ -35,17 +35,16 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<16 x s32>) = COPY [[COPY]](<16 x s32>) ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK: 
[[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY2]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -117,6 +116,102 @@ body: | $vgpr0 = COPY %2 ... +--- +name: extract_vector_elt_v8s64_ss +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + ; CHECK-LABEL: name: extract_vector_elt_v8s64_ss + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; CHECK: [[EVEC:%[0-9]+]]:sgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[COPY1]](s32) + ; CHECK: $sgpr0_sgpr1 = COPY [[EVEC]](s64) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s32) = COPY $sgpr16 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $sgpr0_sgpr1 = COPY %2 +... + +--- +name: extract_vector_elt_v8s64_vs +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v8s64_vs + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY1]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; CHECK: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s32) = COPY $sgpr0 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $vgpr0_vgpr1 = COPY %2 +... 
+ +--- +name: extract_vector_elt_v8s64_sv +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + ; CHECK-LABEL: name: extract_vector_elt_v8s64_sv + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF4]], %bb.0, %20, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; CHECK: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s32) = COPY $vgpr0 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $vgpr0_vgpr1 = COPY %2 +... 
+ --- name: extract_vector_elt_v8s64_vv legalized: true @@ -130,16 +225,27 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(s64) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s64) = G_PHI [[DEF]](s64), %bb.0, %2(s64), %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF4]], %bb.0, %20, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; CHECK: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -147,7 +253,8 @@ body: | ; CHECK: successors: %bb.3(0x80000000) ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: .3: - ; CHECK: $vgpr0_vgpr1 = COPY [[EVEC]](s64) + ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir index a3e7d7423b38..18f97fbeb91b 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir @@ -56,8 +56,7 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: 
[[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 @@ -69,17 +68,35 @@ body: | --- name: insert_vector_elt_v4i32_s_s_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0 ; CHECK-LABEL: name: insert_vector_elt_v4i32_s_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $sgpr4 @@ -91,17 +108,35 @@ body: | --- name: insert_vector_elt_v4i32_s_v_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1 ; CHECK-LABEL: name: insert_vector_elt_v4i32_s_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; 
CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 @@ -113,16 +148,35 @@ body: | --- name: insert_vector_elt_var_v4i32_v_s_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr0 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr4 @@ -134,12 +188,14 @@ body: | --- name: insert_vector_elt_var_v4i32_v_v_s legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr0 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_v_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: 
[[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 @@ -155,16 +211,35 @@ body: | --- name: insert_vector_elt_var_v4i32_v_v_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 @@ -172,3 +247,299 @@ body: | %3:_(<4 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 ... 
+ +--- +name: insert_vector_elt_v8s64_s_s_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_s_s + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[IVEC:%[0-9]+]]:sgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[COPY2]](s32) + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %3 +... + +--- +name: insert_vector_elt_v8s64_v_s_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr16_sgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_s_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr16_sgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[COPY2]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_s_v_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $sgpr16 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_v_s + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $sgpr16 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY2]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $vgpr0_vgpr1 + %2:_(s32) = COPY $sgpr16 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_s_s_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $vgpr0 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $vgpr0 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_s_v_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF5]], %bb.0, %25, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 + ; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $vgpr0_vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: insert_vector_elt_v8s64_v_v_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_v_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr16_vgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY2]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $vgpr16_vgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_v_s_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0_sgpr1, $vgpr16 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0_sgpr1, $vgpr16 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $sgpr0_sgpr1 + %2:_(s32) = COPY $vgpr16 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_v_v_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr16_vgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr18 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF5]], %bb.0, %25, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 + ; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $vgpr16_vgpr17 + %2:_(s32) = COPY $vgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... diff --git a/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir index 323396795dd4..4ac48b1133aa 100644 --- a/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ b/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -393,12 +393,12 @@ name: trivial_clause_load_mubuf4_x2 body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_mubuf4_x2 - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -407,13 +407,13 @@ name: break_clause_simple_load_mubuf_offen_ptr body: | bb.0: ; GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
--- @@ -424,11 +424,11 @@ name: mubuf_load4_overwrite_ptr body: | bb.0: ; GCN-LABEL: name: mubuf_load4_overwrite_ptr - ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec S_ENDPGM 0 @@ -443,11 +443,11 @@ body: | ; GCN-LABEL: name: break_clause_flat_load_mubuf_load ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... # Break a clause from interference between mubuf and flat instructions @@ -462,7 +462,7 @@ name: break_clause_mubuf_load_flat_load body: | bb.0: - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 @@ -504,12 +504,12 @@ name: break_clause_atomic_rtn_into_ptr_mubuf4 body: | bb.0: ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4 - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; XNACK-NEXT: S_NOP 0 ; GCN-NEXT: $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -521,11 +521,11 @@ body: | bb.0: ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4 ; GCN: BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -536,11 +536,11 @@ name: no_break_clause_mubuf_load_novaddr body: | bb.0: ; GCN-LABEL: name: no_break_clause_mubuf_load_novaddr - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir index 16021126d1ea..f631bcd25811 100644 --- a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir +++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir @@ -55,10 +55,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -117,10 +117,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
--- @@ -180,10 +180,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -245,10 +245,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -322,10 +322,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -387,10 +387,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
diff --git a/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir b/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir index 7839d514a144..599cacb82615 100644 --- a/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir +++ b/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir @@ -30,7 +30,7 @@ body: | %14:vgpr_32 = V_AND_B32_e32 1, %13, implicit $exec %15:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %14, implicit $exec %16:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %15, implicit $exec - BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4) + BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4) S_ENDPGM 0 bb.2: @@ -78,7 +78,7 @@ body: | bb.8: successors: %bb.10 - %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) + %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) %34:sreg_64_xexec = V_CMP_NE_U32_e64 0, %31, implicit $exec %35:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %34, implicit $exec %28:vgpr_32 = COPY %35 diff --git a/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir b/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir index d7d8b41f6833..bc549f7bb87b 100644 --- a/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir +++ b/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir @@ -83,7 +83,7 @@ body: | bb.9: successors: %bb.10(0x80000000) - %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) %21:sreg_64 = V_CMP_NE_U32_e64 target-flags(amdgpu-gotprel) 0, killed %19.sub0, implicit $exec %22:sreg_64 = COPY $exec, implicit-def $exec %23:sreg_64 = S_AND_B64 %22, %21, implicit-def dead $scc diff --git a/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir b/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir index 4dffe32f9b18..67399883ae07 100644 --- a/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir +++ b/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir @@ -68,7 +68,7 @@ body: | %23:vreg_128 = COPY killed %17 %24:sreg_64 = COPY killed %16 %25:vgpr_32 = V_OR_B32_e32 %22, %11, implicit $exec - %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) %28:vgpr_32 = V_LSHRREV_B32_e32 30, killed %26.sub0, implicit $exec %29:vreg_128 = COPY killed %21 %29.sub0:vreg_128 = COPY %1 diff --git a/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir b/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir index eb3d6169e976..773466af7adb 100644 --- 
a/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir +++ b/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir @@ -11,7 +11,7 @@ # # GCN-LABEL: bb.6: # GCN: successors: %bb.7(0x{{[0-9]+}}), %bb.18(0x{{[0-9]+}}) -# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, 0, implicit $exec # --- | @@ -69,7 +69,7 @@ body: | %10:sreg_64 = COPY killed %5 undef %11.sub2:sreg_128 = COPY %4 %11.sub3:sreg_128 = COPY %3 - %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, implicit $exec + %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, 0, implicit $exec undef %13.sub1:vreg_128 = COPY %9.sub1 %13.sub2:vreg_128 = COPY %9.sub2 %14:sreg_64 = V_CMP_GT_F32_e64 0, target-flags(amdgpu-rel32-lo) 0, 0, killed %12.sub3, 0, implicit $exec @@ -161,7 +161,7 @@ body: | bb.18: successors: %bb.7(0x80000000) dead %59:vgpr_32 = V_FMA_F32 0, killed %9.sub2, 0, undef %60:vgpr_32, 0, undef %61:vgpr_32, 0, 0, implicit $exec - dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sreg_128, undef %65:sreg_32, 0, 0, 0, 0, 0, implicit $exec + dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sreg_128, undef %65:sreg_32, 0, 0, 0, 0, 0, 0, implicit $exec undef %66.sub1:vreg_128 = COPY %13.sub1 %66.sub2:vreg_128 = COPY %13.sub2 %67:sreg_64 = V_CMP_NGT_F32_e64 0, 0, 0, undef %68:vgpr_32, 0, implicit $exec diff --git a/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir b/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir index a01f1e71dace..4c532e89398e 100644 --- a/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir +++ b/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir @@ -148,7 +148,7 @@ body: | %43:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %44:sreg_128, 12, 0, 0 :: (dereferenceable invariant load 4) %45:vgpr_32 = V_MUL_LO_I32 killed %42, killed %43, implicit $exec %46:vgpr_32 = V_LSHLREV_B32_e32 2, killed %45, implicit $exec - %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) + %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4) %49:sreg_64 = V_CMP_NE_U32_e64 0, killed %47, implicit $exec %50:sreg_64 = COPY $exec, implicit-def $exec %51:sreg_64 = S_AND_B64 %50, %49, implicit-def dead $scc diff --git a/test/CodeGen/AMDGPU/collapse-endcf.mir b/test/CodeGen/AMDGPU/collapse-endcf.mir index 708814e3df45..1a26a507cd9b 100644 --- a/test/CodeGen/AMDGPU/collapse-endcf.mir +++ b/test/CodeGen/AMDGPU/collapse-endcf.mir @@ -33,7 +33,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] @@ -44,7 +44,7 @@ body: | ; GCN: 
%5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: DBG_VALUE @@ -80,7 +80,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -92,7 +92,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -141,7 +141,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] @@ -152,7 +152,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: bb.4: @@ -188,7 +188,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -200,7 +200,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -249,7 +249,7 @@ body: | ; GCN: 
%8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] @@ -260,7 +260,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: bb.4: @@ -297,7 +297,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -309,7 +309,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -358,7 +358,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -370,7 +370,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF @@ -408,7 +408,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit 
$exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -420,7 +420,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: %15:sgpr_32 = IMPLICIT_DEF @@ -471,7 +471,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]] @@ -482,7 +482,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF @@ -520,7 +520,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -532,7 +532,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -583,7 +583,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -595,7 +595,7 @@ body: | ; GCN: 
%5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -631,7 +631,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -643,7 +643,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -691,7 +691,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -703,7 +703,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -739,7 +739,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -751,7 +751,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 
4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc @@ -799,7 +799,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -811,7 +811,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.5(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -850,7 +850,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -862,7 +862,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc diff --git a/test/CodeGen/AMDGPU/collapse-endcf2.mir b/test/CodeGen/AMDGPU/collapse-endcf2.mir index 44a8e38a5655..9219083bb64c 100644 --- a/test/CodeGen/AMDGPU/collapse-endcf2.mir +++ b/test/CodeGen/AMDGPU/collapse-endcf2.mir @@ -42,7 +42,7 @@ body: | ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440 ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0 - ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc @@ -54,7 +54,7 @@ body: | ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2 ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 
0, 0, implicit $exec :: (store 4, addrspace 1) ; GCN: bb.3: ; GCN: successors: %bb.4(0x80000000) ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc @@ -91,7 +91,7 @@ body: | %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec %5.sub3:sgpr_128 = S_MOV_B32 61440 %5.sub2:sgpr_128 = S_MOV_B32 0 - BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec %12:sreg_64 = COPY $exec, implicit-def $exec %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc @@ -103,7 +103,7 @@ body: | %5.sub0:sgpr_128 = COPY %5.sub2 %5.sub1:sgpr_128 = COPY %5.sub2 %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) + BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1) bb.3: $exec = S_OR_B64 $exec, %12, implicit-def $scc diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index e5ff97a7be3d..92e29f3a5290 100644 --- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -54,7 +54,7 @@ body: | %8 = S_MOV_B32 9999 %9 = S_AND_B32 killed %7, killed %8, implicit-def dead $scc %10 = COPY %9 - BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -219,7 +219,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_LSHL_B32 killed %5, 12, implicit-def dead $scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -419,7 +419,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_ASHR_I32 killed %5, 12, implicit-def dead $scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -627,7 +627,7 @@ body: | %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4 %12 = S_LSHR_B32 killed %5, 12, implicit-def dead $scc %13 = COPY %12 - BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
diff --git a/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir b/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir index 4679831c786d..bba41584bc97 100644 --- a/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir +++ b/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir @@ -291,7 +291,7 @@ body: | bb.3..lr.ph3410.preheader: successors: %bb.4(0x80000000) - dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) + dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4) dead %60:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec %36:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc dead %67:vgpr_32 = V_MOV_B32_e32 0, implicit $exec diff --git a/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index a39833455a15..70e5df5788ae 100644 --- a/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sreg_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF2:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sreg_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) diff --git a/test/CodeGen/AMDGPU/fold-fi-mubuf.mir b/test/CodeGen/AMDGPU/fold-fi-mubuf.mir index a015a1ef4d11..f80176508bef 100644 --- a/test/CodeGen/AMDGPU/fold-fi-mubuf.mir +++ b/test/CodeGen/AMDGPU/fold-fi-mubuf.mir @@ -23,13 +23,13 @@ body: | ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 
0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 %1:sreg_32_xm0 = S_MOV_B32 0 %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 SI_RETURN_TO_EPILOG $vgpr0 @@ -57,12 +57,12 @@ body: | ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; GCN: SI_RETURN_TO_EPILOG $vgpr0 %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15 %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec - %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %3 SI_RETURN_TO_EPILOG $vgpr0 @@ -87,15 +87,15 @@ body: | ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 @@ -119,15 +119,15 @@ body: | ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN: $vgpr0 = 
COPY [[BUFFER_LOAD_DWORD_OFFEN]] ; GCN: S_ENDPGM 0, implicit $vgpr0 %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec - BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec + %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr0 = COPY %2 S_ENDPGM 0, implicit $vgpr0 diff --git a/test/CodeGen/AMDGPU/fold-imm-copy.mir b/test/CodeGen/AMDGPU/fold-imm-copy.mir index 7fe6ce845ab9..f2d423a70785 100644 --- a/test/CodeGen/AMDGPU/fold-imm-copy.mir +++ b/test/CodeGen/AMDGPU/fold-imm-copy.mir @@ -17,7 +17,7 @@ body: | %4:vgpr_32 = V_LSHLREV_B32_e64 killed %3, %0, implicit $exec %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec %6:vreg_64 = REG_SEQUENCE killed %4, %subreg.sub0, killed %5, %subreg.sub1 - %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, 0, implicit $exec %8:sreg_32_xm0 = S_MOV_B32 65535 %9:vgpr_32 = COPY %8 %10:vgpr_32 = V_AND_B32_e32 %7, %9, implicit $exec diff --git a/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir index 3ab99551012f..1e596b79016a 100644 --- a/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir +++ b/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir @@ -158,10 +158,10 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %12 = V_MOV_B32_e32 1065353216, implicit $exec %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -222,13 +222,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 1065353216, implicit $exec %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... @@ -289,14 +289,14 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 1065353216, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -360,16 +360,16 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 1065353216, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $exec %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... @@ -427,13 +427,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 1, implicit $exec %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -494,16 +494,16 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %14 = V_MOV_B32_e32 -2, implicit $exec %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $exec %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... @@ -564,13 +564,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) %13 = V_MOV_B32_e32 15360, implicit $exec %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) - BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) S_ENDPGM 0 ... 
@@ -631,13 +631,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %13 = V_MOV_B32_e32 80886784, implicit $exec %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $exec - BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... @@ -697,13 +697,13 @@ body: | %8 = S_MOV_B32 61440 %9 = S_MOV_B32 -1 %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 - %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) + %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`) %13 = V_MOV_B32_e32 305413120, implicit $exec %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $exec %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) - BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`) S_ENDPGM 0 ... 
diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir index 65b254e7616a..e26f0c934fce 100644 --- a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir +++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir @@ -60,13 +60,13 @@ body: | %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -131,13 +131,13 @@ body: | %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -202,13 +202,13 @@ body: | %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -273,13 +273,13 @@ body: | %17 = REG_SEQUENCE killed %6, 17, %13, 18 %18 = REG_SEQUENCE killed %4, 17, %13, 18 %20 = COPY %29 - %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec %22 = COPY %29 - %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec + %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec %23 = V_MOV_B32_e32 1090519040, implicit $exec %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $exec %26 = COPY %29 - BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
diff --git a/test/CodeGen/AMDGPU/fold-multiple.mir b/test/CodeGen/AMDGPU/fold-multiple.mir index ef35b2634579..d8c396c9d4a4 100644 --- a/test/CodeGen/AMDGPU/fold-multiple.mir +++ b/test/CodeGen/AMDGPU/fold-multiple.mir @@ -34,7 +34,7 @@ body: | %3 = S_LSHL_B32 %1, killed %1, implicit-def dead $scc %4 = V_AND_B32_e64 killed %2, killed %3, implicit $exec %5 = IMPLICIT_DEF - BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir b/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir new file mode 100644 index 000000000000..11af6e19ecb2 --- /dev/null +++ b/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir @@ -0,0 +1,15 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -o - -run-pass si-fix-sgpr-copies -verify-machineinstrs %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: fold_acc_copy_into_valu +# GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY %0.sub0 +# GCN: %2:vgpr_32 = V_AND_B32_e32 [[COPY]], undef %3:vgpr_32, implicit $exec +--- +name: fold_acc_copy_into_valu +body: | + bb.0.entry: + + %0:areg_1024 = IMPLICIT_DEF + %1:sreg_32_xm0 = COPY %0.sub0 + %3:vgpr_32 = V_AND_B32_e32 %1, undef %2:vgpr_32, implicit $exec + +... diff --git a/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir b/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir index 8cc294f57b26..bd6244127e6f 100644 --- a/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir +++ b/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir @@ -12,7 +12,7 @@ body: | bb.0.entry: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr7, $vgpr8, $vgpr9, $vgpr10 - BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, 0, implicit $exec $vgpr7 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $m0, implicit $exec S_ENDPGM 0 diff --git a/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir index 9ef2431df6ee..d0f32f287473 100644 --- a/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir +++ b/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir @@ -57,7 +57,7 @@ body: | BUNDLE implicit-def $sgpr0_sgpr1, implicit $sgpr10_sgpr11 { $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0 } - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -91,5 +91,5 @@ body: | } bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... 
diff --git a/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll index abcf3342fcf4..e7ad0bd0122e 100644 --- a/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll +++ b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll @@ -1,5 +1,5 @@ ; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERR %s -; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s +; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN %s ; ERR: error: :0:0: in function illegal_vgpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy ; GCN: ; illegal copy v1 to s9 @@ -43,7 +43,8 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { } ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy -; GCN: ; illegal copy a1 to s9 +; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1 +; GCN: ; illegal copy [[COPY1]] to s9 define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { %agpr = call i32 asm sideeffect "; def $0", "=${a1}"() call void asm sideeffect "; use $0", "${s9}"(i32 %agpr) @@ -51,7 +52,9 @@ define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 { } ; ERR: error: :0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy -; GCN: ; illegal copy a[0:1] to s[10:11] +; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0 +; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1 +; GCN: ; illegal copy v{{\[}}[[COPY1L]]:[[COPY1H]]] to s[10:11] define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 { %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"() call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr) diff --git a/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/test/CodeGen/AMDGPU/indirect-addressing-term.ll index 053a68440764..40722ce43741 100644 --- a/test/CodeGen/AMDGPU/indirect-addressing-term.ll +++ b/test/CodeGen/AMDGPU/indirect-addressing-term.ll @@ -104,7 +104,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) { ; GCN: $exec = S_MOV_B64 renamable $sgpr0_sgpr1 ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5) ; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 16 from %stack.1, align 4, addrspace 5) - ; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) + ; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1) ; GCN: S_ENDPGM 0 entry: %id = call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir b/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir index 09f1ba901060..b305cfddb5a5 100644 --- a/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir +++ b/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir @@ -41,7 +41,7 @@ body: | ; CHECK: bb.1: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; CHECK: bb.2: ; CHECK: S_ENDPGM 0 bb.0: @@ -51,7 +51,7 @@ body: | 
bb.1: successors: %bb.2 $vgpr0 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec bb.2: S_ENDPGM 0 diff --git a/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir b/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir index 9f39dc341509..5797bb5cfa29 100644 --- a/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir +++ b/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir @@ -49,10 +49,10 @@ body: | bb.0 (%ir-block.2): $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) - $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) + $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`) EXP_DONE 0, killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, -1, -1, 15, implicit $exec $vgpr0 = V_MOV_B32_e32 1056964608, implicit $exec $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir index 6e67f7df30a7..1ab10fa92f7b 100644 --- a/test/CodeGen/AMDGPU/inserted-wait-states.mir +++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir @@ -230,17 +230,17 @@ name: vmem_gt_8dw_store body: | bb.0: - BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + 
BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec - BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec BUFFER_ATOMIC_CMPSWAP_X2_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec $vgpr3 = V_MOV_B32_e32 0, implicit $exec @@ -553,10 +553,10 @@ body: | dead $sgpr6_sgpr7 = KILL $sgpr4_sgpr5 $sgpr8 = S_MOV_B32 $sgpr5 $vgpr0 = V_MOV_B32_e32 killed $sgpr8, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr + 4) + BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr + 4) $sgpr8 = S_MOV_B32 $sgpr4, implicit killed $sgpr4_sgpr5 $vgpr0 = V_MOV_B32_e32 killed $sgpr8, implicit $exec - BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr) + BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr) S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir index 48f0be4ff8fd..0a60eaf7c03f 100644 --- a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir +++ b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir @@ -64,7 +64,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec S_BRANCH %bb.3 @@ -72,7 +72,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec bb.3.done: @@ -80,7 +80,7 @@ body: | $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... 
diff --git a/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir b/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir index 69c038b976b8..566b1c06fb12 100644 --- a/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir +++ b/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir @@ -12,7 +12,7 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -24,7 +24,7 @@ name: hazard_buf_branch_lds body: | bb.0: successors: %bb.1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: @@ -56,11 +56,11 @@ name: no_hazard_buf_branch_buf body: | bb.0: successors: %bb.1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -75,7 +75,7 @@ body: | $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -87,7 +87,7 @@ name: no_hazard_lds_branch_buf_samebb body: | bb.0: $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -101,7 +101,7 @@ body: | bb.0: successors: %bb.0 $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.0 ... @@ -118,8 +118,8 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -137,7 +137,7 @@ body: | bb.1: $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -150,11 +150,11 @@ body: | bb.0: successors: %bb.1 $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -171,7 +171,7 @@ body: | bb.1: S_WAITCNT_VSCNT undef $sgpr_null, 1 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -189,7 +189,7 @@ body: | bb.1: S_WAITCNT_VSCNT undef $sgpr_null, 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -206,7 +206,7 @@ body: | bb.1: S_WAITCNT_VSCNT undef $sgpr0, 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -223,7 +223,7 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll index 958a72566b5f..9f18f4df40bf 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll @@ -400,6 +400,46 @@ main_body: ret void } +;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_merged: +;CHECK-NEXT: %bb. 
+;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +;CHECK: s_waitcnt +define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) { +main_body: + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + +;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_swizzled_not_merged: +;CHECK-NEXT: %bb. +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 +;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 +;CHECK: s_waitcnt +define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) { +main_body: + %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8) + %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 8) + %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 8) + %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 8) + %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 8) + %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 8) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true) + call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true) + ret void +} + declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll index 7d1a5a3b99a0..1bfe0aa4086e 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll @@ -276,6 +276,37 @@ main_body: ret void } +;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_merged: +;CHECK-NOT: s_waitcnt +;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 +;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 +define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0) + call void 
@llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0) + ret void +} + +;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_swizzled_not_merged: +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 +;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 +define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { + call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 8) + call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 8) + ret void +} + declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0 diff --git a/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir index f33c2115dcb2..673fff50b39c 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir +++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir @@ -86,7 +86,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 0 S_WAITCNT 127 - $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep) + $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep) $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec diff --git a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir index 1046b9729df4..99348a57b9f6 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir +++ b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir @@ -23,13 +23,13 @@ body: | $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 
$sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`) S_CBRANCH_SCC0 %bb.1, implicit killed $scc bb.2: @@ -55,7 +55,7 @@ body: | S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 diff --git a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir index bf24ce15acb6..f52275af48c9 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir +++ b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir @@ -117,13 +117,13 @@ body: | $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc bb.2.else: @@ -149,7 +149,7 @@ body: | S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), 
(non-temporal load 4 from %ir.if_ptr) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 diff --git a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir index a6088b0677a0..c543b80454b6 100644 --- a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir +++ b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir @@ -97,13 +97,13 @@ body: | $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11 $vgpr0 = V_MOV_B32_e32 1, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01) S_WAITCNT 127 S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc S_WAITCNT 3855 $vgpr0 = V_MOV_B32_e32 2, implicit $exec $vgpr1 = V_MOV_B32_e32 32772, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12) S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc bb.2.else: @@ -129,7 +129,7 @@ body: | S_WAITCNT 127 $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr) $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5 $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec S_WAITCNT 3952 diff --git a/test/CodeGen/AMDGPU/memory_clause.mir b/test/CodeGen/AMDGPU/memory_clause.mir index ac412a8fc29b..b46cfb16b7ba 100644 --- a/test/CodeGen/AMDGPU/memory_clause.mir +++ b/test/CodeGen/AMDGPU/memory_clause.mir @@ -337,7 +337,7 @@ body: | # GCN: dead early-clobber %4:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vgpr_32 = BUNDLE %0, %2, %1, implicit $exec { # GCN-NEXT: dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec # GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec # GCN-NEXT: } --- @@ -357,7 +357,7 @@ body: | %2 = IMPLICIT_DEF %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 
0, 0, 0, implicit $exec %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec - %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, implicit $exec + %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: {{^}}name: atomic{{$}} diff --git a/test/CodeGen/AMDGPU/merge-load-store.mir b/test/CodeGen/AMDGPU/merge-load-store.mir index becd2e1b9c1e..6bff48467b59 100644 --- a/test/CodeGen/AMDGPU/merge-load-store.mir +++ b/test/CodeGen/AMDGPU/merge-load-store.mir @@ -169,10 +169,10 @@ body: | --- # CHECK-LABEL: merge_mmos # CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 8, align 4) -# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4) -# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4) -# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4 -# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4 +# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4) +# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4 +# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4 name: merge_mmos tracksRegLiveness: true body: | @@ -182,14 +182,14 @@ body: | %0:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 4) %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 1, 0, 0 :: (dereferenceable invariant load 4) - %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) - %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) - BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) - BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) - %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64) - %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68) - BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64) - BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68) + %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) + %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4) + BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) + BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4) + %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 4 from %ir.ptr_addr1 + 64) + %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68) + BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64) + BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68) S_ENDPGM 0 diff --git a/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir index 9cfd92f86c4f..ccff6bb51275 100644 --- a/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir +++ b/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir @@ -25,7 +25,7 @@ # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL bb.2: @@ -47,7 +47,7 @@ # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -72,7 +72,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -94,7 +94,7 @@ body: | # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL bb.2: @@ -116,7 +116,7 @@ body: | # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], 
implicit-def $scc # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -141,7 +141,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -163,7 +163,7 @@ body: | # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-LABEL bb.2: @@ -185,7 +185,7 @@ body: | # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -210,7 +210,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -226,7 +226,7 @@ body: | # ADDR64: %9:vgpr_32, %12:sreg_64_xexec = V_ADD_I32_e64 %14.sub0, %4.sub0, 0, implicit $exec # ADDR64: %10:vgpr_32, dead %13:sreg_64_xexec = V_ADDC_U32_e64 %14.sub1, %4.sub1, killed %12, 0, implicit $exec # ADDR64: %11:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1 -# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec +# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec --- name: addr64 liveins: @@ -246,7 +246,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 
%6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 @@ -269,7 +269,7 @@ body: | # W64-NO-ADDR64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W64-NO-ADDR64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc # W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc # W64-NO-ADDR64: S_CBRANCH_EXECNZ %bb.1, implicit $exec # W64-NO-ADDR64-LABEL bb.2: @@ -289,7 +289,7 @@ body: | # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec -# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec # TODO: S_XOR_B32_term should be `implicit-def $scc` # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]] # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -303,7 +303,7 @@ body: | # ADDR64: [[RSRCFMTHI:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440 # ADDR64: [[ZERORSRC:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[ZERO64]], %subreg.sub0_sub1, [[RSRCFMTLO]], %subreg.sub2, [[RSRCFMTHI]], %subreg.sub3 # ADDR64: [[VADDR64:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[RSRCPTR]].sub0, %subreg.sub0, [[RSRCPTR]].sub1, %subreg.sub1 -# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, implicit $exec +# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec --- name: offset @@ -324,7 +324,7 @@ body: | %1:vgpr_32 = COPY $vgpr1 %0:vgpr_32 = COPY $vgpr0 %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3 - %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, implicit $exec + %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 $vgpr0 = COPY %7 S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0 diff --git a/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir index 0dd135723d8c..39d3efe2a1de 100644 --- a/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir +++ b/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir @@ -9,7 +9,7 @@ name: hazard_image_sample_d_buf_off6 body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 
0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: no_hazard_image_sample_d_buf_off1 @@ -20,7 +20,7 @@ name: no_hazard_image_sample_d_buf_off1 body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: no_hazard_image_sample_d_buf_far @@ -33,7 +33,7 @@ body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec V_NOP_e32 implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec ... # Non-NSA @@ -45,7 +45,7 @@ name: no_hazard_image_sample_v4_v2_buf_off6 body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 undef $vgpr1_vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec ... # Less than 4 dwords @@ -57,5 +57,5 @@ name: no_hazard_image_sample_v4_v3_buf_off6 body: | bb.0: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V3_nsa_gfx10 undef $vgpr1, undef $vgpr2, undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec - $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec ... 
diff --git a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir index 3af2f0457fbb..0e7708210a90 100644 --- a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir +++ b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir @@ -137,7 +137,7 @@ body: | %28 = REG_SEQUENCE %6, 17, killed %27, 18 %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 - BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -243,7 +243,7 @@ body: | %37 = REG_SEQUENCE %6, 17, killed %36, 18 %38 = V_MOV_B32_e32 0, implicit $exec %39 = COPY %33 - BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec @@ -332,7 +332,7 @@ body: | %28 = REG_SEQUENCE %6, 17, killed %27, 18 %29 = V_MOV_B32_e32 0, implicit $exec %30 = COPY %24 - BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.bb2: SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec diff --git a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir index 7da7dc8d3199..53ca546969b9 100644 --- a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir +++ b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir @@ -151,7 +151,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -159,7 +159,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -188,7 +188,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -196,7 +196,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -225,7 +225,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -233,14 +233,14 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- # CHECK-LABEL: name: optimize_if_and_saveexec_xor_valu_middle # CHECK: $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc -# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec +# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec # CHECK-NEXT: $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3 # CHECK-NEXT: SI_MASK_BRANCH @@ -255,7 +255,7 @@ body: | $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec $vgpr0 = V_MOV_B32_e32 4, implicit $exec $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc - BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc $exec = S_MOV_B64_term killed $sgpr2_sgpr3 SI_MASK_BRANCH %bb.2, implicit $exec @@ -266,7 +266,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -274,7 +274,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -304,7 +304,7 @@ body: | bb.1.if: liveins: $sgpr0_sgpr1 , $sgpr4_sgpr5_sgpr6_sgpr7 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1, $sgpr4_sgpr5_sgpr6_sgpr7 @@ -312,7 +312,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -346,7 +346,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -356,7 +356,7 @@ body: | $sgpr1 = S_MOV_B32 1 $sgpr2 = S_MOV_B32 -1 $sgpr3 = S_MOV_B32 61440 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -387,7 +387,7 @@ body: | S_SLEEP 0, implicit $sgpr2_sgpr3 $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -395,7 +395,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -426,7 +426,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -434,7 +434,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -463,7 +463,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -471,7 +471,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -500,7 +500,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -508,7 +508,7 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -539,7 +539,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 - $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2.end: liveins: $vgpr0, $sgpr0_sgpr1 @@ -547,6 +547,6 @@ body: | $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir index 7a1cfa32a60c..39915f2755ce 100644 --- a/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir +++ b/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir @@ -27,12 +27,12 @@ body: | ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc - ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) + ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_BRANCH %bb.1 ; CHECK: bb.1: ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc - ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) + ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5) ; CHECK: S_ENDPGM 0, implicit $vgpr0 bb.0: $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5) diff --git a/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir index 3c99cc7c19da..807029a92f34 100644 --- a/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir +++ b/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir @@ -46,7 +46,7 @@ body: | %15:sreg_32_xm0 = S_MOV_B32 61440 %16:sreg_32_xm0 = S_MOV_B32 -1 %17:sreg_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3 - BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1) %19:vgpr_32 = COPY %4 %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec S_BRANCH %bb.3 diff --git a/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir b/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir index ed648ece0c71..66bd4c163c66 100644 --- a/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir +++ b/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir @@ -17,6 +17,6 @@ body: | S_BARRIER $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32 
undef $vgpr0, undef $vgpr0, 0, 0, 0, 2, implicit $exec $vgpr0 = V_ACCVGPR_READ_B32 $agpr31, implicit $exec - BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, 0, implicit $exec ... diff --git a/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir b/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir index 196301f4cb07..1d9ab685c532 100644 --- a/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir +++ b/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir @@ -185,7 +185,7 @@ body: | bb.28: %9 = S_FF1_I32_B32 undef %10 %13 = V_MAD_U32_U24 killed %9, 48, 32, 0, implicit $exec - %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) + %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4) %46 = V_AND_B32_e32 1, killed %45, implicit $exec %21 = S_BUFFER_LOAD_DWORD_SGPR undef %22, undef %23, 0, 0 :: (dereferenceable invariant load 4) %25 = V_CMP_GE_F32_e64 0, 0, 0, killed %21, 0, implicit $exec diff --git a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir index 016c5ad32023..a92fe49e1b73 100644 --- a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir +++ b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir @@ -70,7 +70,7 @@ body: | %13.sub2_sub3 = COPY killed %12 %20 = V_LSHL_B64 killed %19, 2, implicit $exec %16 = COPY killed %5 - BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... 
diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir index 0413075dd86c..331cccd853c2 100644 --- a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir +++ b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir @@ -134,10 +134,10 @@ body: | %6.sub2 = COPY %6.sub0 bb.2: - BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr30_sgpr31 = COPY %5 S_SETPC_B64_return $sgpr30_sgpr31 diff --git a/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir index 915f63dbdd47..0d2f90793fc3 100644 --- a/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir +++ b/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir @@ -28,7 +28,7 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:vreg_512 = COPY %0 ; CHECK: bb.1: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) + ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) ; CHECK: dead %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec ; CHECK: dead %8:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec ; CHECK: dead %9:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec @@ -52,7 +52,7 @@ body: | %4:vreg_512 = COPY %0 bb.1: - BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) + BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5) %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec %8:vreg_64 = DS_READ_B64_gfx9 %1, 0, 0, implicit $exec %9:vreg_128 = DS_READ_B128_gfx9 %2, 0, 0, implicit $exec diff --git a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir index eee471cb073b..b954b778dc65 100644 --- a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir +++ b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir @@ -279,10 +279,10 @@ body: | %80:vgpr_32 = IMPLICIT_DEF %81:vgpr_32 = IMPLICIT_DEF %84:vgpr_32 = IMPLICIT_DEF - BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, 
$sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, implicit $exec - BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, 0, implicit $exec %85:vgpr_32 = IMPLICIT_DEF %86:vgpr_32 = IMPLICIT_DEF %87:vgpr_32 = IMPLICIT_DEF diff --git a/test/CodeGen/AMDGPU/schedule-barrier.mir b/test/CodeGen/AMDGPU/schedule-barrier.mir index a72a406ff094..e52b955ffaf7 100644 --- a/test/CodeGen/AMDGPU/schedule-barrier.mir +++ b/test/CodeGen/AMDGPU/schedule-barrier.mir @@ -30,14 +30,14 @@ body: | %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec - %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, implicit $exec + %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, 0, implicit $exec %39:vgpr_32 = V_MUL_LO_U32 %15, %15, implicit $exec undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec - %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, implicit $exec + %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, 0, implicit $exec %40:vgpr_32 = V_MUL_LO_U32 %19, %19, implicit $exec %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir index 2aaf7f10b69a..42c42a48a616 100644 --- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir +++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir @@ -81,11 +81,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_ADD_I32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -165,11 +165,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_SUB_I32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -249,11 +249,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, %9 = V_SUBREV_I32_e64 %19, %17, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -332,12 +332,12 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %9 = S_MOV_B64 0 %29, $vcc = V_ADDC_U32_e64 %19, %17, %9, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -417,12 +417,12 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec $vcc = S_MOV_B64 0 %29, $vcc = V_ADDC_U32_e64 %19, %17, $vcc, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -502,11 +502,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, $vcc = V_ADDC_U32_e64 %19, %17, undef $vcc, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 41444b0ef0cd..330426fb0ad1 100644 --- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -88,7 +88,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.3 @@ -96,7 +96,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec bb.3.done: @@ -104,7 +104,7 @@ body: | $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... 
@@ -149,7 +149,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.3 @@ -157,7 +157,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec bb.3.done: @@ -165,7 +165,7 @@ body: | $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir index 9d45c5b19e65..10ed241acb58 100644 --- a/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir +++ b/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir @@ -11,7 +11,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_smem_write_sgpr @@ -25,7 +25,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 ... # GCN-LABEL: name: vmem_snop_write_sgpr @@ -40,7 +40,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_NOP 0 $sgpr0 = S_MOV_B32 0 ... @@ -55,7 +55,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $exec $sgpr0 = S_MOV_B32 0 ... 
@@ -70,7 +70,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_WAITCNT 0 $sgpr0 = S_MOV_B32 0 ... @@ -86,7 +86,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_WAITCNT 1 $sgpr0 = S_MOV_B32 0 ... @@ -101,7 +101,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $exec = S_MOV_B64 7 ... # GCN-LABEL: name: vmem_write_exec_expread @@ -114,7 +114,7 @@ body: | bb.0: $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, 0, implicit $exec $exec = S_MOV_B64 7 ... # GCN-LABEL: name: ds_write_m0 @@ -143,7 +143,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec bb.1: $sgpr0 = S_MOV_B32 0 @@ -161,7 +161,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: @@ -181,7 +181,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.2 bb.1: @@ -206,7 +206,7 @@ body: | $sgpr0 = S_MOV_B32 0 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.0 ... # GCN-LABEL: name: ds_write_exec diff --git a/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir b/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir index 644651ded335..5dbe5d58d9bc 100644 --- a/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir +++ b/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir @@ -19,7 +19,7 @@ body: | $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: name: vmem_vcc_branch_to_next # GCN: bb.1: @@ -40,7 +40,7 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_too_far # GCN: bb.1: @@ -61,7 +61,7 @@ body: | $sgpr0 = S_MOV_B32 0 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_nops # GCN: bb.1: @@ -78,7 +78,7 @@ body: | S_NOP 4 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_branch_around # GCN: bb.2: @@ -107,7 +107,7 @@ body: | S_NOP 0 bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_branch_backedge # GCN: S_NOP @@ -123,7 +123,7 @@ body: | $vgpr0 = IMPLICIT_DEF $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec bb.1: $vgpr0 = IMPLICIT_DEF @@ -156,7 +156,7 @@ body: | $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_self_loop # GCN: S_NOP @@ -172,7 +172,7 @@ body: | $vgpr0 = IMPLICIT_DEF $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.0 ... @@ -198,7 +198,7 @@ body: | successors: %bb.1 $sgpr0 = S_MOV_B32 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.1 ... @@ -224,7 +224,7 @@ body: | successors: %bb.1 $sgpr0 = S_MOV_B32 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.1 ... 
diff --git a/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir b/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir index d6e983ae5904..bfd92347d92b 100644 --- a/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir +++ b/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir @@ -78,7 +78,7 @@ body: | bb.1: successors: %bb.2 - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2: successors: %bb.3, %bb.6 @@ -86,7 +86,7 @@ body: | bb.3: successors: %bb.4, %bb.5 - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_VCCNZ %bb.5, implicit $vcc bb.4: diff --git a/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll b/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll new file mode 100644 index 000000000000..0f75cd812421 --- /dev/null +++ b/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll @@ -0,0 +1,127 @@ +; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -march=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; Source code: +; struct t1 { +; int c; +; }; +; struct s1 { +; struct t1 b; +; }; +; struct r1 { +; struct s1 a; +; }; +; #define _(x) __builtin_preserve_access_index(x) +; void test1(void *p1, void *p2, void *p3); +; void test(struct r1 *arg) { +; struct s1 *ps = _(&arg->a); +; struct t1 *pt = _(&arg->a.b); +; int *pi = _(&arg->a.b.c); +; test1(ps, pt, pi); +; } +; Compilation flag: +; clang -target bpf -O2 -g -S -emit-llvm test.c + +%struct.r1 = type { %struct.s1 } +%struct.s1 = type { %struct.t1 } +%struct.t1 = type { i32 } + +; Function Attrs: nounwind +define dso_local void @test(%struct.r1* %arg) local_unnamed_addr #0 !dbg !7 { +entry: + call void @llvm.dbg.value(metadata %struct.r1* %arg, metadata !22, metadata !DIExpression()), !dbg !29 + %0 = tail call %struct.s1* @llvm.preserve.struct.access.index.p0s_struct.s1s.p0s_struct.r1s(%struct.r1* %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !11 + call void @llvm.dbg.value(metadata %struct.s1* %0, metadata !23, metadata !DIExpression()), !dbg !29 + %1 = tail call %struct.t1* @llvm.preserve.struct.access.index.p0s_struct.t1s.p0s_struct.s1s(%struct.s1* %0, i32 0, i32 0), !dbg !31, !llvm.preserve.access.index !14 + call void @llvm.dbg.value(metadata %struct.t1* %1, metadata !25, metadata !DIExpression()), !dbg !29 + %2 = tail call i32* @llvm.preserve.struct.access.index.p0i32.p0s_struct.t1s(%struct.t1* %1, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !17 + call void @llvm.dbg.value(metadata i32* %2, metadata !27, metadata !DIExpression()), !dbg !29 + %3 = bitcast %struct.s1* %0 to i8*, !dbg !33 + %4 = bitcast %struct.t1* %1 to i8*, !dbg !34 + %5 = bitcast i32* %2 to i8*, !dbg !35 + tail call void @test1(i8* %3, i8* %4, i8* %5) #4, !dbg !36 + ret void, !dbg !37 +} + +; CHECK: .long 1 # BTF_KIND_STRUCT(id = 2) + +; CHECK: .ascii "r1" # string offset=1 +; CHECK: .ascii ".text" # string offset=29 +; CHECK: .ascii "0:0" # string offset=72 +; CHECK: .ascii "0:0:0" # string offset=76 +; CHECK: .ascii "0:0:0:0" # string offset=82 + +; CHECK: .long 12 # OffsetReloc +; 
CHECK-NEXT: .long 29 # Offset reloc section string offset=29 +; CHECK-NEXT: .long 3 +; CHECK-NEXT: .long .Ltmp{{[0-9]+}} +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long 72 +; CHECK-NEXT: .long .Ltmp{{[0-9]+}} +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long 76 +; CHECK-NEXT: .long .Ltmp{{[0-9]+}} +; CHECK-NEXT: .long 2 +; CHECK-NEXT: .long 82 + +; Function Attrs: nounwind readnone +declare %struct.s1* @llvm.preserve.struct.access.index.p0s_struct.s1s.p0s_struct.r1s(%struct.r1*, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare %struct.t1* @llvm.preserve.struct.access.index.p0s_struct.t1s.p0s_struct.s1s(%struct.s1*, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32* @llvm.preserve.struct.access.index.p0i32.p0s_struct.t1s(%struct.t1*, i32, i32) #1 + +declare dso_local void @test1(i8*, i8*, i8*) local_unnamed_addr #2 + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) #3 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind readnone speculatable willreturn } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0 (https://github.com/llvm/llvm-project.git 42b3328a2368b38fba6bdb0c616fe6c5520e3bc5)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp/home/yhs/work/tests/core") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 42b3328a2368b38fba6bdb0c616fe6c5520e3bc5)"} +!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 12, type: !8, scopeLine: 12, flags: DIFlagPrototyped, isDefinition: true, isOptimized: true, unit: !0, retainedNodes: !21) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "r1", file: !1, line: 7, size: 32, elements: !12) +!12 = !{!13} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !11, file: !1, line: 8, baseType: !14, size: 32) +!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s1", file: !1, line: 4, size: 32, elements: !15) +!15 = !{!16} +!16 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !14, file: !1, line: 5, baseType: !17, size: 32) +!17 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", file: !1, line: 1, size: 32, elements: !18) +!18 = !{!19} +!19 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !17, file: !1, 
line: 2, baseType: !20, size: 32) +!20 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!21 = !{!22, !23, !25, !27} +!22 = !DILocalVariable(name: "arg", arg: 1, scope: !7, file: !1, line: 12, type: !10) +!23 = !DILocalVariable(name: "ps", scope: !7, file: !1, line: 13, type: !24) +!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64) +!25 = !DILocalVariable(name: "pt", scope: !7, file: !1, line: 14, type: !26) +!26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 64) +!27 = !DILocalVariable(name: "pi", scope: !7, file: !1, line: 15, type: !28) +!28 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !20, size: 64) +!29 = !DILocation(line: 0, scope: !7) +!30 = !DILocation(line: 13, column: 19, scope: !7) +!31 = !DILocation(line: 14, column: 19, scope: !7) +!32 = !DILocation(line: 15, column: 13, scope: !7) +!33 = !DILocation(line: 16, column: 9, scope: !7) +!34 = !DILocation(line: 16, column: 13, scope: !7) +!35 = !DILocation(line: 16, column: 17, scope: !7) +!36 = !DILocation(line: 16, column: 3, scope: !7) +!37 = !DILocation(line: 17, column: 1, scope: !7) diff --git a/test/CodeGen/Hexagon/swp-stages5.ll b/test/CodeGen/Hexagon/swp-stages5.ll index 1f8463fbc30f..fdfb2101cd36 100644 --- a/test/CodeGen/Hexagon/swp-stages5.ll +++ b/test/CodeGen/Hexagon/swp-stages5.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s | FileCheck %s ; Very similar to swp-stages4.ll, but the pipelined schedule is a little ; different. diff --git a/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir b/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir index a93af1d54d39..c92d35f92391 100644 --- a/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir +++ b/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir @@ -44,6 +44,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir b/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir index 31efedd37960..5a0e0309e417 100644 --- a/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir +++ b/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir @@ -44,6 +44,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir b/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir index e3bcac22f136..8ad50b72c284 100644 --- a/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir +++ b/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir @@ -32,7 +32,7 @@ } ... 
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test1 liveins: @@ -56,14 +56,14 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) --- name: test2 liveins: @@ -87,14 +87,14 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) --- name: test3 liveins: @@ -118,13 +118,13 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... 
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test4 liveins: @@ -148,8 +148,8 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... diff --git a/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir b/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir index 005014d5e83f..67bf92b60814 100644 --- a/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir +++ b/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir @@ -7,7 +7,7 @@ # CHECK-NEXT: %namedVReg1353:vreg_64 = REG_SEQUENCE %namedVReg4354, %subreg.sub0, %namedVReg1352, %subreg.sub1 # CHECK-NEXT: %namedVReg1354:sgpr_128 = REG_SEQUENCE %namedVReg4354, %subreg.sub0, %namedVReg1352, %subreg.sub1, %namedVReg1358, %subreg.sub2, %namedVReg1359, %subreg.sub3 # This tests for the itereator invalidation fix (reviews.llvm.org/D62713) -# CHECK-NEXT: BUFFER_STORE_DWORD_ADDR64 %namedVReg1352, %namedVReg1353, %namedVReg1354, 0, 0, 0, 0, 0, 0, implicit $exec +# CHECK-NEXT: BUFFER_STORE_DWORD_ADDR64 %namedVReg1352, %namedVReg1353, %namedVReg1354, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... --- name: foo @@ -27,7 +27,7 @@ body: | %vreg123_3:vgpr_32 = COPY %5 %16:sgpr_128 = REG_SEQUENCE killed %vreg123_0, %subreg.sub0, %vreg123_1, %subreg.sub1, %vreg123_2, %subreg.sub2, %vreg123_3, %subreg.sub3 - BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir b/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir index 629f7aefd6af..c48b13e46e20 100644 --- a/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir +++ b/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir @@ -12,7 +12,7 @@ # CHECK: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' # CHECK: scratchWaveOffsetReg: '$sgpr50' # CHECK: frameOffsetReg: '$sgpr50' -# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) +# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) name: reserve_correct_register tracksRegLiveness: true machineFunctionInfo: @@ -25,6 +25,6 @@ stack: body: | bb.0: - renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) S_ENDPGM 0 ... 
diff --git a/test/CodeGen/MIR/AMDGPU/target-index-operands.mir b/test/CodeGen/MIR/AMDGPU/target-index-operands.mir index b43705eaf8c0..1864f1cbec2f 100644 --- a/test/CodeGen/MIR/AMDGPU/target-index-operands.mir +++ b/test/CodeGen/MIR/AMDGPU/target-index-operands.mir @@ -52,7 +52,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -82,6 +82,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/MSP430/selectcc.ll b/test/CodeGen/MSP430/selectcc.ll new file mode 100644 index 000000000000..c72079eac276 --- /dev/null +++ b/test/CodeGen/MSP430/selectcc.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=msp430-- < %s | FileCheck %s + +define i16 @select_to_shifts_i16(i16 %a, i16 %b) { +; CHECK-LABEL: select_to_shifts_i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.b r12, r12 +; CHECK-NEXT: swpb r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: swpb r12 +; CHECK-NEXT: sxt r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: and r13, r12 +; CHECK-NEXT: ret + %and = and i16 %a, 2 + %tobool = icmp eq i16 %and, 0 + %select = select i1 %tobool, i16 0, i16 %b + ret i16 %select +} + +define i32 @select_to_shifts_i32(i32 %a, i32 %b) { +; CHECK-LABEL: select_to_shifts_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov.b r13, r13 +; CHECK-NEXT: swpb r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: swpb r13 +; CHECK-NEXT: sxt r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: and r13, r14 +; CHECK-NEXT: and r15, r13 +; CHECK-NEXT: mov r14, r12 +; CHECK-NEXT: ret + %and = and i32 %a, 2 + %tobool = icmp eq i32 %and, 0 + %select = select i1 %tobool, i32 0, i32 %b + ret i32 %select +} diff --git a/test/CodeGen/Mips/micromips-ase-function-attribute.ll b/test/CodeGen/Mips/micromips-ase-function-attribute.ll index fe82b7c5b6cf..cd78166d372a 100644 --- a/test/CodeGen/Mips/micromips-ase-function-attribute.ll +++ b/test/CodeGen/Mips/micromips-ase-function-attribute.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=mips-unknown-linux -filetype=obj %s -o - | \ -; RUN: llvm-readobj --mips-abi-flags | \ +; RUN: llvm-readobj -A | \ ; RUN: FileCheck --check-prefix=ASE-MICROMIPS %s define void @_Z3foov() #0 { diff --git a/test/CodeGen/PowerPC/sh-overflow.mir b/test/CodeGen/PowerPC/sh-overflow.mir new file mode 100644 index 000000000000..31cd710c39ea --- /dev/null +++ b/test/CodeGen/PowerPC/sh-overflow.mir @@ -0,0 +1,58 @@ +# RUN: llc -O3 
-mtriple=powerpc64le-unknown-linux-gnu -start-after ppc-mi-peepholes -ppc-late-peephole -ppc-asm-full-reg-names -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: special_right_shift32_0 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: gprc } + - { id: 1, class: gprc } + - { id: 2, class: gprc } +liveins: + - { reg: '$r3', virtual-reg: '%0' } +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $r3 + + ; Ensure we do not attempt to transform this into srwi $r3, $r3, 0 in the + ; form specified by ISA 3.0b (rlwinm $r3, $r3, 32 - 0, 0, 31) + + ; CHECK-LABEL: special_right_shift32_0: + ; CHECK: slwi r[[#]], r[[#]], 0 + + %0:gprc = COPY killed $r3 + %1:gprc = LI 0 + %2:gprc = SRW killed %0, killed %1 + $r3 = COPY killed %2 + BLR implicit $lr, implicit $rm, implicit killed $r3 + +... +--- +name: special_right_shift64_0 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc } + - { id: 1, class: gprc } + - { id: 2, class: g8rc } +liveins: + - { reg: '$x3', virtual-reg: '%0' } +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x3 + + ; Ensure we do not attempt to transform this into srdi $r3, $r3, 0 in the + ; form specified by ISA 3.0b (rldicl $r3, $r3, 64 - 0, 0) + + ; CHECK-LABEL: special_right_shift64_0: + ; CHECK: rotldi r[[#]], r[[#]], 0 + + %0:g8rc = COPY killed $x3 + %1:gprc = LI 0 + %2:g8rc = SRD killed %0, killed %1 + $x3 = COPY killed %2 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + +... diff --git a/test/CodeGen/WebAssembly/cpus.ll b/test/CodeGen/WebAssembly/cpus.ll index 8ede6cbb5a71..01964e9c85ab 100644 --- a/test/CodeGen/WebAssembly/cpus.ll +++ b/test/CodeGen/WebAssembly/cpus.ll @@ -1,16 +1,17 @@ ; This tests that llc accepts all valid WebAssembly CPUs. ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s -; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=mvp 2>&1 | FileCheck %s +; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=mvp 2>&1 | FileCheck %s --check-prefix=WASM64 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s -; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=generic 2>&1 | FileCheck %s +; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=generic 2>&1 | FileCheck %s --check-prefix=WASM64 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s -; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=bleeding-edge 2>&1 | FileCheck %s +; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=bleeding-edge 2>&1 | FileCheck %s --check-prefix=WASM64 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID -; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID +; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=WASM64 ; CHECK-NOT: is not a recognized processor for this target ; INVALID: {{.+}} is not a recognized processor for this target +; WASM64: 64-bit WebAssembly (wasm64) is not currently supported define i32 @f(i32 %i_like_the_web) { ret i32 %i_like_the_web diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll index 
29793a7e0bc6..be88e3530a3c 100644 --- a/test/CodeGen/X86/avx512-arith.ll +++ b/test/CodeGen/X86/avx512-arith.ll @@ -190,6 +190,52 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) { ret <4 x i64>%z } +define <4 x i64> @imulq256_bcast(<4 x i64> %x) { +; AVX512F-LABEL: imulq256_bcast: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337] +; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512F-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: imulq256_bcast: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337] +; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: imulq256_bcast: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337] +; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm2 +; AVX512BW-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512BW-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 +; AVX512BW-NEXT: vpsllq $32, %ymm0, %ymm0 +; AVX512BW-NEXT: vpaddq %ymm0, %ymm2, %ymm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: imulq256_bcast: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 +; AVX512DQ-NEXT: vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: imulq256_bcast: +; SKX: # %bb.0: +; SKX-NEXT: vpmullq {{.*}}(%rip){1to4}, %ymm0, %ymm0 +; SKX-NEXT: retq + %z = mul <4 x i64> %x, + ret <4 x i64>%z +} + define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { ; AVX512F-LABEL: imulq128: ; AVX512F: # %bb.0: @@ -244,6 +290,54 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) { ret <2 x i64>%z } +define <2 x i64> @imulq128_bcast(<2 x i64> %x) { +; AVX512F-LABEL: imulq128_bcast: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512F-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512F-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512F-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512F-NEXT: retq +; +; AVX512VL-LABEL: imulq128_bcast: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512VL-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512VL-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512VL-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512VL-NEXT: retq +; +; AVX512BW-LABEL: imulq128_bcast: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm2 +; AVX512BW-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512BW-NEXT: vpmuludq %xmm1, %xmm0, %xmm0 +; AVX512BW-NEXT: vpsllq $32, %xmm0, %xmm0 +; AVX512BW-NEXT: vpaddq %xmm0, %xmm2, %xmm0 +; AVX512BW-NEXT: retq +; +; AVX512DQ-LABEL: imulq128_bcast: +; AVX512DQ: # %bb.0: +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [8086,8086] +; AVX512DQ-NEXT: vpmullq %zmm1, %zmm0, %zmm0 +; AVX512DQ-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 +; AVX512DQ-NEXT: vzeroupper +; AVX512DQ-NEXT: retq +; +; SKX-LABEL: imulq128_bcast: +; SKX: # %bb.0: +; SKX-NEXT: vpmullq {{.*}}(%rip), 
%xmm0, %xmm0 +; SKX-NEXT: retq + %z = mul <2 x i64> %x, + ret <2 x i64>%z +} + define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) { ; CHECK-LABEL: mulpd512: ; CHECK: # %bb.0: # %entry diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 5fb114b3523a..b13c27e0d470 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -531,211 +531,256 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: pushq %r12 ; KNL-NEXT: pushq %rbx ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k2 -; KNL-NEXT: movb 
{{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 -; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kmovw %esi, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 
+; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k6 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k5 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k4 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: korw %k0, %k2, %k0 
+; KNL-NEXT: kmovw %esi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: korw %k2, 
%k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: korw %k2, %k0, %k0 -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: korw %k2, %k5, %k2 ; KNL-NEXT: xorl %ecx, %ecx ; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp) ; KNL-NEXT: movl $65535, %edx ## imm = 0xFFFF ; KNL-NEXT: movl $0, %esi ; KNL-NEXT: cmovnel %edx, %esi -; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: korw %k2, %k4, %k2 ; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp) +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: korw %k2, %k3, %k2 ; KNL-NEXT: cmovnel %edx, %ecx +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kandw %k1, %k2, %k1 ; KNL-NEXT: kmovw %k1, %r8d @@ -832,193 +877,294 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: pushq %r13 ; SKX-NEXT: pushq %r12 ; SKX-NEXT: pushq %rbx -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: movq %rdi, %rax -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k0, %k0 -; SKX-NEXT: kshiftrd $30, %k0, %k0 -; SKX-NEXT: kxord %k0, %k2, %k2 -; SKX-NEXT: kshiftrd $2, %k2, %k3 -; SKX-NEXT: kxord %k1, %k3, %k1 -; SKX-NEXT: kshiftld $31, %k1, %k1 -; SKX-NEXT: kshiftrd $29, %k1, %k1 -; SKX-NEXT: kxord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $31, %k0, %k1 +; SKX-NEXT: kshiftld $2, %k0, %k0 +; SKX-NEXT: kord %k0, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 ; SKX-NEXT: kshiftrd $3, %k1, %k2 +; SKX-NEXT: kshiftld $3, %k2, %k2 +; SKX-NEXT: kshiftld $30, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $29, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $4, %k1, %k2 +; SKX-NEXT: kshiftld $4, %k2, %k2 +; SKX-NEXT: kshiftld $29, %k1, %k1 +; SKX-NEXT: kshiftrd $29, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $28, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: 
kshiftrd $5, %k1, %k2 +; SKX-NEXT: kshiftld $5, %k2, %k2 +; SKX-NEXT: kshiftld $28, %k1, %k1 +; SKX-NEXT: kshiftrd $28, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $4, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $27, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $5, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $6, %k1, %k2 +; SKX-NEXT: kshiftld $6, %k2, %k2 +; SKX-NEXT: kshiftld $27, %k1, %k1 +; SKX-NEXT: kshiftrd $27, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $26, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $6, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $7, %k1, %k2 +; SKX-NEXT: kshiftld $7, %k2, %k2 +; SKX-NEXT: kshiftld $26, %k1, %k1 +; SKX-NEXT: kshiftrd $26, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $25, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kshiftld $8, %k2, %k2 +; SKX-NEXT: kshiftld $25, %k1, %k1 +; SKX-NEXT: kshiftrd $25, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $7, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $24, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kshiftld $9, %k2, %k2 +; SKX-NEXT: kshiftld $24, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $24, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $23, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $10, %k1, %k2 +; SKX-NEXT: kshiftld $10, %k2, %k2 +; SKX-NEXT: kshiftld $23, %k1, %k1 +; SKX-NEXT: kshiftrd $23, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $22, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $11, %k1, %k2 +; SKX-NEXT: kshiftld $11, %k2, %k2 +; SKX-NEXT: kshiftld $22, %k1, %k1 +; SKX-NEXT: kshiftrd $22, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $10, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $21, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $11, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $12, %k1, %k2 +; SKX-NEXT: kshiftld $12, %k2, %k2 +; SKX-NEXT: kshiftld $21, %k1, %k1 +; SKX-NEXT: kshiftrd $21, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $20, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $12, %k1, %k2 -; SKX-NEXT: kmovb 
{{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $13, %k1, %k2 +; SKX-NEXT: kshiftld $13, %k2, %k2 +; SKX-NEXT: kshiftld $20, %k1, %k1 +; SKX-NEXT: kshiftrd $20, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $19, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kshiftld $14, %k2, %k2 +; SKX-NEXT: kshiftld $19, %k1, %k1 +; SKX-NEXT: kshiftrd $19, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $13, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $18, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kshiftld $15, %k2, %k2 +; SKX-NEXT: kshiftld $18, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $18, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $17, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k2 +; SKX-NEXT: kshiftld $16, %k2, %k2 +; SKX-NEXT: kshiftld $17, %k1, %k1 +; SKX-NEXT: kshiftrd $17, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $16, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $17, %k1, %k2 +; SKX-NEXT: kshiftld $17, %k2, %k2 +; SKX-NEXT: kshiftld $16, %k1, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $16, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $15, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kmovd %esi, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $31, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kmovd %edx, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $3, %k0, %k2 +; SKX-NEXT: kshiftld $3, %k2, %k2 +; SKX-NEXT: kshiftld $30, %k0, %k0 +; SKX-NEXT: kshiftrd $30, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovd %ecx, %k2 -; SKX-NEXT: kmovd %esi, %k3 -; SKX-NEXT: kxord %k0, %k3, %k0 -; SKX-NEXT: kshiftrd $2, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $29, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $3, %k0, %k2 -; SKX-NEXT: kmovd %r8d, %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $4, %k0, %k2 +; SKX-NEXT: kshiftld $4, %k2, %k2 +; SKX-NEXT: kshiftld $29, %k0, %k0 +; SKX-NEXT: kshiftrd $29, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovd %r8d, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $28, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $4, %k0, %k2 -; SKX-NEXT: kmovd %r9d, %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $5, %k0, %k2 +; SKX-NEXT: kshiftld $5, %k2, %k2 +; SKX-NEXT: kshiftld $28, 
%k0, %k0 +; SKX-NEXT: kshiftrd $28, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovd %r9d, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $27, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $6, %k0, %k2 +; SKX-NEXT: kshiftld $6, %k2, %k2 +; SKX-NEXT: kshiftld $27, %k0, %k0 +; SKX-NEXT: kshiftrd $27, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $5, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $26, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $6, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $25, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $7, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $7, %k2, %k2 +; SKX-NEXT: kshiftld $26, %k0, %k0 +; SKX-NEXT: kshiftrd $26, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $24, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $25, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $8, %k0, %k2 +; SKX-NEXT: kshiftld $8, %k2, %k2 +; SKX-NEXT: kshiftld $25, %k0, %k0 +; SKX-NEXT: kshiftrd $25, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $8, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $23, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $24, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $9, %k0, %k2 +; SKX-NEXT: kshiftld $9, %k2, %k2 +; SKX-NEXT: kshiftld $24, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $22, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $24, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $23, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $10, %k0, %k2 +; SKX-NEXT: kshiftld $10, %k2, %k2 +; SKX-NEXT: kshiftld $23, %k0, %k0 +; SKX-NEXT: kshiftrd $23, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $22, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $11, %k0, %k2 +; SKX-NEXT: kshiftld $11, %k2, %k2 +; SKX-NEXT: kshiftld $22, %k0, %k0 +; SKX-NEXT: kshiftrd $22, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $21, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $12, %k0, %k2 +; SKX-NEXT: kshiftld $12, %k2, %k2 +; SKX-NEXT: kshiftld $21, %k0, %k0 +; SKX-NEXT: kshiftrd $21, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $11, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $20, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $12, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $19, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; 
SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $13, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $13, %k2, %k2 +; SKX-NEXT: kshiftld $20, %k0, %k0 +; SKX-NEXT: kshiftrd $20, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $18, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $19, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $14, %k0, %k2 +; SKX-NEXT: kshiftld $14, %k2, %k2 +; SKX-NEXT: kshiftld $19, %k0, %k0 +; SKX-NEXT: kshiftrd $19, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $14, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $17, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $18, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $15, %k0, %k2 +; SKX-NEXT: kshiftld $15, %k2, %k2 +; SKX-NEXT: kshiftld $18, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $16, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $18, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $17, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k2 +; SKX-NEXT: kshiftld $16, %k2, %k2 +; SKX-NEXT: kshiftld $17, %k0, %k0 +; SKX-NEXT: kshiftrd $17, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $16, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $17, %k0, %k2 +; SKX-NEXT: kshiftld $17, %k2, %k2 +; SKX-NEXT: kshiftld $16, %k0, %k0 +; SKX-NEXT: kshiftrd $16, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $15, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r8d @@ -1113,215 +1259,262 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: pushl %ebx ; KNL_X32-NEXT: pushl %edi ; KNL_X32-NEXT: pushl %esi +; KNL_X32-NEXT: subl $20, %esp ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 ; 
KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $3, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $4, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $5, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $6, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $7, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $8, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $9, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $10, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k6 +; KNL_X32-NEXT: korw %k1, %k6, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $11, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k5 +; KNL_X32-NEXT: korw %k1, %k5, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; 
KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k4 +; KNL_X32-NEXT: korw %k1, %k4, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $13, %k1, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $8, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $9, %k0, %k2 +; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $10, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $11, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $12, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: 
kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $2, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 ; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k1, %k3, %k1 -; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 
-; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $9, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $10, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $8, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $11, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $9, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $12, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k6, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $13, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k5, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k4, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 +; KNL_X32-NEXT: korw 
%k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $1, %k1, %k1 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k3, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 ; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: xorl %eax, %eax ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl $65535, %ecx ## imm = 0xFFFF ; KNL_X32-NEXT: movl $0, %edx ; KNL_X32-NEXT: cmovnel %ecx, %edx +; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %bl +; KNL_X32-NEXT: kmovw %ebx, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %edx, %k1 ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: cmovnel %ecx, %eax -; KNL_X32-NEXT: kandw %k0, %k1, %k0 -; KNL_X32-NEXT: kmovw %edx, %k1 +; KNL_X32-NEXT: kmovw (%esp), %k2 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kandw %k1, %k2, %k1 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1403,6 +1596,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: orl %esi, %ecx ; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: movw %cx, (%eax) +; KNL_X32-NEXT: addl $20, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: popl %edi ; KNL_X32-NEXT: popl %ebx @@ -1416,356 +1610,550 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL-LABEL: test17: ; KNL: ## %bb.0: ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, 
%k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; 
KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $3, %k3, %k3 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $4, %k3, %k3 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $5, %k3, %k3 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; 
KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $6, %k3, %k3 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k3 +; KNL-NEXT: kshiftlw $7, %k3, %k3 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k1, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: korw %k0, %k3, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $14, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $3, %k4, %k4 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $4, %k4, %k4 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $5, %k4, %k4 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: kshiftlw $7, %k4, %k4 +; KNL-NEXT: 
kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k1, %k5, %k5 -; KNL-NEXT: kshiftrw $2, %k5, %k6 -; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: korw %k0, %k4, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $14, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $3, %k5, %k5 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $12, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $4, %k5, %k5 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $11, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $5, %k5, %k5 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $10, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $6, %k5, %k5 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k5 +; KNL-NEXT: kshiftlw $7, %k5, %k5 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 ; KNL-NEXT: kshiftrw $9, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k1, %k6, %k6 -; KNL-NEXT: kshiftrw $2, %k6, %k7 -; KNL-NEXT: kxorw %k0, %k7, %k0 +; KNL-NEXT: korw 
%k0, %k5, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $14, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $3, %k6, %k6 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $4, %k6, %k6 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $5, %k6, %k6 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $6, %k6, %k6 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $10, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k6 +; KNL-NEXT: kshiftlw $7, %k6, %k6 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: korw %k0, %k6, %k6 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil ; KNL-NEXT: kmovw %edi, %k0 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kshiftrw $2, %k0, %k2 -; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $14, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: 
kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $13, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $12, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $11, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $10, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $9, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k7 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %edx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k7 -; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k2 ; KNL-NEXT: 
kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftrw $2, %k1, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; 
KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $6, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kandw %k7, %k0, %k0 ; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: kandw %k4, %k0, %k0 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kandw %k3, %k0, %k0 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload @@ -1808,300 +2196,488 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-LABEL: test17: ; SKX: ## %bb.0: ; SKX-NEXT: movq %rdi, %rax -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kshiftlb $7, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k0 -; SKX-NEXT: kxorb %k0, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k3 -; SKX-NEXT: kxorb %k1, %k3, %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kxorb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $7, %k0, %k1 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: korb %k0, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kshiftlb $6, %k1, %k1 +; SKX-NEXT: kshiftrb $6, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $4, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k2 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftlb $5, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $3, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k3, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $6, %k1, %k2 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftlb $3, %k1, %k1 +; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: 
kshiftlb $7, %k3, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $7, %k1, %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftlb $2, %k1, %k1 +; SKX-NEXT: kshiftrb $2, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $1, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kxorb %k0, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k2 +; SKX-NEXT: korb %k0, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k3 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $4, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $3, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $5, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $2, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $6, %k2, %k3 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftlb $2, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 ; SKX-NEXT: kshiftrb $1, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kandb %k1, %k2, %k1 -; SKX-NEXT: kxorb %k0, %k4, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftrb $7, %k2, %k2 +; SKX-NEXT: korb %k0, %k2, %k2 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 
; SKX-NEXT: kshiftrb $3, %k2, %k3 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k3 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $4, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k3 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kshiftrb $4, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $3, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftrb $5, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k3 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $2, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftrb $6, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftlb $2, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $1, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k3, %k5, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k3 +; SKX-NEXT: korb %k0, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $3, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 -; SKX-NEXT: kshiftrb $4, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k4 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 ; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; 
SKX-NEXT: kshiftrb $3, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k4 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $2, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftlb $2, %k3, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $6, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftrb $1, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k4, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 -; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kshiftrb $5, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kandb %k3, %k4, %k3 +; SKX-NEXT: 
kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kandb %k2, %k3, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k3, %k5, %k3 -; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: korb %k0, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k4 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftlb $5, %k3, %k3 ; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $3, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $4, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k4 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $2, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $6, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftlb $2, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftrb $1, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; 
SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $4, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kshiftrb $5, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kandb %k3, %k4, %k3 -; SKX-NEXT: kxorb %k0, %k7, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 -; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $3, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, 
%k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kmovd %esi, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $7, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kmovd %edx, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 ; SKX-NEXT: kmovd %ecx, %k5 -; SKX-NEXT: kmovd %esi, %k6 -; SKX-NEXT: kxorb %k0, %k6, %k0 -; SKX-NEXT: kshiftrb $2, %k0, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $3, %k0, %k5 -; SKX-NEXT: kmovd %r8d, %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k0, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovd %r8d, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $4, %k0, %k5 -; SKX-NEXT: kmovd %r9d, %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k0, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovd %r9d, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $5, %k0, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k0, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: korb %k0, %k5, %k0 ; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kandb %k2, %k0, %k0 @@ -2144,362 +2720,557 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-LABEL: test17: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: pushl %ebx -; KNL_X32-NEXT: subl $8, %esp +; KNL_X32-NEXT: pushl %eax ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; 
KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw 
$2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kmovw %k0, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k1, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $2, %k3, %k4 -; KNL_X32-NEXT: kxorw %k0, %k4, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $3, 
%k3, %k3 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $4, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $5, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $6, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $7, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 ; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k3, %k3 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k1, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $2, %k4, %k5 -; KNL_X32-NEXT: kxorw %k0, %k5, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $14, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $3, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: 
kshiftrw $13, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $4, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $5, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $6, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $7, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 ; KNL_X32-NEXT: kshiftrw $9, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k4 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k1, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $2, %k5, %k6 -; KNL_X32-NEXT: kxorw %k0, %k6, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $14, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $3, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $4, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $11, %k5, 
%k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $5, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $6, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $7, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 ; KNL_X32-NEXT: kshiftrw $9, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k5 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k1, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $2, %k6, %k7 -; KNL_X32-NEXT: kxorw %k0, %k7, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $14, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $3, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $4, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $5, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: 
kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $6, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $7, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 ; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kxorw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $14, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $3, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $13, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $4, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $12, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $5, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $11, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $6, %k7, %k7 
+; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $10, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $7, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $9, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k7 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k7 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kxorw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k0, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; 
KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k1, %k2, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kxorw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $13, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $12, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $11, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $10, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: kandw %k7, %k0, %k0 ; KNL_X32-NEXT: kandw %k6, %k0, %k0 ; 
KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 -; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload -; KNL_X32-NEXT: kandw %k1, %k0, %k0 -; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw (%esp), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 @@ -2537,7 +3308,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andb $127, %cl ; KNL_X32-NEXT: movb %cl, (%eax) -; KNL_X32-NEXT: addl $8, %esp +; KNL_X32-NEXT: addl $4, %esp ; KNL_X32-NEXT: popl %ebx ; KNL_X32-NEXT: retl $4 %j = and <7 x i1> %a, %b diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 20af81948256..fcb07a504067 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -1886,410 +1886,495 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: # %bb.0: -; KNL-NEXT: kmovw %edx, %k1 -; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kshiftlw $3, %k0, %k3 +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k4 +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k5 +; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k6 +; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k7 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw 
%eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; 
KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k2, %k4, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: 
kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; 
KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k0, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k3, %k5, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k4, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k4 +; KNL-NEXT: 
kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; 
KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $15, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k0, %k5, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k5 -; KNL-NEXT: kxorw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $2, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, 
%k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: korw %k7, %k6, %k7 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: korw %k7, %k2, %k7 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $8, %k7, %k7 +; KNL-NEXT: korw %k7, %k3, %k7 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $9, %k7, %k7 +; KNL-NEXT: korw %k7, %k4, %k7 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k5, %k6 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k6 +; KNL-NEXT: kshiftlw $11, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k6, %k2, %k5 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k4 +; KNL-NEXT: kshiftlw $12, %k5, %k5 +; KNL-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k5, %k2, %k4 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $13, %k4, %k4 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k4, %k2, %k3 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $14, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k0, %k4 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k4} {z} +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k2 +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm4, %ymm4 ; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k3} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm5, %ymm4 ; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm6, %ymm4 ; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm7, %ymm4 ; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0 ; KNL-NEXT: retq ; @@ -2304,410 +2389,495 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; ; AVX512DQNOBW-LABEL: test21: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: kmovw %edx, %k0 -; AVX512DQNOBW-NEXT: kmovw %edi, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQNOBW-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQNOBW-NEXT: kmovw %edi, %k0 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kmovw %esi, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k3 +; AVX512DQNOBW-NEXT: kmovw %edx, %k1 +; 
AVX512DQNOBW-NEXT: kshiftlw $2, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k3, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512DQNOBW-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %ecx, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %r8d, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %r9d, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k4 +; AVX512DQNOBW-NEXT: kmovw %ecx, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k4, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k5 +; AVX512DQNOBW-NEXT: kmovw %r8d, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k5, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k6 +; AVX512DQNOBW-NEXT: kmovw %r9d, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k6, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k7 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k7, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k2 +; 
AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) 
# 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k0, %k1, %k0 +; AVX512DQNOBW-NEXT: korw %k0, %k2, %k0 ; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k1, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQNOBW-NEXT: kxorw %k2, %k4, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 ; 
AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k6, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: 
korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k1, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k5 -; AVX512DQNOBW-NEXT: kxorw %k3, %k5, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, 
%k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k6, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; 
AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw 
%k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k1, %k5, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k5 -; AVX512DQNOBW-NEXT: kxorw %k4, %k5, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; 
AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k6, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k1, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k3, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k4, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: 
korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k5, %k6 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k6, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k6 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k6, %k1, %k5 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k5, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k5, %k5 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k5, %k1, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k4, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k4, %k4 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k4, %k1, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k3, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 ; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQNOBW-NEXT: vpmovm2d %k3, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm5, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX512DQNOBW-NEXT: vpmovm2d %k2, %zmm4 -; 
AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm6, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm4, %ymm3 -; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm7, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm4, %ymm0 ; AVX512DQNOBW-NEXT: retq %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index d37220222ce7..6e36bd1bb0eb 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -302,12 +302,15 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movb (%rdi), %al ; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $10, %k0, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $ax killed $ax killed $eax ; KNL-NEXT: retq @@ -316,11 +319,14 @@ define i16 @test16(i1 *%addr, i16 %a) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrw $10, %k1, %k2 -; SKX-NEXT: kxorw %k0, %k2, %k0 +; SKX-NEXT: kshiftrw $11, %k1, %k2 +; SKX-NEXT: kshiftlw $11, %k2, %k2 +; SKX-NEXT: kshiftlw $6, %k1, %k1 +; SKX-NEXT: kshiftrw $6, %k1, %k1 ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $5, %k0, %k0 -; SKX-NEXT: kxorw %k0, %k1, %k0 +; SKX-NEXT: korw %k0, %k2, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $ax killed $ax killed $eax ; SKX-NEXT: retq @@ -336,12 +342,15 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movb (%rdi), %al ; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $al killed $al killed $eax ; KNL-NEXT: retq @@ -350,11 +359,14 @@ define i8 @test17(i1 *%addr, i8 %a) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrb $4, %k1, %k2 -; SKX-NEXT: kxorb %k0, %k2, %k0 +; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $3, %k0, %k0 -; SKX-NEXT: kxorb %k0, %k1, %k0 +; SKX-NEXT: korb %k0, %k2, %k0 +; SKX-NEXT: korb %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $al killed $al killed $eax ; SKX-NEXT: retq @@ -790,12 +802,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: 
kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: orl %ecx, %eax ; KNL-NEXT: vzeroupper @@ -808,12 +823,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1 ; SKX-NEXT: kunpckwd %k0, %k1, %k0 -; SKX-NEXT: kshiftrd $4, %k0, %k1 +; SKX-NEXT: kshiftrd $5, %k0, %k1 +; SKX-NEXT: kshiftld $5, %k1, %k1 +; SKX-NEXT: kshiftld $28, %k0, %k0 +; SKX-NEXT: kshiftrd $28, %k0, %k0 ; SKX-NEXT: kmovd %eax, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftld $31, %k1, %k1 -; SKX-NEXT: kshiftrd $27, %k1, %k1 -; SKX-NEXT: kxord %k1, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $27, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -832,12 +850,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -848,12 +869,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; SKX-NEXT: kshiftrb $2, %k0, %k1 -; SKX-NEXT: kmovd %eax, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $3, %k0, %k1 +; SKX-NEXT: kshiftlb $3, %k1, %k1 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $al killed $al killed $eax ; SKX-NEXT: retq diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index a526518c3fe6..e7f132bcdc67 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2250,8 +2250,7 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_add_epi64_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xd4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_add_epi64_rmb: @@ -2269,10 +2268,9 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* 
%ptr_b, <8 x i64> ; X86-LABEL: test_mask_add_epi64_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xd4,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -2293,10 +2291,9 @@ define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) ; X86-LABEL: test_mask_add_epi64_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xd4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_add_epi64_rmbkz: @@ -2418,8 +2415,7 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_sub_epi64_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xfb,0xc1] +; X86-NEXT: vpsubq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xfb,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_sub_epi64_rmb: @@ -2437,10 +2433,9 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> ; X86-LABEL: test_mask_sub_epi64_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpsubq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpsubq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xfb,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -2461,10 +2456,9 @@ define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) ; X86-LABEL: test_mask_sub_epi64_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpsubq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xfb,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_sub_epi64_rmbkz: diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index ea9742a57621..19f9a2a2bd54 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1069,12 +1069,16 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftrw $5, %k0, %k4 -; KNL-NEXT: kxnorw %k0, %k0, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: movb $1, %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k5, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -1091,24 +1095,32 @@ define <64 x i8> @test16(i64 %x) { ; SKX-LABEL: test16: ; SKX: ## %bb.0: ; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrq $5, %k0, %k2 -; SKX-NEXT: kxorq %k1, %k2, %k1 -; SKX-NEXT: kshiftlq $63, %k1, %k1 -; SKX-NEXT: kshiftrq $58, %k1, %k1 -; SKX-NEXT: kxorq %k1, %k0, %k0 +; SKX-NEXT: kshiftrq $6, %k0, %k1 +; SKX-NEXT: kshiftlq $6, %k1, %k1 +; SKX-NEXT: kshiftlq $59, %k0, %k0 +; SKX-NEXT: kshiftrq $59, %k0, %k0 +; SKX-NEXT: movb $1, %al +; SKX-NEXT: kmovd %eax, %k2 +; SKX-NEXT: kshiftlq $63, %k2, %k2 +; SKX-NEXT: kshiftrq $58, %k2, %k2 +; SKX-NEXT: korq %k2, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq ; ; AVX512BW-LABEL: test16: ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: kmovq %rdi, %k0 -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kxorq %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 -; AVX512BW-NEXT: kxorq %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kshiftlq $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlq $59, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $59, %k0, %k0 +; AVX512BW-NEXT: movb $1, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kshiftlq $63, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k2, %k2 +; AVX512BW-NEXT: korq %k2, %k1, %k1 +; AVX512BW-NEXT: korq %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1116,27 +1128,31 @@ define <64 x i8> @test16(i64 %x) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: movl %edi, %ecx -; AVX512DQ-NEXT: kmovw %edi, %k0 +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: shrq $32, %rdi ; AVX512DQ-NEXT: shrq $48, %rax ; AVX512DQ-NEXT: shrl $16, %ecx -; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4 
-; AVX512DQ-NEXT: kxnorw %k0, %k0, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $6, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 +; AVX512DQ-NEXT: movb $1, %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kshiftlw $15, %k5, %k5 +; AVX512DQ-NEXT: kshiftrw $10, %k5, %k5 +; AVX512DQ-NEXT: korw %k5, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1145,12 +1161,16 @@ define <64 x i8> @test16(i64 %x) { ; X86-LABEL: test16: ; X86: ## %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 -; X86-NEXT: kshiftrq $5, %k0, %k1 -; X86-NEXT: kxnorw %k0, %k0, %k2 -; X86-NEXT: kxorq %k2, %k1, %k1 -; X86-NEXT: kshiftlq $63, %k1, %k1 -; X86-NEXT: kshiftrq $58, %k1, %k1 -; X86-NEXT: kxorq %k1, %k0, %k0 +; X86-NEXT: kshiftrq $6, %k0, %k1 +; X86-NEXT: kshiftlq $6, %k1, %k1 +; X86-NEXT: kshiftlq $59, %k0, %k0 +; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: movb $1, %al +; X86-NEXT: kmovd %eax, %k2 +; X86-NEXT: kshiftlq $63, %k2, %k2 +; X86-NEXT: kshiftrq $58, %k2, %k2 +; X86-NEXT: korq %k2, %k1, %k1 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1174,12 +1194,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: cmpl %edx, %esi ; KNL-NEXT: setg %al -; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k5, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -1198,12 +1221,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi ; SKX-NEXT: setg %al -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftrq $5, %k0, %k2 -; SKX-NEXT: kxorq %k1, %k2, %k1 -; SKX-NEXT: kshiftlq $63, %k1, %k1 -; SKX-NEXT: kshiftrq $58, %k1, %k1 -; SKX-NEXT: kxorq %k1, %k0, %k0 +; SKX-NEXT: kshiftrq $6, %k0, %k1 +; SKX-NEXT: kshiftlq $6, %k1, %k1 +; SKX-NEXT: kshiftlq $59, %k0, %k0 +; SKX-NEXT: kshiftrq $59, %k0, %k0 +; SKX-NEXT: kmovd %eax, %k2 +; SKX-NEXT: kshiftlq $63, %k2, %k2 +; SKX-NEXT: kshiftrq $58, %k2, %k2 +; SKX-NEXT: korq %k2, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq ; @@ -1212,12 +1238,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512BW-NEXT: kmovq %rdi, %k0 ; AVX512BW-NEXT: cmpl %edx, %esi ; AVX512BW-NEXT: setg %al -; 
AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kxorq %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 -; AVX512BW-NEXT: kxorq %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kshiftlq $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlq $59, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $59, %k0, %k0 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kshiftlq $63, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k2, %k2 +; AVX512BW-NEXT: korq %k2, %k1, %k1 +; AVX512BW-NEXT: korq %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1225,29 +1254,32 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: movl %edi, %ecx -; AVX512DQ-NEXT: kmovw %edi, %k0 +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: shrq $32, %rdi ; AVX512DQ-NEXT: shrq $48, %rax ; AVX512DQ-NEXT: shrl $16, %ecx -; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 ; AVX512DQ-NEXT: cmpl %edx, %esi ; AVX512DQ-NEXT: setg %al -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $6, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 ; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k5, %k5 +; AVX512DQ-NEXT: kshiftrw $10, %k5, %k5 +; AVX512DQ-NEXT: korw %k5, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1259,12 +1291,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setg %al -; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: kshiftrq $5, %k0, %k2 -; X86-NEXT: kxorq %k1, %k2, %k1 -; X86-NEXT: kshiftlq $63, %k1, %k1 -; X86-NEXT: kshiftrq $58, %k1, %k1 -; X86-NEXT: kxorq %k1, %k0, %k0 +; X86-NEXT: kshiftrq $6, %k0, %k1 +; X86-NEXT: kshiftlq $6, %k1, %k1 +; X86-NEXT: kshiftlq $59, %k0, %k0 +; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: kmovd %eax, %k2 +; X86-NEXT: kshiftlq $63, %k2, %k2 +; X86-NEXT: kshiftrq $58, %k2, %k2 +; X86-NEXT: korq %k2, %k1, %k1 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1281,10 +1316,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kshiftrw $8, %k1, %k2 ; KNL-NEXT: kshiftrw $9, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k0, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k3 ; KNL-NEXT: kshiftlw $6, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kshiftlw $9, %k0, %k0 ; KNL-NEXT: kshiftrw $9, %k0, %k0 ; 
KNL-NEXT: kshiftlw $7, %k2, %k1 @@ -1301,10 +1338,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; SKX-NEXT: kmovd %esi, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: kshiftrw $9, %k1, %k1 -; SKX-NEXT: kshiftrb $6, %k0, %k3 -; SKX-NEXT: kxorb %k1, %k3, %k1 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k3 ; SKX-NEXT: kshiftlb $6, %k1, %k1 -; SKX-NEXT: kxorb %k1, %k0, %k0 +; SKX-NEXT: korb %k1, %k3, %k1 +; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 ; SKX-NEXT: kshiftrb $1, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k2, %k1 @@ -1318,10 +1357,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k3 -; AVX512BW-NEXT: kxorw %k1, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $6, %k1, %k1 -; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $7, %k2, %k1 @@ -1337,10 +1378,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512DQ-NEXT: kmovw %esi, %k1 ; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 ; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1 -; AVX512DQ-NEXT: kshiftrb $6, %k0, %k3 -; AVX512DQ-NEXT: kxorb %k1, %k3, %k1 +; AVX512DQ-NEXT: kshiftlb $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k3 ; AVX512DQ-NEXT: kshiftlb $6, %k1, %k1 -; AVX512DQ-NEXT: kxorb %k1, %k0, %k0 +; AVX512DQ-NEXT: korb %k1, %k3, %k1 +; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 @@ -1357,10 +1400,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kshiftrw $8, %k1, %k2 ; X86-NEXT: kshiftrw $9, %k1, %k1 -; X86-NEXT: kshiftrb $6, %k0, %k3 -; X86-NEXT: kxorb %k1, %k3, %k1 +; X86-NEXT: kshiftlb $7, %k0, %k3 +; X86-NEXT: kshiftlb $2, %k0, %k0 +; X86-NEXT: kshiftrb $2, %k0, %k0 ; X86-NEXT: kshiftlb $6, %k1, %k1 -; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: korb %k1, %k3, %k1 +; X86-NEXT: korb %k1, %k0, %k0 ; X86-NEXT: kshiftlb $1, %k0, %k0 ; X86-NEXT: kshiftrb $1, %k0, %k0 ; X86-NEXT: kshiftlb $7, %k2, %k1 @@ -2748,403 +2793,488 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; KNL-LABEL: store_64i1: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw %ecx, %k0 -; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k3 +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 ; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; 
KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kshiftlw $4, %k0, %k4 +; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k5 +; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k6 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k7 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; 
KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k2, %k4, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kmovw %k3, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; 
KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k1, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k3, %k5, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k4, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw 
%k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw 
$11, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k1, %k5, %k1 -; KNL-NEXT: kshiftrw $2, %k1, %k5 -; KNL-NEXT: kxorw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: 
kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $15, %k7, %k7 +; KNL-NEXT: korw %k2, %k7, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $2, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: korw %k7, %k6, %k7 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $8, %k7, %k7 +; KNL-NEXT: korw %k7, %k3, %k7 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k2 +; KNL-NEXT: korw 
%k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $9, %k7, %k7 +; KNL-NEXT: korw %k7, %k4, %k7 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k5, %k6 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k2 +; KNL-NEXT: korw %k6, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k6 +; KNL-NEXT: kshiftlw $11, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k6, %k1, %k5 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: korw %k5, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k5, %k5 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k5, %k1, %k4 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k2 +; KNL-NEXT: korw %k4, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $13, %k4, %k4 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k4, %k1, %k3 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k2 +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $14, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k3, %k1, %k1 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kmovw %k1, 6(%rdi) -; KNL-NEXT: kmovw %k3, 4(%rdi) -; KNL-NEXT: kmovw %k2, 2(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: kmovw %k0, 4(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: kmovw %k0, 2(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; KNL-NEXT: kmovw %k0, (%rdi) ; KNL-NEXT: retq ; @@ -3166,403 +3296,488 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; AVX512DQ-LABEL: store_64i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: kmovw %ecx, %k0 -; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQ-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQ-NEXT: kmovw %esi, 
%k0 ; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: kmovw %edx, %k1 +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k3 +; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kshiftlw $2, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k3, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 -; AVX512DQ-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQ-NEXT: kshiftrw $3, %k0, %k2 -; AVX512DQ-NEXT: kmovw %r8d, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $4, %k0, %k2 -; AVX512DQ-NEXT: kmovw %r9d, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k4 +; AVX512DQ-NEXT: kmovw %r8d, %k1 +; AVX512DQ-NEXT: kshiftlw $3, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k4, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k5 +; AVX512DQ-NEXT: kmovw %r9d, %k1 +; AVX512DQ-NEXT: kshiftlw $4, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k5, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k6 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $5, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k6, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k7 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k7, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $7, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, 
%k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $8, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $9, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $10, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $13, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; 
AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k0, %k1, %k0 +; AVX512DQ-NEXT: korw %k0, %k2, %k0 ; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k1, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQ-NEXT: kxorw %k2, %k4, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQ-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $12, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $11, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $10, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k6, %k2 +; AVX512DQ-NEXT: korw 
%k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $9, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $8, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $7, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $6, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $4, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; 
AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $3, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $14, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k1, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k5 -; AVX512DQ-NEXT: kxorw %k3, %k5, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: 
kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k6, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: 
kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $1, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: korw %k4, %k3, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k1, %k5, %k1 -; AVX512DQ-NEXT: kshiftrw $2, %k1, %k5 -; AVX512DQ-NEXT: kxorw %k4, %k5, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $15, %k7, %k7 +; AVX512DQ-NEXT: kshiftrw $15, %k7, %k7 +; AVX512DQ-NEXT: korw %k2, %k7, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $2, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $3, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $4, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $5, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k6, %k7 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $6, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, 
%k1 -; AVX512DQ-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $7, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k1, %k7 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $8, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k3, %k7 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $9, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k4, %k7 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $10, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k5, %k6 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQ-NEXT: korw %k6, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k6 +; AVX512DQ-NEXT: kshiftlw $11, %k6, %k6 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k6, %k1, %k5 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k5, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k5, %k5 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k5, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQ-NEXT: korw %k4, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kshiftlw $13, %k4, %k4 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k4, %k1, %k3 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kshiftlw $14, %k3, %k3 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k3, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQ-NEXT: korw %k1, %k2, %k1 ; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 ; 
AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: korw %k4, %k1, %k1 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k1, %k1 ; AVX512DQ-NEXT: kmovw %k1, 6(%rdi) -; AVX512DQ-NEXT: kmovw %k3, 4(%rdi) -; AVX512DQ-NEXT: kmovw %k2, 2(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: kmovw %k0, 4(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: retq ; diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll index d61e4e13df9c..7ee4e6674e0e 100644 --- a/test/CodeGen/X86/avx512-vselect.ll +++ b/test/CodeGen/X86/avx512-vselect.ll @@ -47,3 +47,174 @@ entry: %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b ret <16 x double> %ret } + +define <16 x i64> @test3(<16 x i8> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-SKX-LABEL: test3: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; CHECK-SKX-NEXT: vptestnmb %xmm5, %xmm5, %k1 +; CHECK-SKX-NEXT: vptestnmb %xmm0, %xmm0, %k2 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test3: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0 +; CHECK-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <16 x i8> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <16 x i64> @test4(<16 x i16> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-SKX-LABEL: test4: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm5 +; CHECK-SKX-NEXT: vptestnmw %xmm5, %xmm5, %k1 +; CHECK-SKX-NEXT: vptestnmw %xmm0, %xmm0, %k2 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test4: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqw %ymm5, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <16 x i16> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <16 x i64> @test5(<16 x i32> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-SKX-LABEL: test5: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1 +; CHECK-SKX-NEXT: vptestnmd %ymm0, %ymm0, %k2 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test5: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-KNL-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-KNL-NEXT: 
vptestnmd %zmm0, %zmm0, %k2 +; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <16 x i32> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <32 x i32> @test6(<32 x i8> %x, <32 x i32> %a, <32 x i32> %b) { +; CHECK-SKX-LABEL: test6: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm5 +; CHECK-SKX-NEXT: vptestnmb %xmm5, %xmm5, %k1 +; CHECK-SKX-NEXT: vptestnmb %xmm0, %xmm0, %k2 +; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test6: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm0, %xmm5 +; CHECK-KNL-NEXT: vpmovsxbd %xmm5, %zmm5 +; CHECK-KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 +; CHECK-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 +; CHECK-KNL-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-KNL-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <32 x i8> %x, zeroinitializer + %ret = select <32 x i1> %c, <32 x i32> %a, <32 x i32> %b + ret <32 x i32> %ret +} + +define <32 x i32> @test7(<32 x i16> %x, <32 x i32> %a, <32 x i32> %b) { +; CHECK-SKX-LABEL: test7: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-SKX-NEXT: vptestnmw %ymm5, %ymm5, %k1 +; CHECK-SKX-NEXT: vptestnmw %ymm0, %ymm0, %k2 +; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test7: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-KNL-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; CHECK-KNL-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm5 +; CHECK-KNL-NEXT: vpmovsxwd %ymm5, %zmm5 +; CHECK-KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 +; CHECK-KNL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 +; CHECK-KNL-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-KNL-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <32 x i16> %x, zeroinitializer + %ret = select <32 x i1> %c, <32 x i32> %a, <32 x i32> %b + ret <32 x i32> %ret +} + +define <64 x i16> @test8(<64 x i8> %x, <64 x i16> %a, <64 x i16> %b) { +; CHECK-SKX-LABEL: test8: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-SKX-NEXT: vptestnmb %ymm5, %ymm5, %k1 +; CHECK-SKX-NEXT: vptestnmb %ymm0, %ymm0, %k2 +; CHECK-SKX-NEXT: vpblendmw %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmw %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test8: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: pushq %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-KNL-NEXT: .cfi_offset %rbp, -16 +; CHECK-KNL-NEXT: movq %rsp, %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa_register %rbp +; CHECK-KNL-NEXT: andq $-32, %rsp +; CHECK-KNL-NEXT: subq $32, %rsp +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; CHECK-KNL-NEXT: vmovdqa 16(%rbp), %ymm9 +; CHECK-KNL-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; CHECK-KNL-NEXT: vpcmpeqb %ymm10, %ymm0, %ymm11 +; CHECK-KNL-NEXT: vpmovsxbw %xmm11, %ymm0 +; CHECK-KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm5, %ymm0 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm11, %xmm1 +; CHECK-KNL-NEXT: vpmovsxbw %xmm1, %ymm1 +; CHECK-KNL-NEXT: vpblendvb %ymm1, 
%ymm2, %ymm6, %ymm1 +; CHECK-KNL-NEXT: vpcmpeqb %ymm10, %ymm8, %ymm5 +; CHECK-KNL-NEXT: vpmovsxbw %xmm5, %ymm2 +; CHECK-KNL-NEXT: vpblendvb %ymm2, %ymm3, %ymm7, %ymm2 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm5, %xmm3 +; CHECK-KNL-NEXT: vpmovsxbw %xmm3, %ymm3 +; CHECK-KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm9, %ymm3 +; CHECK-KNL-NEXT: movq %rbp, %rsp +; CHECK-KNL-NEXT: popq %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-KNL-NEXT: retq + %c = icmp eq <64 x i8> %x, zeroinitializer + %ret = select <64 x i1> %c, <64 x i16> %a, <64 x i16> %b + ret <64 x i16> %ret +} diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 63402d801993..b645098582ff 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -2011,8 +2011,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpmullq %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x58,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_512: @@ -2030,9 +2029,8 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x ; X86-LABEL: test_mask_mullo_epi64_rmbk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x59,0x40,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2053,9 +2051,8 @@ define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_512: @@ -2172,8 +2169,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] -; X86-NEXT: vpmullq %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x38,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_256: @@ -2191,9 +2187,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x ; X86-LABEL: 
test_mask_mullo_epi64_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x39,0x40,0x08] ; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2214,9 +2209,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_256: @@ -2334,8 +2328,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] -; X86-NEXT: vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_128: @@ -2353,9 +2346,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x ; X86-LABEL: test_mask_mullo_epi64_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0x40,0x08] ; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2376,9 +2368,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_128: diff --git a/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll index 5b90bdb8311b..bf85814f3197 100644 --- 
a/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll @@ -199,8 +199,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xb5,0xc2] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: @@ -236,8 +235,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6 ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 # encoding: [0x62,0xf2,0xed,0x48,0xb5,0xc1] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: @@ -276,10 +274,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: @@ -319,10 +316,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: @@ -362,10 +358,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # 
encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: @@ -405,10 +400,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(< ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xed,0xc9,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll index 077269fde95c..6884666a296c 100644 --- a/test/CodeGen/X86/avx512ifma-intrinsics.ll +++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll @@ -219,8 +219,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xb5,0xc2] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: @@ -256,8 +255,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6 ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 # encoding: [0x62,0xf2,0xed,0x48,0xb5,0xc1] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: @@ -298,10 +296,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # 
encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: @@ -345,10 +342,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: @@ -392,10 +388,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: @@ -439,10 +434,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(< ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xed,0xc9,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 8684d1f568fd..233b9162c926 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ 
b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -3922,10 +3922,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 ; X86-LABEL: test_mask_andnot_epi64_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x08] ; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3946,10 +3945,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_andnot_epi64_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi64_rmbkz_128: @@ -4089,10 +4087,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 ; X86-LABEL: test_mask_andnot_epi64_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x08] ; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4113,10 +4110,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_andnot_epi64_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw 
%ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi64_rmbkz_256: diff --git a/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll b/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll index c4c70fcb2b5c..fe3662d49aa5 100644 --- a/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll +++ b/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll @@ -229,8 +229,7 @@ define void @test_mm256_2intersect_epi64_b(i64* nocapture readonly %a, i64* noca ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] ; X86-NEXT: vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06] -; X86-NEXT: vbroadcastsd (%edx), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x0a] -; X86-NEXT: vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1] +; X86-NEXT: vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] ; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2] @@ -535,9 +534,7 @@ define void @test_mm_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocaptu ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] ; X86-NEXT: vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06] ; X86-NEXT: # xmm0 = mem[0,0] -; X86-NEXT: vmovddup (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x0a] -; X86-NEXT: # xmm1 = mem[0,0] -; X86-NEXT: vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1] +; X86-NEXT: vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02] ; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] ; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2] diff --git a/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll b/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll index 3e37c7c5b6ac..7e7a46db75ed 100644 --- a/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll +++ b/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll @@ -199,8 +199,7 @@ define void @test_mm512_2intersect_epi64_b(i64* nocapture readonly %a, i64* noca ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] ; X86-NEXT: vbroadcastsd (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x02] -; X86-NEXT: vbroadcastsd (%ecx), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x09] -; X86-NEXT: vp2intersectq %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0xc1] +; X86-NEXT: vp2intersectq (%ecx){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] ; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10] diff --git a/test/CodeGen/X86/masked_store.ll b/test/CodeGen/X86/masked_store.ll index 8c7232cf950e..164826d87155 100644 --- a/test/CodeGen/X86/masked_store.ll +++ b/test/CodeGen/X86/masked_store.ll @@ -4913,24 +4913,30 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; 
AVX512F-LABEL: widen_masked_store: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: kmovw %edx, %k0 ; AVX512F-NEXT: andl $1, %esi -; AVX512F-NEXT: kmovw %esi, %k1 -; AVX512F-NEXT: kxorw %k0, %k0, %k2 -; AVX512F-NEXT: kshiftrw $1, %k2, %k2 -; AVX512F-NEXT: kshiftlw $1, %k2, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k2 -; AVX512F-NEXT: kxorw %k0, %k2, %k0 +; AVX512F-NEXT: kmovw %esi, %k0 +; AVX512F-NEXT: kxorw %k0, %k0, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kshiftlw $2, %k1, %k1 ; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-NEXT: kshiftlw $3, %k1, %k1 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kxorw %k0, %k1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: kxorw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 -; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: korw %k0, %k1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} @@ -4939,48 +4945,60 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; ; AVX512VLDQ-LABEL: widen_masked_store: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: kmovw %edx, %k0 -; AVX512VLDQ-NEXT: kmovw %esi, %k1 -; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k2 -; AVX512VLDQ-NEXT: kshiftrb $1, %k2, %k2 -; AVX512VLDQ-NEXT: kshiftlb $1, %k2, %k2 -; AVX512VLDQ-NEXT: korb %k1, %k2, %k1 -; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k2 -; AVX512VLDQ-NEXT: kxorb %k0, %k2, %k0 +; AVX512VLDQ-NEXT: kmovw %esi, %k0 ; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 -; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 -; AVX512VLDQ-NEXT: kxorb %k0, %k1, %k0 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k1 +; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $1, %k1, %k1 +; AVX512VLDQ-NEXT: korb %k0, %k1, %k0 ; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %ecx, %k2 -; AVX512VLDQ-NEXT: kxorb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %edx, %k2 +; AVX512VLDQ-NEXT: kshiftlb $7, %k2, %k2 +; AVX512VLDQ-NEXT: kshiftrb $6, %k2, %k2 +; AVX512VLDQ-NEXT: korb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: korb %k1, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k1 +; AVX512VLDQ-NEXT: kshiftlb $3, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 +; AVX512VLDQ-NEXT: korw %k1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %ecx, %k1 ; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1 -; AVX512VLDQ-NEXT: kxorw %k1, %k0, %k1 +; AVX512VLDQ-NEXT: korw %k0, %k1, %k1 ; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: widen_masked_store: ; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: kmovd 
%edx, %k0 ; AVX512VLBW-NEXT: andl $1, %esi -; AVX512VLBW-NEXT: kmovw %esi, %k1 -; AVX512VLBW-NEXT: kxorw %k0, %k0, %k2 -; AVX512VLBW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512VLBW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512VLBW-NEXT: korw %k1, %k2, %k1 -; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k2 -; AVX512VLBW-NEXT: kxorw %k0, %k2, %k0 +; AVX512VLBW-NEXT: kmovw %esi, %k0 +; AVX512VLBW-NEXT: kxorw %k0, %k0, %k1 +; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512VLBW-NEXT: korw %k0, %k1, %k0 +; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLBW-NEXT: kshiftlw $2, %k1, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLBW-NEXT: kmovd %edx, %k2 +; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512VLBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512VLBW-NEXT: korw %k2, %k1, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $3, %k0, %k1 +; AVX512VLBW-NEXT: kshiftlw $3, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512VLBW-NEXT: kxorw %k0, %k1, %k0 -; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 -; AVX512VLBW-NEXT: kmovd %ecx, %k2 -; AVX512VLBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; AVX512VLBW-NEXT: kmovd %ecx, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512VLBW-NEXT: kxorw %k1, %k0, %k1 +; AVX512VLBW-NEXT: korw %k0, %k1, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: retq call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask) diff --git a/test/CodeGen/X86/masked_store_trunc_ssat.ll b/test/CodeGen/X86/masked_store_trunc_ssat.ll index 777d4d14e4e4..af413e85be36 100644 --- a/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -1719,10 +1719,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper @@ -1744,10 +1742,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -2027,10 +2023,8 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512F-NEXT: # kill: def $xmm1 
killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -2071,10 +2065,8 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -2361,10 +2353,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -2405,10 +2395,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper diff --git a/test/CodeGen/X86/masked_store_trunc_usat.ll b/test/CodeGen/X86/masked_store_trunc_usat.ll index 254f0cda48fc..1f47ed5d156d 100644 --- a/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -1465,8 +1465,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, 
%zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper @@ -1487,8 +1486,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -1734,8 +1732,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -1776,8 +1773,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -2028,8 +2024,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -2070,8 +2065,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll index b69525deb41e..46e73c1f8542 100644 --- a/test/CodeGen/X86/min-legal-vector-width.ll +++ b/test/CodeGen/X86/min-legal-vector-width.ll @@ -828,6 +828,30 @@ define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector- ret <16 x i8> %b } +define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { +; CHECK-AVX512-LABEL: trunc_v8i64_v8i8: +; CHECK-AVX512: # %bb.0: +; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-AVX512-NEXT: vpmovqb %ymm1, %xmm1 +; CHECK-AVX512-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-AVX512-NEXT: vzeroupper +; 
CHECK-AVX512-NEXT: retq +; +; CHECK-VBMI-LABEL: trunc_v8i64_v8i8: +; CHECK-VBMI: # %bb.0: +; CHECK-VBMI-NEXT: vmovdqa (%rdi), %ymm1 +; CHECK-VBMI-NEXT: vpbroadcastq {{.*#+}} ymm0 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224] +; CHECK-VBMI-NEXT: vpermi2b 32(%rdi), %ymm1, %ymm0 +; CHECK-VBMI-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; CHECK-VBMI-NEXT: vzeroupper +; CHECK-VBMI-NEXT: retq + %a = load <8 x i64>, <8 x i64>* %x + %b = trunc <8 x i64> %a to <8 x i8> + ret <8 x i8> %b +} + define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { ; CHECK-LABEL: trunc_v8i64_v8i16: ; CHECK: # %bb.0: @@ -982,3 +1006,95 @@ define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal store <16 x i64> %a, <16 x i64>* %y ret void } + +define void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v8i16_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <8 x i64>, <8 x i64>* %p + %y = load <8 x i64>, <8 x i64>* %q + %a = icmp eq <8 x i16> %s, %t + %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y + store <8 x i64> %b, <8 x i64>* %r + ret void +} + +define void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v8i32_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <8 x i64>, <8 x i64>* %p + %y = load <8 x i64>, <8 x i64>* %q + %a = icmp eq <8 x i32> %s, %t + %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y + store <8 x i64> %b, <8 x i64>* %r + ret void +} + +define void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v16i8_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <16 x i32>, <16 x i32>* %p + %y = load <16 x i32>, <16 x i32>* %q + %a = icmp eq <16 x i8> %s, %t + %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y + store <16 x i32> %b, <16 x i32>* %r + ret void +} + +define void 
@vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v16i16_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <16 x i32>, <16 x i32>* %p + %y = load <16 x i32>, <16 x i32>* %q + %a = icmp eq <16 x i16> %s, %t + %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y + store <16 x i32> %b, <16 x i32>* %r + ret void +} diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll index ebb3b623c467..720cabee9122 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -549,20 +549,13 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1 +; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: @@ -585,20 +578,13 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: diff --git a/test/CodeGen/X86/vec_smulo.ll b/test/CodeGen/X86/vec_smulo.ll index b809e55ce5e2..436e48729f9f 100644 --- a/test/CodeGen/X86/vec_smulo.ll +++ b/test/CodeGen/X86/vec_smulo.ll @@ -1730,20 +1730,26 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: smulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm1, %rdx +; AVX512-NEXT: vmovq %xmm0, %rsi ; AVX512-NEXT: imulq %rdx, %rsi -; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: seto %dl ; AVX512-NEXT: imulq %rax, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %edx, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -2197,46 +2203,76 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: smulo_v4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 ; AVX512-NEXT: kmovd %k1, %r9d ; AVX512-NEXT: andb $1, %r9b ; AVX512-NEXT: negb %r9b -; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: negb %r10b ; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k1, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl -; AVX512-NEXT: kshiftrw $2, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: andb $1, %r11b +; AVX512-NEXT: negb %r11b +; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: kshiftrw $1, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: negb %sil -; AVX512-NEXT: kmovd %k1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: negb %al +; AVX512-NEXT: kshiftrw $1, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %edx ; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: negb %dl +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: andb $1, %al +; AVX512-NEXT: negb %al +; AVX512-NEXT: kmovd %k0, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: imulb %dl +; AVX512-NEXT: imulb %cl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %r8d, %edx -; AVX512-NEXT: andb $1, %dl -; 
AVX512-NEXT: negb %dl -; AVX512-NEXT: cmpb %r8b, %dl -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %al, %dl +; AVX512-NEXT: movl %r8d, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: cmpb %r8b, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: imulb %cl +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k0 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: imulb %sil +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: seto %al +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: cmpb %dl, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kshiftlw $1, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k1, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k0, %k2 +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: imulb %bl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: movl %esi, %ecx @@ -2246,26 +2282,22 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k2 -; AVX512-NEXT: kshiftrw $2, %k2, %k3 -; AVX512-NEXT: kxorw %k1, %k3, %k1 -; AVX512-NEXT: kshiftlw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kmovd %eax, %k3 +; AVX512-NEXT: kshiftlw $2, %k3, %k3 +; AVX512-NEXT: korw %k3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: kshiftlw $13, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: imulb %r10b +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: imulb %r9b ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl -; AVX512-NEXT: cmpb %al, %dl -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %cl, %dl +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: cmpb %al, %bl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %cl, %bl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k2 ; AVX512-NEXT: kshiftlw $3, %k2, %k2 @@ -2273,21 +2305,34 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $14, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: 
kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kshiftlw $4, %k1, %k1 +; AVX512-NEXT: kshiftlw $13, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $12, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/test/CodeGen/X86/vec_umulo.ll b/test/CodeGen/X86/vec_umulo.ll index 07899d0dddf6..c859ce7b74bb 100644 --- a/test/CodeGen/X86/vec_umulo.ll +++ b/test/CodeGen/X86/vec_umulo.ll @@ -1532,21 +1532,28 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: umulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovq %xmm1, %rdx ; AVX512-NEXT: mulq %rdx -; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: seto %r9b ; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: mulq %rsi -; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: mulq %r8 +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %r9d, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -1945,6 +1952,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: umulo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 @@ -1956,40 +1964,60 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: andb $1, %r11b +; AVX512-NEXT: kshiftrw $2, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: kshiftrw $1, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: kshiftrw $1, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil -; AVX512-NEXT: kshiftrw $2, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: andb $1, %al ; AVX512-NEXT: kmovd %k1, %ecx ; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: kmovd %k0, %edx -; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: # kill: def $al killed $al 
killed $eax -; AVX512-NEXT: mulb %dl +; AVX512-NEXT: mulb %cl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %r8b -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %al, %dl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: mulb %cl +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k0 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: mulb %sil +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: seto %al +; AVX512-NEXT: testb $-2, %dl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kshiftlw $1, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k1, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k0, %k2 +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: mulb %bl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %sil ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k2 -; AVX512-NEXT: kshiftrw $2, %k2, %k3 -; AVX512-NEXT: kxorw %k1, %k3, %k1 -; AVX512-NEXT: kshiftlw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kmovd %eax, %k3 +; AVX512-NEXT: kshiftlw $2, %k3, %k3 +; AVX512-NEXT: korw %k3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: kshiftlw $13, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 ; AVX512-NEXT: movl %r9d, %eax @@ -1997,8 +2025,8 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl ; AVX512-NEXT: testb $-2, %al -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %cl, %dl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %cl, %bl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k2 ; AVX512-NEXT: kshiftlw $3, %k2, %k2 @@ -2006,21 +2034,34 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $14, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kshiftlw $4, %k1, %k1 +; AVX512-NEXT: kshiftlw $13, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: 
kshiftrw $12, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll index a0306dc1cd4c..a5fa41c521f8 100644 --- a/test/CodeGen/X86/vector-trunc-packus.ll +++ b/test/CodeGen/X86/vector-trunc-packus.ll @@ -223,8 +223,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v4i64_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 @@ -243,8 +242,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_packus_v4i64_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll index bb734bb8e329..a298382cfb01 100644 --- a/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/test/CodeGen/X86/vector-trunc-ssat.ll @@ -233,10 +233,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_ssat_v4i64_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -251,10 +249,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper diff --git a/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll b/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll new file mode 100644 index 000000000000..63de8663e077 --- 
/dev/null +++ b/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll @@ -0,0 +1,38 @@ +; Make sure MSan handles llvm.launder.invariant.group correctly. + +; RUN: opt < %s -msan -msan-kernel=1 -O1 -S | FileCheck -check-prefixes=CHECK %s +; RUN: opt < %s -msan -O1 -S | FileCheck -check-prefixes=CHECK %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%class.Foo = type { i32 (...)** } +@flag = dso_local local_unnamed_addr global i8 0, align 1 + +define dso_local %class.Foo* @_Z1fv() local_unnamed_addr #0 { +entry: + %p = alloca i8*, align 8 + %0 = bitcast i8** %p to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) + %1 = load i8, i8* @flag, align 1 + %tobool = icmp ne i8 %1, 0 + %call = call zeroext i1 @_Z2f1PPvb(i8** nonnull %p, i1 zeroext %tobool) + %2 = load i8*, i8** %p, align 8 + %3 = call i8* @llvm.launder.invariant.group.p0i8(i8* %2) + %4 = bitcast i8* %3 to %class.Foo* + %retval.0 = select i1 %call, %class.Foo* %4, %class.Foo* null + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) + ret %class.Foo* %retval.0 +} + +; CHECK-NOT: call void @__msan_warning_noreturn + +declare dso_local zeroext i1 @_Z2f1PPvb(i8**, i1 zeroext) local_unnamed_addr + +declare i8* @llvm.launder.invariant.group.p0i8(i8*) + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +attributes #0 = { sanitize_memory uwtable } diff --git a/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll b/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll new file mode 100644 index 000000000000..f3b5c0d722c8 --- /dev/null +++ b/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll @@ -0,0 +1,21 @@ +; Make sure MSan handles llvm.strip.invariant.group correctly.
+ +; RUN: opt < %s -msan -msan-kernel=1 -O1 -S | FileCheck -check-prefixes=CHECK %s +; RUN: opt < %s -msan -O1 -S | FileCheck -check-prefixes=CHECK %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@flag = dso_local local_unnamed_addr global i8 0, align 1 + +define dso_local i8* @f(i8* %x) local_unnamed_addr #0 { +entry: + %0 = call i8* @llvm.strip.invariant.group.p0i8(i8* %x) + ret i8* %0 +} + +; CHECK-NOT: call void @__msan_warning_noreturn + +declare i8* @llvm.strip.invariant.group.p0i8(i8*) + +attributes #0 = { sanitize_memory uwtable } diff --git a/test/MC/Mips/crc/module-crc.s b/test/MC/Mips/crc/module-crc.s index 92c428e67ff9..66c54647cf44 100644 --- a/test/MC/Mips/crc/module-crc.s +++ b/test/MC/Mips/crc/module-crc.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r6 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module crc diff --git a/test/MC/Mips/crc/module-nocrc.s b/test/MC/Mips/crc/module-nocrc.s index c67279194c8e..193ed360b574 100644 --- a/test/MC/Mips/crc/module-nocrc.s +++ b/test/MC/Mips/crc/module-nocrc.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r6 -filetype=obj -o - -mattr=+crc | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module nocrc diff --git a/test/MC/Mips/ginv/module-ginv.s b/test/MC/Mips/ginv/module-ginv.s index 07f1bc4d40e9..8adcd90b23f7 100644 --- a/test/MC/Mips/ginv/module-ginv.s +++ b/test/MC/Mips/ginv/module-ginv.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r6 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module ginv diff --git a/test/MC/Mips/ginv/module-noginv.s b/test/MC/Mips/ginv/module-noginv.s index 2ed4fd9c314b..611d72c52d56 100644 --- a/test/MC/Mips/ginv/module-noginv.s +++ b/test/MC/Mips/ginv/module-noginv.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r6 -filetype=obj -o - -mattr=+ginv | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module noginv diff --git a/test/MC/Mips/micromips-ase-directive.s b/test/MC/Mips/micromips-ase-directive.s index f3ac60057dc5..fef40ecc3eeb 100644 --- a/test/MC/Mips/micromips-ase-directive.s +++ b/test/MC/Mips/micromips-ase-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=mips-unknown-linux -filetype=obj %s -o - | \ -# RUN: llvm-readobj --mips-abi-flags | \ +# RUN: llvm-readobj -A | \ # RUN: FileCheck --check-prefix=ASE-MICROMIPS %s .set micromips diff --git a/test/MC/Mips/micromips32r6/relocations.s b/test/MC/Mips/micromips32r6/relocations.s index 7e8f3f6107e6..615b445a0faa 100644 --- a/test/MC/Mips/micromips32r6/relocations.s +++ b/test/MC/Mips/micromips32r6/relocations.s @@ -26,6 +26,12 @@ # CHECK-FIXUP: bnezc $3, bar # encoding: [0xa0,0b011AAAAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar-4, kind: fixup_MICROMIPS_PC21_S1 +# CHECK-FIXUP: jialc $5, bar # encoding: [0x80,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_MICROMIPS_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: 
[0xa0,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_MICROMIPS_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. #------------------------------------------------------------------------------ @@ -37,6 +43,8 @@ # CHECK-ELF: 0x10 R_MICROMIPS_PC19_S2 bar 0x0 # CHECK-ELF: 0x14 R_MICROMIPS_PC21_S1 bar 0x0 # CHECK-ELF: 0x18 R_MICROMIPS_PC21_S1 bar 0x0 +# CHECK-ELF: 0x1C R_MICROMIPS_LO16 bar 0x0 +# CHECK-ELF: 0x20 R_MICROMIPS_LO16 bar 0x0 # CHECK-ELF: ] balc bar @@ -46,3 +54,5 @@ lwpc $2,bar beqzc $3, bar bnezc $3, bar + jialc $5, bar + jic $5, bar diff --git a/test/MC/Mips/mips32r6/relocations.s b/test/MC/Mips/mips32r6/relocations.s index 3f42ee8f4717..8095fb156ec9 100644 --- a/test/MC/Mips/mips32r6/relocations.s +++ b/test/MC/Mips/mips32r6/relocations.s @@ -40,6 +40,12 @@ # CHECK-FIXUP: lwpc $2, bar # encoding: [0xec,0b01001AAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2 +# CHECK-FIXUP: jialc $5, bar # encoding: [0xf8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: [0xd8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. #------------------------------------------------------------------------------ @@ -55,6 +61,8 @@ # CHECK-ELF: 0x20 R_MIPS_PCLO16 bar 0x0 # CHECK-ELF: 0x24 R_MIPS_PC19_S2 bar 0x0 # CHECK-ELF: 0x28 R_MIPS_PC19_S2 bar 0x0 +# CHECK-ELF: 0x2C R_MIPS_LO16 bar 0x0 +# CHECK-ELF: 0x30 R_MIPS_LO16 bar 0x0 # CHECK-ELF: ] addiupc $2,bar @@ -68,3 +76,5 @@ addiu $2, $2, %pcrel_lo(bar) lapc $2,bar lwpc $2,bar + jialc $5, bar + jic $5, bar diff --git a/test/MC/Mips/mips64r6/relocations.s b/test/MC/Mips/mips64r6/relocations.s index 4f4efda07c69..5e70f44b96e1 100644 --- a/test/MC/Mips/mips64r6/relocations.s +++ b/test/MC/Mips/mips64r6/relocations.s @@ -47,6 +47,12 @@ # CHECK-FIXUP: lwupc $2, bar # encoding: [0xec,0b01010AAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2 +# CHECK-FIXUP: jialc $5, bar # encoding: [0xf8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: [0xd8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. 
#------------------------------------------------------------------------------ @@ -64,6 +70,8 @@ # CHECK-ELF: 0x28 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: 0x2C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: 0x30 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x34 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 +# CHECK-ELF: 0x38 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0 # CHECK-ELF: ] addiupc $2,bar @@ -79,3 +87,5 @@ ldpc $2,bar lwpc $2,bar lwupc $2,bar + jialc $5, bar + jic $5, bar diff --git a/test/MC/Mips/mips_abi_flags_xx.s b/test/MC/Mips/mips_abi_flags_xx.s index 94101ae0c8f5..f8386b49774f 100644 --- a/test/MC/Mips/mips_abi_flags_xx.s +++ b/test/MC/Mips/mips_abi_flags_xx.s @@ -2,19 +2,19 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R1,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mattr=fpxx -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R1,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mcpu=mips32r6 -mattr=fpxx -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R6,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips64-unknown-linux-gnu -mcpu=octeon -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-64R2,CHECK-OBJ-OCTEON # CHECK-ASM: .module fp=xx diff --git a/test/MC/Mips/mips_abi_flags_xx_set.s b/test/MC/Mips/mips_abi_flags_xx_set.s index f2445eba7774..8e4e2dbcf534 100644 --- a/test/MC/Mips/mips_abi_flags_xx_set.s +++ b/test/MC/Mips/mips_abi_flags_xx_set.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module fp=xx diff --git a/test/MC/Mips/module-hardfloat.s b/test/MC/Mips/module-hardfloat.s index f29fbc09353c..5738a09a91b9 100644 --- a/test/MC/Mips/module-hardfloat.s +++ b/test/MC/Mips/module-hardfloat.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module hardfloat diff --git a/test/MC/Mips/module-softfloat.s b/test/MC/Mips/module-softfloat.s index 77e62e38e201..94ab7be63dcc 100644 --- a/test/MC/Mips/module-softfloat.s +++ b/test/MC/Mips/module-softfloat.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj 
-o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module softfloat diff --git a/test/MC/Mips/mt/abiflag.s b/test/MC/Mips/mt/abiflag.s index 2d03c5d1106c..d067c55587c9 100644 --- a/test/MC/Mips/mt/abiflag.s +++ b/test/MC/Mips/mt/abiflag.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -mattr=+mt -filetype=obj -o - \ -# RUN: | llvm-readobj --mips-abi-flags | FileCheck %s +# RUN: | llvm-readobj -A | FileCheck %s # Test that the usage of the MT ASE is recorded in .MIPS.abiflags diff --git a/test/MC/Mips/mt/module-directive.s b/test/MC/Mips/mt/module-directive.s index 0d9ab97b4550..1bbe91147545 100644 --- a/test/MC/Mips/mt/module-directive.s +++ b/test/MC/Mips/mt/module-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags | FileCheck --check-prefix=CHECK-OBJ %s +# RUN: llvm-readobj -A | FileCheck --check-prefix=CHECK-OBJ %s # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \ # RUN: FileCheck --check-prefix=CHECK-ASM %s diff --git a/test/MC/Mips/mt/set-directive.s b/test/MC/Mips/mt/set-directive.s index 9088655d8c5d..5d18486059d4 100644 --- a/test/MC/Mips/mt/set-directive.s +++ b/test/MC/Mips/mt/set-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags | FileCheck %s --check-prefix=CHECK-OBJ +# RUN: llvm-readobj -A | FileCheck %s --check-prefix=CHECK-OBJ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \ # RUN: FileCheck %s --check-prefix=CHECK-ASM diff --git a/test/MC/Mips/virt/module-novirt.s b/test/MC/Mips/virt/module-novirt.s index 0f531dbbc80b..6b953d0c5857 100644 --- a/test/MC/Mips/virt/module-novirt.s +++ b/test/MC/Mips/virt/module-novirt.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r5 -filetype=obj -o - -mattr=+virt | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module novirt diff --git a/test/MC/Mips/virt/module-virt.s b/test/MC/Mips/virt/module-virt.s index ae38b83d8486..1fb035df8783 100644 --- a/test/MC/Mips/virt/module-virt.s +++ b/test/MC/Mips/virt/module-virt.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r5 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module virt diff --git a/test/MC/RISCV/rvf-aliases-valid.s b/test/MC/RISCV/rvf-aliases-valid.s index 725dbe6d6a2c..0d8179ff31f9 100644 --- a/test/MC/RISCV/rvf-aliases-valid.s +++ b/test/MC/RISCV/rvf-aliases-valid.s @@ -55,6 +55,18 @@ fscsr x6, x7 # CHECK-ALIAS: fscsr t3 fscsr x28 +# These are obsolete aliases of frcsr/fscsr. They are accepted by the assembler +# but the disassembler should always print them as the equivalent, new aliases. 
+# CHECK-INST: csrrs t4, fcsr, zero +# CHECK-ALIAS: frcsr t4 +frsr x29 +# CHECK-INST: csrrw t5, fcsr, t6 +# CHECK-ALIAS: fscsr t5, t6 +fssr x30, x31 +# CHECK-INST: csrrw zero, fcsr, s0 +# CHECK-ALIAS: fscsr s0 +fssr x8 + # CHECK-INST: csrrs t4, frm, zero # CHECK-ALIAS: frrm t4 frrm x29 diff --git a/test/Object/Mips/abi-flags.yaml b/test/Object/Mips/abi-flags.yaml index b5142fd3303a..ce8234a9a0db 100644 --- a/test/Object/Mips/abi-flags.yaml +++ b/test/Object/Mips/abi-flags.yaml @@ -1,5 +1,5 @@ # RUN: yaml2obj %s > %t -# RUN: llvm-readobj --mips-abi-flags %t | FileCheck -check-prefix=OBJ %s +# RUN: llvm-readobj -A %t | FileCheck -check-prefix=OBJ %s # RUN: obj2yaml %t | FileCheck -check-prefix=YAML %s # OBJ: MIPS ABI Flags { diff --git a/test/Transforms/InstCombine/bcopy.ll b/test/Transforms/InstCombine/bcopy.ll new file mode 100644 index 000000000000..6a53bad7eeb0 --- /dev/null +++ b/test/Transforms/InstCombine/bcopy.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare void @bcopy(i8* nocapture readonly, i8* nocapture, i32) + +define void @bcopy_memmove(i8* nocapture readonly %a, i8* nocapture %b) { +; CHECK-LABEL: @bcopy_memmove( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[A:%.*]] to i64* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B:%.*]] to i64* +; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1 +; CHECK-NEXT: store i64 [[TMP3]], i64* [[TMP2]], align 1 +; CHECK-NEXT: ret void +; + tail call void @bcopy(i8* %a, i8* %b, i32 8) + ret void +} + +define void @bcopy_memmove2(i8* nocapture readonly %a, i8* nocapture %b, i32 %len) { +; CHECK-LABEL: @bcopy_memmove2( +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i32(i8* align 1 [[B:%.*]], i8* align 1 [[A:%.*]], i32 [[LEN:%.*]], i1 false) +; CHECK-NEXT: ret void +; + tail call void @bcopy(i8* %a, i8* %b, i32 %len) + ret void +} diff --git a/test/Transforms/InstCombine/sub-of-negatible.ll b/test/Transforms/InstCombine/sub-of-negatible.ll new file mode 100644 index 000000000000..2d9910352683 --- /dev/null +++ b/test/Transforms/InstCombine/sub-of-negatible.ll @@ -0,0 +1,292 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -instcombine -S | FileCheck %s + +declare void @use8(i8) + +; Constant can be freely negated. 
+define i8 @t0(i8 %x) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: [[T0:%.*]] = add i8 [[X:%.*]], 42 +; CHECK-NEXT: ret i8 [[T0]] +; + %t0 = sub i8 %x, -42 + ret i8 %t0 +} + +; Negation can be negated for free +define i8 @t1(i8 %x, i8 %y) { +; CHECK-LABEL: @t1( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = add i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} + +; Shift-left can be negated if all uses can be updated +define i8 @t2(i8 %x, i8 %y) { +; CHECK-LABEL: @t2( +; CHECK-NEXT: [[T0:%.*]] = shl i8 -42, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = shl i8 -42, %y + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n2(i8 %x, i8 %y) { +; CHECK-LABEL: @n2( +; CHECK-NEXT: [[T0:%.*]] = shl i8 -42, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = shl i8 -42, %y + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @t3(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t3( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = shl i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = shl i8 %t0, %y + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n3(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n3( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = shl i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = shl i8 %t0, %y + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Select can be negated if all it's operands can be negated and all the users of select can be updated +define i8 @t4(i8 %x, i1 %y) { +; CHECK-LABEL: @t4( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 44 +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 44 + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n4(i8 %x, i1 %y) { +; CHECK-LABEL: @n4( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 44 +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 44 + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n5(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @n5( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 [[Z:%.*]] +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 %z + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @t6(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @t6( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = select i1 %y, i8 -42, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @t7(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @t7( +; CHECK-NEXT: [[T0:%.*]] = shl i8 1, [[Z:%.*]] +; 
CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 0, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = shl i8 1, %z + %t1 = select i1 %y, i8 0, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n8(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @n8( +; CHECK-NEXT: [[T0:%.*]] = shl i8 1, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 0, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = shl i8 1, %z + call void @use8(i8 %t0) + %t1 = select i1 %y, i8 0, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Subtraction can be negated if the first operand can be negated +; x - (y - z) -> x - y + z -> x + (-y) + z +define i8 @t9(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t9( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T11:%.*]] = add i8 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[T11]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %t0, %y + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n10(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n10( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %t0, %y + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n11(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n11( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = add i8 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %y, %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Addition can be negated if both operands can be negated +; x - (y + z) -> x - y - z -> x + ((-y) + (-z))) +define i8 @t12(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t12( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T3]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 0, %z + call void @use8(i8 %t1) + %t2 = add i8 %t0, %t1 + %t3 = sub i8 %x, %t2 + ret i8 %t3 +} +define i8 @n13(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n13( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T11:%.*]] = sub i8 [[Y]], [[Z:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[T11]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = add i8 %t0, %z + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n14(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n14( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 0, [[TMP1]] +; CHECK-NEXT: call void @use8(i8 [[T2]]) +; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T3]] +; + %t0 = 
sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 0, %z + call void @use8(i8 %t1) + %t2 = add i8 %t0, %t1 + call void @use8(i8 %t2) + %t3 = sub i8 %x, %t2 + ret i8 %t3 +} + +; Multiplication can be negated if either one of operands can be negated +; x - (y * z) -> x + ((-y) * z) or x + ((-z) * y) +define i8 @t15(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t15( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[TMP1:%.*]] = mul i8 [[Z:%.*]], [[Y]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = mul i8 %t0, %z + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n16(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n16( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = mul i8 [[T0]], [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = mul i8 %t0, %z + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} diff --git a/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll b/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll new file mode 100644 index 000000000000..a5f38735a373 --- /dev/null +++ b/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll @@ -0,0 +1,575 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -instcombine -S | FileCheck %s + +declare void @use16(i16) +declare void @use32(i32) +declare void @use64(i64) + +define i32 @t0(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t0_zext_of_nbits(i64 %data, i8 %nbits_narrow) { +; CHECK-LABEL: @t0_zext_of_nbits( +; CHECK-NEXT: [[NBITS:%.*]] = zext i8 [[NBITS_NARROW:%.*]] to i16 +; CHECK-NEXT: call void @use16(i16 [[NBITS]]) +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub nsw i16 64, [[NBITS]] +; CHECK-NEXT: call void @use16(i16 [[SKIP_HIGH]]) +; 
CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i16 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW:%.*]] = sub nsw i16 32, [[NBITS]] +; CHECK-NEXT: call void @use16(i16 [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = zext i16 [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW]] to i32 +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %nbits = zext i8 %nbits_narrow to i16 + call void @use16(i16 %nbits) + %skip_high = sub i16 64, %nbits + call void @use16(i16 %skip_high) + %skip_high_wide = zext i16 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow_narrow = sub i16 32, %nbits + call void @use16(i16 %num_high_bits_to_smear_narrow_narrow) + %num_high_bits_to_smear_narrow = zext i16 %num_high_bits_to_smear_narrow_narrow to i32 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t0_exact(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t0_exact( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr exact i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr exact i64 %data, %skip_high_wide ; We can preserve `exact`-ness of the original shift. 
+ call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @t1_redundant_sext(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t1_redundant_sext( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED_WITH_SIGNEXTENSION:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED_WITH_SIGNEXTENSION]]) +; CHECK-NEXT: [[EXTRACTED_WITH_SIGNEXTENSION_NARROW:%.*]] = trunc i64 [[EXTRACTED_WITH_SIGNEXTENSION]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: ret i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted_with_signextension = ashr i64 %data, %skip_high_wide + call void @use64(i64 %extracted_with_signextension) + %extracted_with_signextension_narrow = trunc i64 %extracted_with_signextension to i32 ; this is already the answer. + call void @use32(i32 %extracted_with_signextension_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_with_signextension_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i64 @t2_notrunc(i64 %data, i64 %nbits) { +; CHECK-LABEL: @t2_notrunc( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i64 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR:%.*]] = sub i64 64, [[NBITS]] +; CHECK-NEXT: call void @use64(i64 [[NUM_HIGH_BITS_TO_SMEAR]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i64 [[EXTRACTED]], [[NUM_HIGH_BITS_TO_SMEAR]] +; CHECK-NEXT: call void @use64(i64 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH]] +; CHECK-NEXT: ret i64 [[SIGNEXTENDED]] +; + %skip_high = sub i64 64, %nbits + call void @use64(i64 %skip_high) + %extracted = lshr i64 %data, %skip_high + call void @use64(i64 %extracted) + %num_high_bits_to_smear = sub i64 64, %nbits + call void @use64(i64 %num_high_bits_to_smear) + %signbit_positioned = shl i64 %extracted, %num_high_bits_to_smear ; + call void @use64(i64 %signbit_positioned) + %signextended = ashr i64 %signbit_positioned, %num_high_bits_to_smear ; can just shift %data itself. 
+ ret i64 %signextended +} + +define i64 @t3_notrunc_redundant_sext(i64 %data, i64 %nbits) { +; CHECK-LABEL: @t3_notrunc_redundant_sext( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i64 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR:%.*]] = sub i64 64, [[NBITS]] +; CHECK-NEXT: call void @use64(i64 [[NUM_HIGH_BITS_TO_SMEAR]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i64 [[EXTRACTED]], [[NUM_HIGH_BITS_TO_SMEAR]] +; CHECK-NEXT: call void @use64(i64 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: ret i64 [[EXTRACTED]] +; + %skip_high = sub i64 64, %nbits + call void @use64(i64 %skip_high) + %extracted = ashr i64 %data, %skip_high ; this is already the answer. + call void @use64(i64 %extracted) + %num_high_bits_to_smear = sub i64 64, %nbits + call void @use64(i64 %num_high_bits_to_smear) + %signbit_positioned = shl i64 %extracted, %num_high_bits_to_smear + call void @use64(i64 %signbit_positioned) + %signextended = ashr i64 %signbit_positioned, %num_high_bits_to_smear + ret i64 %signextended +} + +define <2 x i32> @t4_vec(<2 x i64> %data, <2 x i32> %nbits) { +; CHECK-LABEL: @t4_vec( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <2 x i32> , [[NBITS:%.*]] +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext <2 x i32> [[SKIP_HIGH]] to <2 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32> +; CHECK-NEXT: ret <2 x i32> [[SIGNEXTENDED]] +; + %skip_high = sub <2 x i32> , %nbits + %skip_high_wide = zext <2 x i32> %skip_high to <2 x i64> + %extracted = lshr <2 x i64> %data, %skip_high_wide + %extracted_narrow = trunc <2 x i64> %extracted to <2 x i32> + %num_high_bits_to_smear_narrow = sub <2 x i32> , %nbits + %signbit_positioned = shl <2 x i32> %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr <2 x i32> %signbit_positioned, %num_high_bits_to_smear_narrow + ret <2 x i32> %signextended +} + +define <3 x i32> @t5_vec_undef(<3 x i64> %data, <3 x i32> %nbits) { +; CHECK-LABEL: @t5_vec_undef( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <3 x i32> , [[NBITS:%.*]] +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext <3 x i32> [[SKIP_HIGH]] to <3 x i64> +; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc <3 x i64> [[TMP1]] to <3 x i32> +; CHECK-NEXT: ret <3 x i32> [[SIGNEXTENDED]] +; + %skip_high = sub <3 x i32> , %nbits + %skip_high_wide = zext <3 x i32> %skip_high to <3 x i64> + %extracted = lshr <3 x i64> %data, %skip_high_wide + %extracted_narrow = trunc <3 x i64> %extracted to <3 x i32> + %num_high_bits_to_smear_narrow0 = sub <3 x i32> , %nbits + %num_high_bits_to_smear_narrow1 = sub <3 x i32> , %nbits + %signbit_positioned = shl <3 x i32> %extracted_narrow, %num_high_bits_to_smear_narrow0 + %signextended = ashr <3 x i32> %signbit_positioned, %num_high_bits_to_smear_narrow1 + ret <3 x i32> %signextended +} + +; Extra-uses +define i32 @t6_extrause_good0(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t6_extrause_good0( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 
[[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow ; will go away + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t7_extrause_good1(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t7_extrause_good1( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW0:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow0 = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow0) + %num_high_bits_to_smear_narrow1 = sub i32 32, %nbits ; will go away. 
+ %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow0 + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow1 + ret i32 %signextended +} +define i32 @n8_extrause_bad(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n8_extrause_bad( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow ; neither of operands will go away. 
+ ret i32 %signextended +} + +; Negative tests +define i32 @n9(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n9( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 63, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 63, %nbits ; not 64 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n10(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n10( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 31, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 31, %nbits ; not 32 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n11(i64 %data, i32 %nbits1, i32 %nbits2) { +; CHECK-LABEL: @n11( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS1:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 
[[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS2:%.*]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits1 ; not %nbits2 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits2 ; not %nbits1 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n12(i64 %data, i32 %nbits1, i32 %nbits2) { +; CHECK-LABEL: @n12( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS1:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW1:%.*]] = sub i32 32, [[NBITS1]] +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW2:%.*]] = sub i32 32, [[NBITS2:%.*]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]]) +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW2]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW2]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits1 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow1 = sub i32 32, %nbits1 ; not %nbits2 + %num_high_bits_to_smear_narrow2 = sub i32 32, %nbits2 ; not %nbits1 + call void @use32(i32 %num_high_bits_to_smear_narrow1) + call void @use32(i32 %num_high_bits_to_smear_narrow2) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow1 + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow2 + ret i32 %signextended +} + +define i32 @n13(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n13( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 
[[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 -1, [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[TMP1]], [[EXTRACTED_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} +define i32 @n13_extrause(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n13_extrause( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[HIGHBITS_CLEANED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[HIGHBITS_CLEANED]]) +; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[HIGHBITS_CLEANED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %highbits_cleaned) + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} +define i32 @n14(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n14( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: 
call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 -1, [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[TMP1]], [[EXTRACTED_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = ashr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} +define i32 @n14_extrause(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n14_extrause( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[HIGHBITS_CLEANED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[HIGHBITS_CLEANED]]) +; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[HIGHBITS_CLEANED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = ashr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %highbits_cleaned) + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} diff --git a/test/tools/dsymutil/cmdline.test b/test/tools/dsymutil/cmdline.test index 60a1a0a2d10f..7e9223c94019 100644 --- a/test/tools/dsymutil/cmdline.test +++ b/test/tools/dsymutil/cmdline.test @@ -1,21 +1,19 @@ RUN: dsymutil -help 2>&1 | FileCheck --check-prefix=HELP %s HELP: OVERVIEW: manipulate archived DWARF debug symbol files. 
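Taking stock of the new InstCombine tests above (t0_exact through n14_extrause): they all exercise one fold, namely that "shift the interesting bits to the bottom, truncate to i32, then shl/ashr by (32 - nbits) to smear the sign bit" computes the same thing as arithmetic-shifting the wide value and truncating. A rough C++ sketch of the two forms, for orientation only (hypothetical helper names; it assumes 1 <= nbits <= 32 and does not model the undef/poison corner cases the vector tests cover):

#include <cstdint>

// Unfolded pattern from the test bodies: lshr + trunc, then a shl/ashr pair
// by (32 - nbits) that smears the sign bit of the extracted field.
int32_t signextract_unfolded(uint64_t data, unsigned nbits) {
  uint32_t extracted = static_cast<uint32_t>(data >> (64 - nbits)); // lshr + trunc
  unsigned smear = 32 - nbits;
  return static_cast<int32_t>(extracted << smear) >> smear;         // shl + ashr
}

// Folded form the CHECK lines expect: ashr the wide value directly, then
// truncate; the shl/ashr pair disappears.
int32_t signextract_folded(uint64_t data, unsigned nbits) {
  return static_cast<int32_t>(static_cast<int64_t>(data) >> (64 - nbits));
}

The n13/n14 variants end in lshr rather than ashr, so instead of a sign extension they are canonicalized to a mask (lshr i32 -1 by the smear amount, then and), as their CHECK lines show; the n9 through n12 tests each deliberately break one precondition (wrong constant or mismatched nbits), so no fold may happen there.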
-HELP: USAGE: dsymutil{{[^ ]*}} [options] +HELP: USAGE: {{.*}}dsymutil{{[^ ]*}} [options] HELP-NOT: -reverse-iterate -HELP: Color Options -HELP: -color -HELP: Specific Options: +HELP: Dsymutil Options: HELP: -accelerator -HELP: -arch= +HELP: -arch HELP: -dump-debug-map HELP: -flat HELP: -minimize HELP: -no-odr HELP: -no-output HELP: -no-swiftmodule-timestamp -HELP: -num-threads= -HELP: -o= -HELP: -oso-prepend-path= +HELP: -num-threads +HELP: -oso-prepend-path +HELP: -o HELP: -papertrail HELP: -symbol-map HELP: -symtab diff --git a/test/tools/llvm-readobj/all.test b/test/tools/llvm-readobj/all.test index 17c5a007adfa..2fef5b842277 100644 --- a/test/tools/llvm-readobj/all.test +++ b/test/tools/llvm-readobj/all.test @@ -1,16 +1,25 @@ -RUN: llvm-readobj -a %p/Inputs/trivial.obj.elf-i386 \ -RUN: | FileCheck %s -check-prefix ALL -RUN: llvm-readobj --all %p/Inputs/trivial.obj.elf-i386 \ -RUN: | FileCheck %s -check-prefix ALL +# RUN: yaml2obj %s -o %t.o +# RUN: llvm-readobj -a %t.o | FileCheck %s --check-prefix ALL +# RUN: llvm-readobj --all %t.o | FileCheck %s --check-prefix ALL -ALL: Format: ELF32-i386 -ALL: Arch: i386 -ALL: AddressSize: 32bit -ALL: LoadName: -ALL: ElfHeader { -ALL: Sections [ -ALL: Relocations [ -ALL: Symbols [ -ALL: ProgramHeaders [ -ALL: Notes [ -ALL: StackSizes [ +# ALL: Format: ELF32-i386 +# ALL: Arch: i386 +# ALL: AddressSize: 32bit +# ALL: LoadName: +# ALL: ElfHeader { +# ALL: Sections [ +# ALL: Relocations [ +# ALL: Symbols [ +# ALL: ProgramHeaders [ +# ALL: Version symbols { +# ALL: SHT_GNU_verdef { +# ALL: SHT_GNU_verneed { +# ALL: Notes [ +# ALL: StackSizes [ + +--- !ELF +FileHeader: + Class: ELFCLASS32 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_386 diff --git a/test/tools/llvm-readobj/elf-section-types.test b/test/tools/llvm-readobj/elf-section-types.test index aad9f43c8a30..20b881249c7f 100644 --- a/test/tools/llvm-readobj/elf-section-types.test +++ b/test/tools/llvm-readobj/elf-section-types.test @@ -196,6 +196,7 @@ Sections: Type: SHT_LLVM_CALL_GRAPH_PROFILE - Name: llvm_addrsig Type: SHT_LLVM_ADDRSIG + Symbols: - Name: .deplibs Type: SHT_LLVM_DEPENDENT_LIBRARIES - Name: .llvm_sympart.f diff --git a/test/tools/llvm-readobj/mips-abiflags.test b/test/tools/llvm-readobj/mips-abiflags.test index c06d147397eb..f014c10340fb 100644 --- a/test/tools/llvm-readobj/mips-abiflags.test +++ b/test/tools/llvm-readobj/mips-abiflags.test @@ -1,6 +1,6 @@ -RUN: llvm-readobj --mips-abi-flags %p/Inputs/abiflags.obj.elf-mipsel | \ +RUN: llvm-readobj -A %p/Inputs/abiflags.obj.elf-mipsel | \ RUN: FileCheck -check-prefix=EL64 %s -RUN: llvm-readobj --mips-abi-flags %p/Inputs/abiflags.obj.elf-mips | \ +RUN: llvm-readobj -A %p/Inputs/abiflags.obj.elf-mips | \ RUN: FileCheck -check-prefix=BE32 %s EL64: MIPS ABI Flags { diff --git a/test/tools/llvm-readobj/mips-got-overlapped.test b/test/tools/llvm-readobj/mips-got-overlapped.test index 85c4fe2d67c1..881c63b79a4f 100644 --- a/test/tools/llvm-readobj/mips-got-overlapped.test +++ b/test/tools/llvm-readobj/mips-got-overlapped.test @@ -1,9 +1,9 @@ -# Check that llvm-readobj --mips-plt-got correctly shows .got section +# Check that llvm-readobj -A correctly shows .got section # content if there are some other zero-sized sections with the same # address as the .got. got-over.exe.elf-mips has zero-sized .data # section at the same offset .got section. -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-over.exe.elf-mips | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/got-over.exe.elf-mips | FileCheck %s GOT-OBJ: Cannot find PLTGOT dynamic table tag. 
diff --git a/test/tools/llvm-readobj/mips-got.test b/test/tools/llvm-readobj/mips-got.test index e6e21ad6aca2..8ed35d4b68e2 100644 --- a/test/tools/llvm-readobj/mips-got.test +++ b/test/tools/llvm-readobj/mips-got.test @@ -1,31 +1,25 @@ -RUN: not llvm-readobj --mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \ -RUN: FileCheck %s -DFILE=%p/Inputs/relocs.obj.elf-mips -check-prefix GOT-OBJ -RUN: llvm-readobj --mips-plt-got %p/Inputs/dynamic-table-exe.mips | \ +RUN: llvm-readobj -A %p/Inputs/dynamic-table-exe.mips | \ RUN: FileCheck %s -check-prefix GOT-EXE -RUN: llvm-readobj --mips-plt-got %p/Inputs/dynamic-table-so.mips | \ +RUN: llvm-readobj -A %p/Inputs/dynamic-table-so.mips | \ RUN: FileCheck %s -check-prefix GOT-SO -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-tls.so.elf-mips64el | \ +RUN: llvm-readobj -A %p/Inputs/got-tls.so.elf-mips64el | \ RUN: FileCheck %s -check-prefix GOT-TLS -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-empty.exe.mipsel | \ +RUN: llvm-readobj -A %p/Inputs/got-empty.exe.mipsel | \ RUN: FileCheck %s -check-prefix GOT-EMPTY -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-static.exe.mips | \ +RUN: llvm-readobj -A %p/Inputs/got-static.exe.mips | \ RUN: FileCheck %s -check-prefix GOT-STATIC -RUN: not llvm-readelf --mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \ -RUN: FileCheck %s -DFILE=%p/Inputs/relocs.obj.elf-mips -check-prefix GNU-GOT-OBJ -RUN: llvm-readelf --mips-plt-got %p/Inputs/dynamic-table-exe.mips | \ +RUN: llvm-readelf -A %p/Inputs/dynamic-table-exe.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-EXE -RUN: llvm-readelf --mips-plt-got %p/Inputs/dynamic-table-so.mips | \ +RUN: llvm-readelf -A %p/Inputs/dynamic-table-so.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-SO -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-tls.so.elf-mips64el | \ +RUN: llvm-readelf -A %p/Inputs/got-tls.so.elf-mips64el | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-TLS -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-empty.exe.mipsel | \ +RUN: llvm-readelf -A %p/Inputs/got-empty.exe.mipsel | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-EMPTY -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-static.exe.mips | \ +RUN: llvm-readelf -A %p/Inputs/got-static.exe.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-STATIC -GOT-OBJ: error: '[[FILE]]': Cannot find .got section - GOT-EXE: Primary GOT { GOT-EXE-NEXT: Canonical gp value: 0x418880 GOT-EXE-NEXT: Reserved entries [ @@ -380,8 +374,6 @@ GOT-STATIC-NEXT: } GOT-STATIC-NEXT: ] GOT-STATIC-NEXT: } -GNU-GOT-OBJ: error: '[[FILE]]': Cannot find .got section - GNU-GOT-EXE: Primary GOT: GNU-GOT-EXE-NEXT: Canonical gp value: 00418880 diff --git a/test/tools/llvm-readobj/mips-options-sec.test b/test/tools/llvm-readobj/mips-options-sec.test index 64b3f0e91795..3636d56cfe6e 100644 --- a/test/tools/llvm-readobj/mips-options-sec.test +++ b/test/tools/llvm-readobj/mips-options-sec.test @@ -1,4 +1,4 @@ -RUN: llvm-readobj --mips-options %p/Inputs/options.obj.elf-mipsel | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/options.obj.elf-mipsel | FileCheck %s CHECK: MIPS Options { CHECK-NEXT: ODK_REGINFO { diff --git a/test/tools/llvm-readobj/mips-plt.test b/test/tools/llvm-readobj/mips-plt.test index b130a67d0443..4e40ca6aa2c1 100644 --- a/test/tools/llvm-readobj/mips-plt.test +++ b/test/tools/llvm-readobj/mips-plt.test @@ -1,5 +1,5 @@ -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s -RUN: llvm-readelf --mips-plt-got 
%p/Inputs/got-plt.exe.elf-mipsel | FileCheck --check-prefix=GNU %s +RUN: llvm-readobj -A %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s +RUN: llvm-readelf -A %p/Inputs/got-plt.exe.elf-mipsel | FileCheck --check-prefix=GNU %s CHECK: PLT GOT { CHECK-NEXT: Reserved entries [ diff --git a/test/tools/llvm-readobj/mips-reginfo.test b/test/tools/llvm-readobj/mips-reginfo.test index 7571d4c56bf0..20177a99d8cb 100644 --- a/test/tools/llvm-readobj/mips-reginfo.test +++ b/test/tools/llvm-readobj/mips-reginfo.test @@ -1,4 +1,4 @@ -RUN: llvm-readobj --mips-reginfo %p/Inputs/reginfo.obj.elf-mipsel | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/reginfo.obj.elf-mipsel | FileCheck %s CHECK: MIPS RegInfo { CHECK-NEXT: GP: 0x7FEF diff --git a/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml b/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml new file mode 100644 index 000000000000..6f21c3212bd9 --- /dev/null +++ b/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml @@ -0,0 +1,98 @@ +## Check how obj2yaml dumps the SHT_LLVM_ADDRSIG section. + +## Check that when possible obj2yaml tries to produce the "Name" tag when +## dumping entries of the SHT_LLVM_ADDRSIG section. It falls back to producing +## the "Index" tag when it can't match a symbol index with a symbol table entry. + +# RUN: yaml2obj --docnum=1 %s -o %t1 +# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=NAME + +# NAME: - Name: .llvm_addrsig +# NAME-NEXT: Type: SHT_LLVM_ADDRSIG +# NAME-NEXT: Link: .symtab +# NAME-NEXT: Symbols: +# NAME-NEXT: - Name: foo +# NAME-NEXT: - Name: bar +# NAME-NEXT: - Index: 0x00000003 +# NAME-NEXT: - Index: 0xFFFFFFFF +# NAME: - Name: .llvm_addrsig_unlinked +# NAME-NEXT: Type: SHT_LLVM_ADDRSIG +# NAME-NEXT: Symbols: +# NAME-NEXT: - Index: 0x00000001 +# NAME-NEXT: - Index: 0x00000002 +# NAME-NEXT: - Index: 0x00000003 +# NAME-NEXT: - Index: 0xFFFFFFFF + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 1 + - Index: 2 + - Index: 3 + - Index: 0xFFFFFFFF + - Name: .llvm_addrsig_unlinked + Type: SHT_LLVM_ADDRSIG + Link: 0 + Symbols: + - Index: 1 + - Index: 2 + - Index: 3 + - Index: 0xFFFFFFFF +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: bar + Type: STT_FUNC + Binding: STB_GLOBAL + +## Check that obj2yaml dumps the SHT_LLVM_ADDRSIG section +## data using the "Content" tag when at least one of the entries is broken, +## e.g. because the entry contains a malformed uleb128 value. + +# RUN: yaml2obj --docnum=2 %s -o %t2 +# RUN: obj2yaml %t2 | FileCheck %s --check-prefix=INVALID-ENTRY + +# INVALID-ENTRY: - Name: .llvm_addrsig +# INVALID-ENTRY-NEXT: Type: SHT_LLVM_ADDRSIG +# INVALID-ENTRY-NEXT: Link: .symtab +# INVALID-ENTRY-NEXT: Content: FFFFFFFFFF + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "FFFFFFFFFF" + +## obj2yaml produces a "Symbols" tag when describing an empty SHT_LLVM_ADDRSIG section. 
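To make the INVALID-ENTRY case above concrete: the body of an SHT_LLVM_ADDRSIG section is a sequence of ULEB128-encoded symbol indices, and the payload FFFFFFFFFF can never decode because every byte has the continuation bit (0x80) set, so the value is still unterminated when the section data runs out. A minimal decoder sketch in plain C++ (illustration only; the real obj2yaml code presumably goes through LLVM's own LEB128/DataExtractor helpers rather than anything like this):

#include <cstdint>
#include <optional>

// Decode one ULEB128 value from [p, end). Returns std::nullopt when the value
// is truncated (no byte with the continuation bit clear before `end`) or when
// the encoding is longer than a 64-bit value can hold.
std::optional<uint64_t> decodeOneULEB128(const uint8_t *&p, const uint8_t *end) {
  uint64_t value = 0;
  unsigned shift = 0;
  while (p != end) {
    if (shift >= 64)
      return std::nullopt;   // overlong encoding
    uint8_t byte = *p++;
    value |= uint64_t(byte & 0x7F) << shift;
    if ((byte & 0x80) == 0)
      return value;          // terminating byte found
    shift += 7;
  }
  return std::nullopt;       // ran off the end of the section data
}

For the five 0xFF bytes above, the loop consumes the whole buffer without ever seeing a terminating byte, which is exactly why obj2yaml gives up on the "Symbols" representation and dumps raw "Content" instead.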
+ +# RUN: yaml2obj --docnum=3 %s -o %t3 +# RUN: obj2yaml %t3 | FileCheck %s --check-prefix=EMPTY + +# EMPTY: - Name: .llvm_addrsig +# EMPTY-NEXT: Type: SHT_LLVM_ADDRSIG +# EMPTY-NEXT: Link: .symtab +# EMPTY-NEXT: Symbols: [] + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "" diff --git a/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml new file mode 100644 index 000000000000..1433d6dbc13e --- /dev/null +++ b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml @@ -0,0 +1,307 @@ +## Check how yaml2obj produces SHT_LLVM_ADDRSIG sections. + +## Check we can describe SHT_LLVM_ADDRSIG using the "Symbols" tag. We can define +## symbols either using names or indexes. + +# RUN: yaml2obj --docnum=1 %s -o %t1 +# RUN: llvm-readobj --sections --section-data %t1 | FileCheck %s --check-prefix=SYMBOLS + +# SYMBOLS: Section { +# SYMBOLS: Index: 1 +# SYMBOLS-NEXT: Name: .llvm_addrsig +# SYMBOLS-NEXT: Type: SHT_LLVM_ADDRSIG +# SYMBOLS-NEXT: Flags [ +# SYMBOLS-NEXT: ] +# SYMBOLS-NEXT: Address: 0x0 +# SYMBOLS-NEXT: Offset: 0x40 +# SYMBOLS-NEXT: Size: 4 +# SYMBOLS-NEXT: Link: 2 +# SYMBOLS-NEXT: Info: 0 +# SYMBOLS-NEXT: AddressAlignment: 0 +# SYMBOLS-NEXT: EntrySize: 0 +# SYMBOLS-NEXT: SectionData ( +# SYMBOLS-NEXT: 0000: 01020102 +# SYMBOLS-NEXT: ) +# SYMBOLS-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar + - Index: 1 + - Index: 2 +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: bar + Type: STT_FUNC + Binding: STB_GLOBAL + +## We can't specify both "Index" and "Name" when defining a symbol. + +# RUN: not yaml2obj --docnum=2 %s 2>&1 | FileCheck %s --check-prefix=INDEX-NAME + +# INDEX-NAME: error: "Index" and "Name" cannot be used together when defining a symbol + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + Index: 1 +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + +## Check we report an error if an unknown symbol is referenced in the +## SHT_LLVM_ADDRSIG section description. + +# RUN: not yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-UNKNOWN + +# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'foo' by YAML section '.llvm_addrsig' +# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'bar' by YAML section '.llvm_addrsig' + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar + +## Check we can specify any arbitrary symbol indices. + +# RUN: yaml2obj --docnum=4 %s -o %t4 +# RUN: llvm-readobj --sections --section-data %t4 | FileCheck %s --check-prefix=SYMBOL-INDEX + +# SYMBOL-INDEX: Type: SHT_LLVM_ADDRSIG +# SYMBOL-INDEX: SectionData ( +# SYMBOL-INDEX-NEXT: 0000: 00FF01C4 E6888901 FFFFFFFF 0F +# SYMBOL-INDEX-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 0 + - Index: 255 + - Index: 0x11223344 +## 0xFFFFFFFF is a maximum allowed index value. 
+ - Index: 0xFFFFFFFF + +## Check that the maximum symbol index size is 32 bits. + +# RUN: not yaml2obj --docnum=5 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-INDEX-OVERFLOW + +# SYMBOL-INDEX-OVERFLOW: error: out of range hex32 number + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 0x1122334455 + +## Check we can use the "Content" tag to specify any data for SHT_LLVM_ADDRSIG sections. + +# RUN: yaml2obj --docnum=6 %s -o %t6 +# RUN: llvm-readobj --sections --section-data %t6 | FileCheck %s --check-prefix=CONTENT + +# CONTENT: Type: SHT_LLVM_ADDRSIG +# CONTENT: Size: +# CONTENT-SAME: 5 +# CONTENT: SectionData ( +# CONTENT-NEXT: 0000: 11223344 55 +# CONTENT-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "1122334455" + +## Either "Content" or "Symbols" must be specifed for SHT_LLVM_ADDRSIG sections. + +# RUN: not yaml2obj --docnum=7 %s 2>&1 | FileCheck %s --check-prefix=NO-TAGS + +# NO-TAGS: error: one of "Content", "Size" or "Symbols" must be specified + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + +## "Content" and "Symbols" cannot be used together to describe the SHT_LLVM_ADDRSIG section. + +# RUN: not yaml2obj --docnum=8 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS + +# CONTENT-SYMBOLS: "Symbols" cannot be used with "Content" or "Size" + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "" + Symbols: + +## Check we can set an arbitrary sh_link value for SHT_LLVM_ADDRSIG sections. + +# RUN: yaml2obj --docnum=9 %s -o %t9 +# RUN: llvm-readobj --sections %t9 | FileCheck %s --check-prefix=LINK + +# LINK: Name: .llvm_addrsig +# LINK: Link: +# LINK-SAME: 123{{$}} + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Link: 123 + Content: "" + +## Check we can use only "Size" to create a SHT_LLVM_ADDRSIG section. + +# RUN: yaml2obj --docnum=10 %s -o %t10 +# RUN: llvm-readobj --sections --section-data %t10 | FileCheck %s --check-prefix=SIZE + +# SIZE: Name: .llvm_addrsig +# SIZE: Size: +# SIZE-SAME: 17 +# SIZE: SectionData ( +# SIZE-NEXT: 0000: 00000000 00000000 00000000 00000000 | +# SIZE-NEXT: 0010: 00 | +# SIZE-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x11 + +## Check we can use "Size" and "Content" together to create a SHT_LLVM_ADDRSIG section. 
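Relatedly, the SectionData bytes that the SYMBOL-INDEX check earlier in this file expects (00FF01C4 E6888901 FFFFFFFF 0F) are simply the ULEB128 encodings of the indices 0, 255, 0x11223344 and 0xFFFFFFFF laid end to end. A standalone C++ sketch that reproduces that byte string (for illustration; yaml2obj itself presumably emits these through LLVM's encodeULEB128 helper):

#include <cstdint>
#include <cstdio>
#include <vector>

// Append the ULEB128 encoding of `value` to `out`.
void appendULEB128(uint64_t value, std::vector<uint8_t> &out) {
  do {
    uint8_t byte = value & 0x7F;
    value >>= 7;
    if (value != 0)
      byte |= 0x80;          // more bytes follow
    out.push_back(byte);
  } while (value != 0);
}

int main() {
  std::vector<uint8_t> section;
  for (uint64_t index : {0ull, 255ull, 0x11223344ull, 0xFFFFFFFFull})
    appendULEB128(index, section);
  for (uint8_t b : section)
    std::printf("%02X", b);  // prints 00FF01C4E6888901FFFFFFFF0F
  std::printf("\n");
  return 0;
}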
+ +# RUN: yaml2obj --docnum=11 %s -o %t11 +# RUN: llvm-readobj --sections --section-data %t11 | FileCheck %s --check-prefix=SIZE-CONTENT + +# SIZE-CONTENT: Name: .llvm_addrsig_sizegr +# SIZE-CONTENT: Size: +# SIZE-CONTENT-SAME: 5 +# SIZE-CONTENT: SectionData ( +# SIZE-CONTENT-NEXT: 0000: 11223300 00 | +# SIZE-CONTENT-NEXT: ) + +# SIZE-CONTENT: Name: .llvm_addrsig_sizeeq +# SIZE-CONTENT: Size: +# SIZE-CONTENT-SAME: 3 +# SIZE-CONTENT: SectionData ( +# SIZE-CONTENT-NEXT: 0000: 112233 | +# SIZE-CONTENT-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig_sizegr + Type: SHT_LLVM_ADDRSIG + Size: 0x5 + Content: "112233" + - Name: .llvm_addrsig_sizeeq + Type: SHT_LLVM_ADDRSIG + Size: 0x3 + Content: "112233" + +## Check that when "Size" and "Content" are used together, the size +## must be greater than or equal to the content size. + +# RUN: not yaml2obj --docnum=12 %s 2>&1 | FileCheck %s --check-prefix=SIZE-CONTENT-ERR + +# SIZE-CONTENT-ERR: error: "Size" must be greater than or equal to the content size + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x1 + Content: "1122" + +## Check we can't use "Size" and "Symbols" tags together. + +# RUN: not yaml2obj --docnum=13 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x1 + Symbols: [ ] diff --git a/tools/dsymutil/CMakeLists.txt b/tools/dsymutil/CMakeLists.txt index 19865e3d20e1..f88e6db62c38 100644 --- a/tools/dsymutil/CMakeLists.txt +++ b/tools/dsymutil/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(DsymutilTableGen) + set(LLVM_LINK_COMPONENTS AllTargetsAsmPrinters AllTargetsCodeGens @@ -7,6 +11,7 @@ set(LLVM_LINK_COMPONENTS DebugInfoDWARF MC Object + Option Support Target ) @@ -27,6 +32,7 @@ add_llvm_tool(dsymutil DEPENDS intrinsics_gen + ${tablegen_deps} ) if(APPLE) diff --git a/tools/dsymutil/Options.td b/tools/dsymutil/Options.td new file mode 100644 index 000000000000..c2114c86a1a3 --- /dev/null +++ b/tools/dsymutil/Options.td @@ -0,0 +1,146 @@ +include "llvm/Option/OptParser.td" + +class F: Flag<["--", "-"], name>; + +def grp_general : OptionGroup<"Dsymutil">, HelpText<"Dsymutil Options">; + +def help: F<"help">, + HelpText<"Prints this help output.">, + Group; +def: Flag<["-"], "h">, + Alias, + HelpText<"Alias for --help">, + Group; + +def version: F<"version">, + HelpText<"Prints the dsymutil version.">, + Group; +def: Flag<["-"], "v">, + Alias, + HelpText<"Alias for --version">, + Group; + +def verbose: F<"verbose">, + HelpText<"Enable verbose mode.">, + Group; + +def verify: F<"verify">, + HelpText<"Run the DWARF verifier on the linked DWARF debug info.">, + Group; + +def no_output: F<"no-output">, + HelpText<"Do the link in memory, but do not emit the result file.">, + Group; + +def no_swiftmodule_timestamp: F<"no-swiftmodule-timestamp">, + HelpText<"Don't check timestamp for swiftmodule files.">, + Group; + +def no_odr: F<"no-odr">, + HelpText<"Do not use ODR (One Definition Rule) for type uniquing.">, + Group; + +def dump_debug_map: F<"dump-debug-map">, + HelpText<"Parse and dump the debug map to standard output. 
Not DWARF link will take place.">, + Group; + +def yaml_input: F<"y">, + HelpText<"Treat the input file is a YAML debug map rather than a binary.">, + Group; + +def papertrail: F<"papertrail">, + HelpText<"Embed warnings in the linked DWARF debug info.">, + Group; + +def assembly: F<"S">, + HelpText<"Output textual assembly instead of a binary dSYM companion file.">, + Group; + +def symtab: F<"symtab">, + HelpText<"Dumps the symbol table found in executable or object file(s) and exits.">, + Group; +def: Flag<["-"], "s">, + Alias, + HelpText<"Alias for --symtab">, + Group; + +def flat: F<"flat">, + HelpText<"Produce a flat dSYM file (not a bundle).">, + Group; +def: Flag<["-"], "f">, + Alias, + HelpText<"Alias for --flat">, + Group; + +def minimize: F<"minimize">, + HelpText<"When used when creating a dSYM file with Apple accelerator tables, " + "this option will suppress the emission of the .debug_inlines, " + ".debug_pubnames, and .debug_pubtypes sections since dsymutil " + "has better equivalents: .apple_names and .apple_types. When used in " + "conjunction with --update option, this option will cause redundant " + "accelerator tables to be removed.">, + Group; +def: Flag<["-"], "z">, + Alias, + HelpText<"Alias for --minimize">, + Group; + +def update: F<"update">, + HelpText<"Updates existing dSYM files to contain the latest accelerator tables and other DWARF optimizations.">, + Group; +def: Flag<["-"], "u">, + Alias, + HelpText<"Alias for --update">, + Group; + +def output: Separate<["--", "-"], "o">, + MetaVarName<"">, + HelpText<"Specify the output file. Defaults to .dwarf">, + Group; +def: Separate<["-"], "out">, + Alias, + HelpText<"Alias for --o">, + Group; +def: Joined<["--", "-"], "out=">, Alias; +def: Joined<["--", "-"], "o=">, Alias; + +def oso_prepend_path: Separate<["--", "-"], "oso-prepend-path">, + MetaVarName<"">, + HelpText<"Specify a directory to prepend to the paths of object files.">, + Group; +def: Joined<["--", "-"], "oso-prepend-path=">, Alias; + +def symbolmap: Separate<["--", "-"], "symbol-map">, + MetaVarName<"">, + HelpText<"Updates the existing dSYMs inplace using symbol map specified.">, + Group; +def: Joined<["--", "-"], "symbol-map=">, Alias; + +def arch: Separate<["--", "-"], "arch">, + MetaVarName<"">, + HelpText<"Link DWARF debug information only for specified CPU architecture" + "types. This option can be specified multiple times, once for each" + "desired architecture. All CPU architectures will be linked by" + "default.">, + Group; +def: Joined<["--", "-"], "arch=">, Alias; + +def accelerator: Separate<["--", "-"], "accelerator">, + MetaVarName<"">, + HelpText<"Specify the desired type of accelerator table. 
Valid options are 'Apple', 'Dwarf' and 'Default'">, + Group; +def: Joined<["--", "-"], "accelerator=">, Alias; + +def toolchain: Separate<["--", "-"], "toolchain">, + MetaVarName<"">, + HelpText<"Embed toolchain information in dSYM bundle.">, + Group; + +def threads: Separate<["--", "-"], "num-threads">, + MetaVarName<"">, + HelpText<"Specifies the maximum number of simultaneous threads to use when linking multiple architectures.">, + Group; +def: Separate<["-"], "j">, + Alias, + HelpText<"Alias for --num-threads">, + Group; diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp index bf42ec73269c..fe69abed0a82 100644 --- a/tools/dsymutil/dsymutil.cpp +++ b/tools/dsymutil/dsymutil.cpp @@ -20,12 +20,16 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFVerifier.h" #include "llvm/Object/Binary.h" #include "llvm/Object/MachO.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/InitLLVM.h" @@ -43,142 +47,232 @@ #include using namespace llvm; -using namespace llvm::cl; using namespace llvm::dsymutil; using namespace object; -static OptionCategory DsymCategory("Specific Options"); -static opt Help("h", desc("Alias for -help"), Hidden); -static opt Version("v", desc("Alias for -version"), Hidden); - -static list InputFiles(Positional, OneOrMore, - desc(""), cat(DsymCategory)); - -static opt - OutputFileOpt("o", - desc("Specify the output file. default: .dwarf"), - value_desc("filename"), cat(DsymCategory)); -static alias OutputFileOptA("out", desc("Alias for -o"), - aliasopt(OutputFileOpt)); - -static opt OsoPrependPath( - "oso-prepend-path", - desc("Specify a directory to prepend to the paths of object files."), - value_desc("path"), cat(DsymCategory)); - -static opt Assembly( - "S", - desc("Output textual assembly instead of a binary dSYM companion file."), - init(false), cat(DsymCategory), cl::Hidden); - -static opt DumpStab( - "symtab", - desc("Dumps the symbol table found in executable or object file(s) and\n" - "exits."), - init(false), cat(DsymCategory)); -static alias DumpStabA("s", desc("Alias for --symtab"), aliasopt(DumpStab)); - -static opt FlatOut("flat", - desc("Produce a flat dSYM file (not a bundle)."), - init(false), cat(DsymCategory)); -static alias FlatOutA("f", desc("Alias for --flat"), aliasopt(FlatOut)); - -static opt Minimize( - "minimize", - desc("When used when creating a dSYM file with Apple accelerator tables,\n" - "this option will suppress the emission of the .debug_inlines, \n" - ".debug_pubnames, and .debug_pubtypes sections since dsymutil \n" - "has better equivalents: .apple_names and .apple_types. 
When used in\n" - "conjunction with --update option, this option will cause redundant\n" - "accelerator tables to be removed."), - init(false), cat(DsymCategory)); -static alias MinimizeA("z", desc("Alias for --minimize"), aliasopt(Minimize)); - -static opt Update( - "update", - desc("Updates existing dSYM files to contain the latest accelerator\n" - "tables and other DWARF optimizations."), - init(false), cat(DsymCategory)); -static alias UpdateA("u", desc("Alias for --update"), aliasopt(Update)); - -static opt SymbolMap( - "symbol-map", - desc("Updates the existing dSYMs inplace using symbol map specified."), - value_desc("bcsymbolmap"), cat(DsymCategory)); - -static cl::opt AcceleratorTable( - "accelerator", cl::desc("Output accelerator tables."), - cl::values(clEnumValN(AccelTableKind::Default, "Default", - "Default for input."), - clEnumValN(AccelTableKind::Apple, "Apple", "Apple"), - clEnumValN(AccelTableKind::Dwarf, "Dwarf", "DWARF")), - cl::init(AccelTableKind::Default), cat(DsymCategory)); - -static opt NumThreads( - "num-threads", - desc("Specifies the maximum number (n) of simultaneous threads to use\n" - "when linking multiple architectures."), - value_desc("n"), init(0), cat(DsymCategory)); -static alias NumThreadsA("j", desc("Alias for --num-threads"), - aliasopt(NumThreads)); - -static opt Verbose("verbose", desc("Verbosity level"), init(false), - cat(DsymCategory)); - -static opt - NoOutput("no-output", - desc("Do the link in memory, but do not emit the result file."), - init(false), cat(DsymCategory)); - -static opt - NoTimestamp("no-swiftmodule-timestamp", - desc("Don't check timestamp for swiftmodule files."), - init(false), cat(DsymCategory)); - -static list ArchFlags( - "arch", - desc("Link DWARF debug information only for specified CPU architecture\n" - "types. This option can be specified multiple times, once for each\n" - "desired architecture. All CPU architectures will be linked by\n" - "default."), - value_desc("arch"), ZeroOrMore, cat(DsymCategory)); - -static opt - NoODR("no-odr", - desc("Do not use ODR (One Definition Rule) for type uniquing."), - init(false), cat(DsymCategory)); - -static opt DumpDebugMap( - "dump-debug-map", - desc("Parse and dump the debug map to standard output. Not DWARF link " - "will take place."), - init(false), cat(DsymCategory)); - -static opt InputIsYAMLDebugMap( - "y", desc("Treat the input file is a YAML debug map rather than a binary."), - init(false), cat(DsymCategory)); - -static opt Verify("verify", desc("Verify the linked DWARF debug info."), - cat(DsymCategory)); - -static opt - Toolchain("toolchain", desc("Embed toolchain information in dSYM bundle."), - cat(DsymCategory)); - -static opt - PaperTrailWarnings("papertrail", - desc("Embed warnings in the linked DWARF debug info."), - cat(DsymCategory)); - -static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) { - if (NoOutput) - return Error::success(); +namespace { +enum ID { + OPT_INVALID = 0, // This is not an option ID. 
+#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, +#include "Options.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Options.inc" +#undef PREFIX + +const opt::OptTable::Info InfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + { \ + PREFIX, NAME, HELPTEXT, \ + METAVAR, OPT_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, \ + OPT_##ALIAS, ALIASARGS, VALUES}, +#include "Options.inc" +#undef OPTION +}; + +class DsymutilOptTable : public opt::OptTable { +public: + DsymutilOptTable() : OptTable(InfoTable) {} +}; +} // namespace + +struct DsymutilOptions { + bool DumpDebugMap = false; + bool DumpStab = false; + bool Flat = false; + bool InputIsYAMLDebugMap = false; + bool PaperTrailWarnings = false; + bool Verify = false; + std::string SymbolMap; + std::string OutputFile; + std::string Toolchain; + std::vector Archs; + std::vector InputFiles; + unsigned NumThreads; + dsymutil::LinkOptions LinkOpts; +}; + +/// Return a list of input files. This function has logic for dealing with the +/// special case where we might have dSYM bundles as input. The function +/// returns an error when the directory structure doesn't match that of a dSYM +/// bundle. +static Expected> getInputs(opt::InputArgList &Args, + bool DsymAsInput) { + std::vector InputFiles; + for (auto *File : Args.filtered(OPT_INPUT)) + InputFiles.push_back(File->getValue()); + + if (!DsymAsInput) + return InputFiles; + + // If we are updating, we might get dSYM bundles as input. + std::vector Inputs; + for (const auto &Input : InputFiles) { + if (!sys::fs::is_directory(Input)) { + Inputs.push_back(Input); + continue; + } + + // Make sure that we're dealing with a dSYM bundle. + SmallString<256> BundlePath(Input); + sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); + if (!sys::fs::is_directory(BundlePath)) + return make_error( + Input + " is a directory, but doesn't look like a dSYM bundle.", + inconvertibleErrorCode()); + + // Create a directory iterator to iterate over all the entries in the + // bundle. + std::error_code EC; + sys::fs::directory_iterator DirIt(BundlePath, EC); + sys::fs::directory_iterator DirEnd; + if (EC) + return errorCodeToError(EC); + + // Add each entry to the list of inputs. + while (DirIt != DirEnd) { + Inputs.push_back(DirIt->path()); + DirIt.increment(EC); + if (EC) + return errorCodeToError(EC); + } + } + return Inputs; +} + +// Verify that the given combination of options makes sense. +static Error verifyOptions(const DsymutilOptions &Options) { + if (Options.LinkOpts.Update && + std::find(Options.InputFiles.begin(), Options.InputFiles.end(), "-") != + Options.InputFiles.end()) { + // FIXME: We cannot use stdin for an update because stdin will be + // consumed by the BinaryHolder during the debugmap parsing, and + // then we will want to consume it again in DwarfLinker. If we + // used a unique BinaryHolder object that could cache multiple + // binaries this restriction would go away. 
+ return make_error( + "standard input cannot be used as input for a dSYM update.", + errc::invalid_argument); + } + + if (!Options.Flat && Options.OutputFile == "-") + return make_error( + "cannot emit to standard output without --flat.", + errc::invalid_argument); + + if (Options.InputFiles.size() > 1 && Options.Flat && + !Options.OutputFile.empty()) + return make_error( + "cannot use -o with multiple inputs in flat mode.", + errc::invalid_argument); + + if (Options.PaperTrailWarnings && Options.InputIsYAMLDebugMap) + return make_error( + "paper trail warnings are not supported for YAML input.", + errc::invalid_argument); + + return Error::success(); +} + +static Expected getAccelTableKind(opt::InputArgList &Args) { + if (opt::Arg *Accelerator = Args.getLastArg(OPT_accelerator)) { + StringRef S = Accelerator->getValue(); + if (S == "Apple") + return AccelTableKind::Apple; + if (S == "Dwarf") + return AccelTableKind::Dwarf; + if (S == "Default") + return AccelTableKind::Default; + return make_error( + "invalid accelerator type specified: '" + S + + "'. Support values are 'Apple', 'Dwarf' and 'Default'.", + inconvertibleErrorCode()); + } + return AccelTableKind::Default; +} + +/// Parses the command line options into the LinkOptions struct and performs +/// some sanity checking. Returns an error in case the latter fails. +static Expected getOptions(opt::InputArgList &Args) { + DsymutilOptions Options; + + Options.DumpDebugMap = Args.hasArg(OPT_dump_debug_map); + Options.DumpStab = Args.hasArg(OPT_symtab); + Options.Flat = Args.hasArg(OPT_flat); + Options.InputIsYAMLDebugMap = Args.hasArg(OPT_yaml_input); + Options.PaperTrailWarnings = Args.hasArg(OPT_papertrail); + Options.Verify = Args.hasArg(OPT_verify); + + Options.LinkOpts.Minimize = Args.hasArg(OPT_minimize); + Options.LinkOpts.NoODR = Args.hasArg(OPT_no_odr); + Options.LinkOpts.NoOutput = Args.hasArg(OPT_no_output); + Options.LinkOpts.NoTimestamp = Args.hasArg(OPT_no_swiftmodule_timestamp); + Options.LinkOpts.Update = Args.hasArg(OPT_update); + Options.LinkOpts.Verbose = Args.hasArg(OPT_verbose); + + if (Expected AccelKind = getAccelTableKind(Args)) { + Options.LinkOpts.TheAccelTableKind = *AccelKind; + } else { + return AccelKind.takeError(); + } + + if (opt::Arg *SymbolMap = Args.getLastArg(OPT_symbolmap)) + Options.SymbolMap = SymbolMap->getValue(); + + if (Args.hasArg(OPT_symbolmap)) + Options.LinkOpts.Update = true; + + if (Expected> InputFiles = + getInputs(Args, Options.LinkOpts.Update)) { + Options.InputFiles = std::move(*InputFiles); + } else { + return InputFiles.takeError(); + } + + for (auto *Arch : Args.filtered(OPT_arch)) + Options.Archs.push_back(Arch->getValue()); + + if (opt::Arg *OsoPrependPath = Args.getLastArg(OPT_oso_prepend_path)) + Options.LinkOpts.PrependPath = OsoPrependPath->getValue(); + + if (opt::Arg *OutputFile = Args.getLastArg(OPT_output)) + Options.OutputFile = OutputFile->getValue(); + + if (opt::Arg *Toolchain = Args.getLastArg(OPT_toolchain)) + Options.Toolchain = Toolchain->getValue(); + + if (Args.hasArg(OPT_assembly)) + Options.LinkOpts.FileType = OutputFileType::Assembly; + + if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads)) + Options.LinkOpts.Threads = atoi(NumThreads->getValue()); + else + Options.LinkOpts.Threads = thread::hardware_concurrency(); + if (Options.DumpDebugMap || Options.LinkOpts.Verbose) + Options.LinkOpts.Threads = 1; + + if (getenv("RC_DEBUG_OPTIONS")) + Options.PaperTrailWarnings = true; + + if (Error E = verifyOptions(Options)) + return std::move(E); + return 
+
+static Error createPlistFile(StringRef Bin, StringRef BundleRoot,
+                             StringRef Toolchain) {
   // Create plist file to write to.
-  llvm::SmallString<128> InfoPlist(BundleRoot);
-  llvm::sys::path::append(InfoPlist, "Contents/Info.plist");
+  SmallString<128> InfoPlist(BundleRoot);
+  sys::path::append(InfoPlist, "Contents/Info.plist");
   std::error_code EC;
-  llvm::raw_fd_ostream PL(InfoPlist, EC, llvm::sys::fs::OF_Text);
+  raw_fd_ostream PL(InfoPlist, EC, sys::fs::OF_Text);
   if (EC)
     return make_error<StringError>(
         "cannot create Plist: " + toString(errorCodeToError(EC)), EC);
@@ -186,9 +280,9 @@ static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) {
   CFBundleInfo BI = getBundleInfo(Bin);
   if (BI.IDStr.empty()) {
-    llvm::StringRef BundleID = *llvm::sys::path::rbegin(BundleRoot);
-    if (llvm::sys::path::extension(BundleRoot) == ".dSYM")
-      BI.IDStr = llvm::sys::path::stem(BundleID);
+    StringRef BundleID = *sys::path::rbegin(BundleRoot);
+    if (sys::path::extension(BundleRoot) == ".dSYM")
+      BI.IDStr = sys::path::stem(BundleID);
     else
       BI.IDStr = BundleID;
   }
@@ -236,21 +330,18 @@ static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) {
   return Error::success();
 }
 
-static Error createBundleDir(llvm::StringRef BundleBase) {
-  if (NoOutput)
-    return Error::success();
-
-  llvm::SmallString<128> Bundle(BundleBase);
-  llvm::sys::path::append(Bundle, "Contents", "Resources", "DWARF");
+static Error createBundleDir(StringRef BundleBase) {
+  SmallString<128> Bundle(BundleBase);
+  sys::path::append(Bundle, "Contents", "Resources", "DWARF");
   if (std::error_code EC =
-          create_directories(Bundle.str(), true, llvm::sys::fs::perms::all_all))
+          create_directories(Bundle.str(), true, sys::fs::perms::all_all))
     return make_error<StringError>(
         "cannot create bundle: " + toString(errorCodeToError(EC)), EC);
   return Error::success();
 }
 
-static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) {
+static bool verify(StringRef OutputFile, StringRef Arch, bool Verbose) {
   if (OutputFile == "-") {
     WithColor::warning() << "verification skipped for " << Arch
                          << " because writing to stdout.\n";
@@ -280,33 +371,34 @@ static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) {
 namespace {
 struct OutputLocation {
-  OutputLocation(std::string DWARFFile,
-                 llvm::Optional<std::string> ResourceDir = {})
+  OutputLocation(std::string DWARFFile, Optional<std::string> ResourceDir = {})
       : DWARFFile(DWARFFile), ResourceDir(ResourceDir) {}
   /// This method is a workaround for older compilers.
-  llvm::Optional<std::string> getResourceDir() const { return ResourceDir; }
+  Optional<std::string> getResourceDir() const { return ResourceDir; }
   std::string DWARFFile;
-  llvm::Optional<std::string> ResourceDir;
 };
-}
+} // namespace
 
-static Expected<OutputLocation> getOutputFileName(llvm::StringRef InputFile) {
-  if (OutputFileOpt == "-")
-    return OutputLocation(OutputFileOpt);
+static Expected<OutputLocation>
+getOutputFileName(StringRef InputFile, const DsymutilOptions &Options) {
+  if (Options.OutputFile == "-")
+    return OutputLocation(Options.OutputFile);
 
   // When updating, do in place replacement.
-  if (OutputFileOpt.empty() && (Update || !SymbolMap.empty()))
+  if (Options.OutputFile.empty() &&
+      (Options.LinkOpts.Update || !Options.SymbolMap.empty()))
     return OutputLocation(InputFile);
 
   // If a flat dSYM has been requested, things are pretty simple.
- if (FlatOut) { - if (OutputFileOpt.empty()) { + if (Options.Flat) { + if (Options.OutputFile.empty()) { if (InputFile == "-") return OutputLocation{"a.out.dwarf", {}}; return OutputLocation((InputFile + ".dwarf").str()); } - return OutputLocation(OutputFileOpt); + return OutputLocation(Options.OutputFile); } // We need to create/update a dSYM bundle. @@ -317,193 +409,87 @@ static Expected getOutputFileName(llvm::StringRef InputFile) { // Resources/ // DWARF/ // - std::string DwarfFile = - InputFile == "-" ? llvm::StringRef("a.out") : InputFile; - llvm::SmallString<128> Path(OutputFileOpt); + std::string DwarfFile = InputFile == "-" ? StringRef("a.out") : InputFile; + SmallString<128> Path(Options.OutputFile); if (Path.empty()) Path = DwarfFile + ".dSYM"; - if (auto E = createBundleDir(Path)) - return std::move(E); - if (auto E = createPlistFile(DwarfFile, Path)) - return std::move(E); + if (!Options.LinkOpts.NoOutput) { + if (auto E = createBundleDir(Path)) + return std::move(E); + if (auto E = createPlistFile(DwarfFile, Path, Options.Toolchain)) + return std::move(E); + } - llvm::sys::path::append(Path, "Contents", "Resources"); + sys::path::append(Path, "Contents", "Resources"); std::string ResourceDir = Path.str(); - llvm::sys::path::append(Path, "DWARF", llvm::sys::path::filename(DwarfFile)); + sys::path::append(Path, "DWARF", sys::path::filename(DwarfFile)); return OutputLocation(Path.str(), ResourceDir); } -/// Parses the command line options into the LinkOptions struct and performs -/// some sanity checking. Returns an error in case the latter fails. -static Expected getOptions() { - LinkOptions Options; - - Options.Verbose = Verbose; - Options.NoOutput = NoOutput; - Options.NoODR = NoODR; - Options.Minimize = Minimize; - Options.Update = Update; - Options.NoTimestamp = NoTimestamp; - Options.PrependPath = OsoPrependPath; - Options.TheAccelTableKind = AcceleratorTable; - - if (!SymbolMap.empty()) - Options.Update = true; - - if (Assembly) - Options.FileType = OutputFileType::Assembly; - - if (Options.Update && std::find(InputFiles.begin(), InputFiles.end(), "-") != - InputFiles.end()) { - // FIXME: We cannot use stdin for an update because stdin will be - // consumed by the BinaryHolder during the debugmap parsing, and - // then we will want to consume it again in DwarfLinker. If we - // used a unique BinaryHolder object that could cache multiple - // binaries this restriction would go away. - return make_error( - "standard input cannot be used as input for a dSYM update.", - inconvertibleErrorCode()); - } - - if (NumThreads == 0) - Options.Threads = llvm::thread::hardware_concurrency(); - else - Options.Threads = NumThreads; - if (DumpDebugMap || Verbose) - Options.Threads = 1; - - return Options; -} - -/// Return a list of input files. This function has logic for dealing with the -/// special case where we might have dSYM bundles as input. The function -/// returns an error when the directory structure doesn't match that of a dSYM -/// bundle. -static Expected> getInputs(bool DsymAsInput) { - if (!DsymAsInput) - return InputFiles; - - // If we are updating, we might get dSYM bundles as input. - std::vector Inputs; - for (const auto &Input : InputFiles) { - if (!llvm::sys::fs::is_directory(Input)) { - Inputs.push_back(Input); - continue; - } - - // Make sure that we're dealing with a dSYM bundle. 
- SmallString<256> BundlePath(Input); - sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); - if (!llvm::sys::fs::is_directory(BundlePath)) - return make_error( - Input + " is a directory, but doesn't look like a dSYM bundle.", - inconvertibleErrorCode()); - - // Create a directory iterator to iterate over all the entries in the - // bundle. - std::error_code EC; - llvm::sys::fs::directory_iterator DirIt(BundlePath, EC); - llvm::sys::fs::directory_iterator DirEnd; - if (EC) - return errorCodeToError(EC); - - // Add each entry to the list of inputs. - while (DirIt != DirEnd) { - Inputs.push_back(DirIt->path()); - DirIt.increment(EC); - if (EC) - return errorCodeToError(EC); - } - } - return Inputs; -} - int main(int argc, char **argv) { InitLLVM X(argc, argv); + // Parse arguments. + DsymutilOptTable T; + unsigned MAI; + unsigned MAC; + ArrayRef ArgsArr = makeArrayRef(argv + 1, argc - 1); + opt::InputArgList Args = T.ParseArgs(ArgsArr, MAI, MAC); + void *P = (void *)(intptr_t)getOutputFileName; - std::string SDKPath = llvm::sys::fs::getMainExecutable(argv[0], P); - SDKPath = llvm::sys::path::parent_path(SDKPath); - - HideUnrelatedOptions({&DsymCategory, &ColorCategory}); - llvm::cl::ParseCommandLineOptions( - argc, argv, - "manipulate archived DWARF debug symbol files.\n\n" - "dsymutil links the DWARF debug information found in the object files\n" - "for the executable by using debug symbols information\n" - "contained in its symbol table.\n"); - - if (Help) { - PrintHelpMessage(); + std::string SDKPath = sys::fs::getMainExecutable(argv[0], P); + SDKPath = sys::path::parent_path(SDKPath); + + if (Args.hasArg(OPT_help)) { + T.PrintHelp( + outs(), (std::string(argv[0]) + " [options] ").c_str(), + "manipulate archived DWARF debug symbol files.\n\n" + "dsymutil links the DWARF debug information found in the object files\n" + "for the executable by using debug symbols information\n" + "contained in its symbol table.\n", + false); return 0; } - if (Version) { - llvm::cl::PrintVersionMessage(); + if (Args.hasArg(OPT_version)) { + cl::PrintVersionMessage(); return 0; } - auto OptionsOrErr = getOptions(); + auto OptionsOrErr = getOptions(Args); if (!OptionsOrErr) { WithColor::error() << toString(OptionsOrErr.takeError()); return 1; } - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); - - auto InputsOrErr = getInputs(OptionsOrErr->Update); - if (!InputsOrErr) { - WithColor::error() << toString(InputsOrErr.takeError()) << '\n'; - return 1; - } - - if (!FlatOut && OutputFileOpt == "-") { - WithColor::error() << "cannot emit to standard output without --flat\n"; - return 1; - } - - if (InputsOrErr->size() > 1 && FlatOut && !OutputFileOpt.empty()) { - WithColor::error() << "cannot use -o with multiple inputs in flat mode\n"; - return 1; - } - - if (InputFiles.size() > 1 && !SymbolMap.empty() && - !llvm::sys::fs::is_directory(SymbolMap)) { - WithColor::error() << "when unobfuscating multiple files, --symbol-map " - << "needs to point to a directory.\n"; - return 1; - } + auto &Options = *OptionsOrErr; - if (getenv("RC_DEBUG_OPTIONS")) - PaperTrailWarnings = true; + InitializeAllTargetInfos(); + InitializeAllTargetMCs(); + InitializeAllTargets(); + InitializeAllAsmPrinters(); - if (PaperTrailWarnings && InputIsYAMLDebugMap) - WithColor::warning() - << "Paper trail warnings are not supported for YAML input"; - - for (const auto &Arch : ArchFlags) + for (const auto &Arch : Options.Archs) if (Arch != "*" && 
Arch != "all" && - !llvm::object::MachOObjectFile::isValidArch(Arch)) { + !object::MachOObjectFile::isValidArch(Arch)) { WithColor::error() << "unsupported cpu architecture: '" << Arch << "'\n"; return 1; } - SymbolMapLoader SymMapLoader(SymbolMap); + SymbolMapLoader SymMapLoader(Options.SymbolMap); - for (auto &InputFile : *InputsOrErr) { + for (auto &InputFile : Options.InputFiles) { // Dump the symbol table for each input file and requested arch - if (DumpStab) { - if (!dumpStab(InputFile, ArchFlags, OsoPrependPath)) + if (Options.DumpStab) { + if (!dumpStab(InputFile, Options.Archs, Options.LinkOpts.PrependPath)) return 1; continue; } auto DebugMapPtrsOrErr = - parseDebugMap(InputFile, ArchFlags, OsoPrependPath, PaperTrailWarnings, - Verbose, InputIsYAMLDebugMap); + parseDebugMap(InputFile, Options.Archs, Options.LinkOpts.PrependPath, + Options.PaperTrailWarnings, Options.LinkOpts.Verbose, + Options.InputIsYAMLDebugMap); if (auto EC = DebugMapPtrsOrErr.getError()) { WithColor::error() << "cannot parse the debug map for '" << InputFile @@ -511,12 +497,12 @@ int main(int argc, char **argv) { return 1; } - if (OptionsOrErr->Update) { + if (Options.LinkOpts.Update) { // The debug map should be empty. Add one object file corresponding to // the input file. for (auto &Map : *DebugMapPtrsOrErr) Map->addDebugMapObject(InputFile, - llvm::sys::TimePoint()); + sys::TimePoint()); } // Ensure that the debug map is not empty (anymore). @@ -529,26 +515,26 @@ int main(int argc, char **argv) { BinaryHolder BinHolder; unsigned ThreadCount = - std::min(OptionsOrErr->Threads, DebugMapPtrsOrErr->size()); - llvm::ThreadPool Threads(ThreadCount); + std::min(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size()); + ThreadPool Threads(ThreadCount); // If there is more than one link to execute, we need to generate // temporary files. bool NeedsTempFiles = - !DumpDebugMap && (OutputFileOpt != "-") && - (DebugMapPtrsOrErr->size() != 1 || OptionsOrErr->Update); + !Options.DumpDebugMap && (Options.OutputFile != "-") && + (DebugMapPtrsOrErr->size() != 1 || Options.LinkOpts.Update); - llvm::SmallVector TempFiles; + SmallVector TempFiles; std::atomic_char AllOK(1); for (auto &Map : *DebugMapPtrsOrErr) { - if (Verbose || DumpDebugMap) - Map->print(llvm::outs()); + if (Options.LinkOpts.Verbose || Options.DumpDebugMap) + Map->print(outs()); - if (DumpDebugMap) + if (Options.DumpDebugMap) continue; - if (!SymbolMap.empty()) - OptionsOrErr->Translator = SymMapLoader.Load(InputFile, *Map); + if (!Options.SymbolMap.empty()) + Options.LinkOpts.Translator = SymMapLoader.Load(InputFile, *Map); if (Map->begin() == Map->end()) WithColor::warning() @@ -560,12 +546,12 @@ int main(int argc, char **argv) { std::shared_ptr OS; Expected OutputLocationOrErr = - getOutputFileName(InputFile); + getOutputFileName(InputFile, Options); if (!OutputLocationOrErr) { WithColor::error() << toString(OutputLocationOrErr.takeError()); return 1; } - OptionsOrErr->ResourceDir = OutputLocationOrErr->getResourceDir(); + Options.LinkOpts.ResourceDir = OutputLocationOrErr->getResourceDir(); std::string OutputFile = OutputLocationOrErr->DWARFFile; if (NeedsTempFiles) { @@ -583,30 +569,32 @@ int main(int argc, char **argv) { OutputFile = TempFile.TmpName; } else { std::error_code EC; - OS = std::make_shared(NoOutput ? "-" : OutputFile, EC, - sys::fs::OF_None); + OS = std::make_shared( + Options.LinkOpts.NoOutput ? 
"-" : OutputFile, EC, sys::fs::OF_None); if (EC) { WithColor::error() << OutputFile << ": " << EC.message(); return 1; } } + const bool Verify = Options.Verify && !Options.LinkOpts.NoOutput; auto LinkLambda = [&, OutputFile](std::shared_ptr Stream, LinkOptions Options) { AllOK.fetch_and( linkDwarf(*Stream, BinHolder, *Map, std::move(Options))); Stream->flush(); - if (Verify && !NoOutput) - AllOK.fetch_and(verify(OutputFile, Map->getTriple().getArchName())); + if (Verify) + AllOK.fetch_and(verify(OutputFile, Map->getTriple().getArchName(), + Options.Verbose)); }; // FIXME: The DwarfLinker can have some very deep recursion that can max // out the (significantly smaller) stack when using threads. We don't // want this limitation when we only have a single thread. if (ThreadCount == 1) - LinkLambda(OS, *OptionsOrErr); + LinkLambda(OS, Options.LinkOpts); else - Threads.async(LinkLambda, OS, *OptionsOrErr); + Threads.async(LinkLambda, OS, Options.LinkOpts); } Threads.wait(); @@ -615,14 +603,15 @@ int main(int argc, char **argv) { return 1; if (NeedsTempFiles) { - Expected OutputLocationOrErr = getOutputFileName(InputFile); + Expected OutputLocationOrErr = + getOutputFileName(InputFile, Options); if (!OutputLocationOrErr) { WithColor::error() << toString(OutputLocationOrErr.takeError()); return 1; } if (!MachOUtils::generateUniversalBinary(TempFiles, OutputLocationOrErr->DWARFFile, - *OptionsOrErr, SDKPath)) + Options.LinkOpts, SDKPath)) return 1; } } diff --git a/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/tools/llvm-exegesis/lib/SnippetRepetitor.cpp index 954471659e59..860db92a81da 100644 --- a/tools/llvm-exegesis/lib/SnippetRepetitor.cpp +++ b/tools/llvm-exegesis/lib/SnippetRepetitor.cpp @@ -79,7 +79,8 @@ class LoopSnippetRepetitor : public SnippetRepetitor { for (const auto &LiveIn : Entry.MBB->liveins()) Loop.MBB->addLiveIn(LiveIn); Loop.addInstructions(Instructions); - ET.decrementLoopCounterAndLoop(*Loop.MBB, State.getInstrInfo()); + ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB, + State.getInstrInfo()); // Set up the exit basic block. 
Loop.MBB->addSuccessor(Exit.MBB, llvm::BranchProbability::getZero()); diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h index 4b0c9d17dd7f..70313a7a2f7a 100644 --- a/tools/llvm-exegesis/lib/Target.h +++ b/tools/llvm-exegesis/lib/Target.h @@ -95,7 +95,8 @@ class ExegesisTarget { } // Adds the code to decrement the loop counter and - virtual void decrementLoopCounterAndLoop(MachineBasicBlock &MBB, + virtual void decrementLoopCounterAndJump(MachineBasicBlock &MBB, + MachineBasicBlock &TargetMBB, const llvm::MCInstrInfo &MII) const { llvm_unreachable("decrementLoopCounterAndBranch() requires " "getLoopCounterRegister() > 0"); diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp index bf008e8bbc7a..ce66610891d0 100644 --- a/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/tools/llvm-exegesis/lib/X86/Target.cpp @@ -448,7 +448,8 @@ class ExegesisX86Target : public ExegesisTarget { void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg, unsigned Offset) const override; - void decrementLoopCounterAndLoop(MachineBasicBlock &MBB, + void decrementLoopCounterAndJump(MachineBasicBlock &MBB, + MachineBasicBlock &TargetMBB, const llvm::MCInstrInfo &MII) const override; std::vector setRegTo(const llvm::MCSubtargetInfo &STI, @@ -558,14 +559,15 @@ void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT, SetOp(MemOpIdx + 4, MCOperand::createReg(0)); // Segment } -void ExegesisX86Target::decrementLoopCounterAndLoop( - MachineBasicBlock &MBB, const llvm::MCInstrInfo &MII) const { +void ExegesisX86Target::decrementLoopCounterAndJump( + MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB, + const llvm::MCInstrInfo &MII) const { BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8)) .addDef(kLoopCounterReg) .addUse(kLoopCounterReg) .addImm(-1); BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1)) - .addMBB(&MBB) + .addMBB(&TargetMBB) .addImm(X86::COND_NE); } diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp index 1b3e8f4851df..2a5918486853 100644 --- a/tools/llvm-readobj/ELFDumper.cpp +++ b/tools/llvm-readobj/ELFDumper.cpp @@ -2315,7 +2315,7 @@ MipsGOTParser::MipsGOTParser(const ELFO *Obj, StringRef FileName, if (IsStatic) { GotSec = findSectionByName(*Obj, FileName, ".got"); if (!GotSec) - reportError(createError("Cannot find .got section"), FileName); + return; ArrayRef Content = unwrapOrError(FileName, Obj->getSectionContents(GotSec)); diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 5919a7eed3e3..dfcda9996310 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -237,23 +237,6 @@ namespace opts { cl::alias ArchSpecifcInfoShort("A", cl::desc("Alias for --arch-specific"), cl::aliasopt(ArchSpecificInfo), cl::NotHidden); - // --mips-plt-got - cl::opt - MipsPLTGOT("mips-plt-got", - cl::desc("Display the MIPS GOT and PLT GOT sections")); - - // --mips-abi-flags - cl::opt MipsABIFlags("mips-abi-flags", - cl::desc("Display the MIPS.abiflags section")); - - // --mips-reginfo - cl::opt MipsReginfo("mips-reginfo", - cl::desc("Display the MIPS .reginfo section")); - - // --mips-options - cl::opt MipsOptions("mips-options", - cl::desc("Display the MIPS .MIPS.options section")); - // --coff-imports cl::opt COFFImports("coff-imports", cl::desc("Display the PE/COFF import table")); @@ -519,18 +502,15 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer, if (Obj->isELF()) { if (opts::ELFLinkerOptions) 
Dumper->printELFLinkerOptions(); - if (opts::ArchSpecificInfo) + if (opts::ArchSpecificInfo) { if (Obj->getArch() == llvm::Triple::arm) Dumper->printAttributes(); - if (isMipsArch(Obj->getArch())) { - if (opts::MipsPLTGOT) - Dumper->printMipsPLTGOT(); - if (opts::MipsABIFlags) + else if (isMipsArch(Obj->getArch())) { Dumper->printMipsABIFlags(); - if (opts::MipsReginfo) - Dumper->printMipsReginfo(); - if (opts::MipsOptions) Dumper->printMipsOptions(); + Dumper->printMipsReginfo(); + Dumper->printMipsPLTGOT(); + } } if (opts::SectionGroups) Dumper->printGroupSections(); diff --git a/tools/obj2yaml/elf2yaml.cpp b/tools/obj2yaml/elf2yaml.cpp index c4b6eb79d18c..2c17b9570e1b 100644 --- a/tools/obj2yaml/elf2yaml.cpp +++ b/tools/obj2yaml/elf2yaml.cpp @@ -41,6 +41,7 @@ class ELFDumper { Expected getUniquedSymbolName(const Elf_Sym *Sym, StringRef StrTable, const Elf_Shdr *SymTab); + Expected getSymbolName(uint32_t SymtabNdx, uint32_t SymbolNdx); const object::ELFFile &Obj; ArrayRef ShndxTable; @@ -56,6 +57,7 @@ class ELFDumper { Error dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab, ELFYAML::Relocation &R); + Expected dumpAddrsigSection(const Elf_Shdr *Shdr); Expected dumpDynamicSection(const Elf_Shdr *Shdr); Expected dumpRelocSection(const Elf_Shdr *Shdr); Expected @@ -284,6 +286,13 @@ template Expected ELFDumper::dump() { Y->Sections.emplace_back(*SecOrErr); break; } + case ELF::SHT_LLVM_ADDRSIG: { + Expected SecOrErr = dumpAddrsigSection(&Sec); + if (!SecOrErr) + return SecOrErr.takeError(); + Y->Sections.emplace_back(*SecOrErr); + break; + } case ELF::SHT_NULL: { // We only dump the SHT_NULL section at index 0 when it // has at least one non-null field, because yaml2obj @@ -519,6 +528,46 @@ ELFDumper::dumpStackSizesSection(const Elf_Shdr *Shdr) { return S.release(); } +template +Expected +ELFDumper::dumpAddrsigSection(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); + + auto ContentOrErr = Obj.getSectionContents(Shdr); + if (!ContentOrErr) + return ContentOrErr.takeError(); + + ArrayRef Content = *ContentOrErr; + DataExtractor::Cursor Cur(0); + DataExtractor Data(Content, Obj.isLE(), /*AddressSize=*/0); + std::vector Symbols; + while (Cur && Cur.tell() < Content.size()) { + uint64_t SymNdx = Data.getULEB128(Cur); + if (!Cur) + break; + + Expected SymbolName = getSymbolName(Shdr->sh_link, SymNdx); + if (!SymbolName || SymbolName->empty()) { + consumeError(SymbolName.takeError()); + Symbols.emplace_back(SymNdx); + continue; + } + + Symbols.emplace_back(*SymbolName); + } + + if (Cur) { + S->Symbols = std::move(Symbols); + return S.release(); + } + + consumeError(Cur.takeError()); + S->Content = yaml::BinaryRef(Content); + return S.release(); +} + template Expected ELFDumper::dumpDynamicSection(const Elf_Shdr *Shdr) { @@ -791,25 +840,31 @@ ELFDumper::dumpVerneedSection(const Elf_Shdr *Shdr) { } template -Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { - auto S = std::make_unique(); - if (Error E = dumpCommonSection(Shdr, *S)) - return std::move(E); - - auto SymtabOrErr = Obj.getSection(Shdr->sh_link); +Expected ELFDumper::getSymbolName(uint32_t SymtabNdx, + uint32_t SymbolNdx) { + auto SymtabOrErr = Obj.getSection(SymtabNdx); if (!SymtabOrErr) return SymtabOrErr.takeError(); - // Get symbol with index sh_info which name is the signature of the group. 
+ const Elf_Shdr *Symtab = *SymtabOrErr; - auto SymOrErr = Obj.getSymbol(Symtab, Shdr->sh_info); + auto SymOrErr = Obj.getSymbol(Symtab, SymbolNdx); if (!SymOrErr) return SymOrErr.takeError(); + auto StrTabOrErr = Obj.getStringTableForSymtab(*Symtab); if (!StrTabOrErr) return StrTabOrErr.takeError(); + return getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); +} + +template +Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); - Expected SymbolName = - getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); + // Get symbol with index sh_info. This symbol's name is the signature of the group. + Expected SymbolName = getSymbolName(Shdr->sh_link, Shdr->sh_info); if (!SymbolName) return SymbolName.takeError(); S->Signature = *SymbolName; diff --git a/unittests/Target/AArch64/TestStackOffset.cpp b/unittests/Target/AArch64/TestStackOffset.cpp index 240cec9f2d0b..c85135ef6605 100644 --- a/unittests/Target/AArch64/TestStackOffset.cpp +++ b/unittests/Target/AArch64/TestStackOffset.cpp @@ -20,6 +20,15 @@ TEST(StackOffset, MixedSize) { StackOffset C(2, MVT::v4i64); EXPECT_EQ(64, C.getBytes()); + + StackOffset D(2, MVT::nxv4i64); + EXPECT_EQ(64, D.getScalableBytes()); + + StackOffset E(2, MVT::v4i64); + EXPECT_EQ(0, E.getScalableBytes()); + + StackOffset F(2, MVT::nxv4i64); + EXPECT_EQ(0, F.getBytes()); } TEST(StackOffset, Add) { @@ -31,6 +40,11 @@ TEST(StackOffset, Add) { StackOffset D(1, MVT::i32); D += A; EXPECT_EQ(12, D.getBytes()); + + StackOffset E(1, MVT::nxv1i32); + StackOffset F = C + E; + EXPECT_EQ(12, F.getBytes()); + EXPECT_EQ(4, F.getScalableBytes()); } TEST(StackOffset, Sub) { @@ -42,6 +56,12 @@ TEST(StackOffset, Sub) { StackOffset D(1, MVT::i64); D -= A; EXPECT_EQ(0, D.getBytes()); + + C += StackOffset(2, MVT::nxv1i32); + StackOffset E = StackOffset(1, MVT::nxv1i32); + StackOffset F = C - E; + EXPECT_EQ(4, F.getBytes()); + EXPECT_EQ(4, F.getScalableBytes()); } TEST(StackOffset, isZero) { @@ -49,12 +69,63 @@ TEST(StackOffset, isZero) { StackOffset B(0, MVT::i32); EXPECT_TRUE(!A); EXPECT_TRUE(!(A + B)); + + StackOffset C(0, MVT::nxv1i32); + EXPECT_TRUE(!(A + C)); + + StackOffset D(1, MVT::nxv1i32); + EXPECT_FALSE(!(A + D)); +} + +TEST(StackOffset, isValid) { + EXPECT_FALSE(StackOffset(1, MVT::nxv8i1).isValid()); + EXPECT_TRUE(StackOffset(2, MVT::nxv8i1).isValid()); + +#ifndef NDEBUG +#ifdef GTEST_HAS_DEATH_TEST + EXPECT_DEATH(StackOffset(1, MVT::i1), + "Offset type is not a multiple of bytes"); + EXPECT_DEATH(StackOffset(1, MVT::nxv1i1), + "Offset type is not a multiple of bytes"); +#endif // defined GTEST_HAS_DEATH_TEST +#endif // not defined NDEBUG } TEST(StackOffset, getForFrameOffset) { StackOffset A(1, MVT::i64); StackOffset B(1, MVT::i32); - int64_t ByteSized; - (A + B).getForFrameOffset(ByteSized); + StackOffset C(1, MVT::nxv4i32); + + // If all offsets can be materialized with only ADDVL, + // make sure PLSized is 0. + int64_t ByteSized, VLSized, PLSized; + (A + B + C).getForFrameOffset(ByteSized, PLSized, VLSized); EXPECT_EQ(12, ByteSized); + EXPECT_EQ(1, VLSized); + EXPECT_EQ(0, PLSized); + + // If we need an ADDPL to materialize the offset, and the number of scalable + // bytes fits the ADDPL immediate, fold the scalable bytes to fit in PLSized. 
+ StackOffset D(1, MVT::nxv16i1); + (C + D).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(0, VLSized); + EXPECT_EQ(9, PLSized); + + StackOffset E(4, MVT::nxv4i32); + StackOffset F(1, MVT::nxv16i1); + (E + F).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(0, VLSized); + EXPECT_EQ(33, PLSized); + + // If the offset requires an ADDPL instruction to materialize, and would + // require more than two instructions, decompose it into both + // ADDVL (n x 16 bytes) and ADDPL (n x 2 bytes) instructions. + StackOffset G(8, MVT::nxv4i32); + StackOffset H(1, MVT::nxv16i1); + (G + H).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(8, VLSized); + EXPECT_EQ(1, PLSized); } diff --git a/unittests/Transforms/Utils/LocalTest.cpp b/unittests/Transforms/Utils/LocalTest.cpp index ff2eda1cec03..1f67a1ec84c7 100644 --- a/unittests/Transforms/Utils/LocalTest.cpp +++ b/unittests/Transforms/Utils/LocalTest.cpp @@ -867,6 +867,36 @@ TEST(Local, RemoveUnreachableBlocks) { bb2: br label %bb1 } + + declare i32 @__gxx_personality_v0(...) + + define void @invoke_terminator() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + entry: + br i1 undef, label %invoke.block, label %exit + + invoke.block: + %cond = invoke zeroext i1 @invokable() + to label %continue.block unwind label %lpad.block + + continue.block: + br i1 %cond, label %if.then, label %if.end + + if.then: + unreachable + + if.end: + unreachable + + lpad.block: + %lp = landingpad { i8*, i32 } + catch i8* null + br label %exit + + exit: + ret void + } + + declare i1 @invokable() )"); auto runEager = [&](Function &F, DominatorTree *DT) { @@ -890,12 +920,14 @@ TEST(Local, RemoveUnreachableBlocks) { runWithDomTree(*M, "br_self_loop", runEager); runWithDomTree(*M, "br_constant", runEager); runWithDomTree(*M, "br_loop", runEager); + runWithDomTree(*M, "invoke_terminator", runEager); // Test removeUnreachableBlocks under Lazy UpdateStrategy. 
runWithDomTree(*M, "br_simple", runLazy); runWithDomTree(*M, "br_self_loop", runLazy); runWithDomTree(*M, "br_constant", runLazy); runWithDomTree(*M, "br_loop", runLazy); + runWithDomTree(*M, "invoke_terminator", runLazy); M = parseIR(C, R"( diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt index c88365a2b8ce..d97f9359f54d 100644 --- a/utils/TableGen/CMakeLists.txt +++ b/utils/TableGen/CMakeLists.txt @@ -24,6 +24,7 @@ add_tablegen(llvm-tblgen LLVM ExegesisEmitter.cpp FastISelEmitter.cpp FixedLenDecoderEmitter.cpp + GICombinerEmitter.cpp GlobalISelEmitter.cpp InfoByHwMode.cpp InstrInfoEmitter.cpp diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp index cb05f78fba41..f12d7d484a8e 100644 --- a/utils/TableGen/CodeGenSchedule.cpp +++ b/utils/TableGen/CodeGenSchedule.cpp @@ -1083,9 +1083,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { if (RWD->getValueAsDef("SchedModel") == RWModelDef && RWModelDef->getValueAsBit("FullInstRWOverlapCheck")) { for (Record *Inst : InstDefs) { - PrintFatalError(InstRWDef->getLoc(), "Overlapping InstRW def " + - Inst->getName() + " also matches " + - RWD->getValue("Instrs")->getValue()->getAsString()); + PrintFatalError + (InstRWDef->getLoc(), + "Overlapping InstRW definition for \"" + + Inst->getName() + + "\" also matches previous \"" + + RWD->getValue("Instrs")->getValue()->getAsString() + + "\"."); } } } @@ -1115,9 +1119,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) { for (Record *OldRWDef : SchedClasses[OldSCIdx].InstRWs) { if (OldRWDef->getValueAsDef("SchedModel") == RWModelDef) { for (Record *InstDef : InstDefs) { - PrintFatalError(OldRWDef->getLoc(), "Overlapping InstRW def " + - InstDef->getName() + " also matches " + - OldRWDef->getValue("Instrs")->getValue()->getAsString()); + PrintFatalError + (InstRWDef->getLoc(), + "Overlapping InstRW definition for \"" + + InstDef->getName() + + "\" also matches previous \"" + + OldRWDef->getValue("Instrs")->getValue()->getAsString() + + "\"."); } } assert(OldRWDef != InstRWDef && diff --git a/utils/TableGen/GICombinerEmitter.cpp b/utils/TableGen/GICombinerEmitter.cpp new file mode 100644 index 000000000000..a85462b5aa89 --- /dev/null +++ b/utils/TableGen/GICombinerEmitter.cpp @@ -0,0 +1,100 @@ +//===- GlobalCombinerEmitter.cpp - Generate a combiner --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Generate a combiner implementation for GlobalISel from a declarative +/// syntax +/// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Timer.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/TableGenBackend.h" +#include "CodeGenTarget.h" + +using namespace llvm; + +#define DEBUG_TYPE "gicombiner-emitter" + +cl::OptionCategory + GICombinerEmitterCat("Options for -gen-global-isel-combiner"); +static cl::list + SelectedCombiners("combiners", cl::desc("Emit the specified combiners"), + cl::cat(GICombinerEmitterCat), cl::CommaSeparated); +namespace { +class GICombinerEmitter { + StringRef Name; + Record *Combiner; +public: + explicit GICombinerEmitter(RecordKeeper &RK, StringRef Name, + Record *Combiner); + ~GICombinerEmitter() {} + + StringRef getClassName() const { + return Combiner->getValueAsString("Classname"); + } + void run(raw_ostream &OS); + +}; + +GICombinerEmitter::GICombinerEmitter(RecordKeeper &RK, StringRef Name, + Record *Combiner) + : Name(Name), Combiner(Combiner) {} + +void GICombinerEmitter::run(raw_ostream &OS) { + NamedRegionTimer T("Emit", "Time spent emitting the combiner", + "Code Generation", "Time spent generating code", + TimeRegions); + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_DEPS\n" + << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_DEPS\n\n"; + + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_H\n" + << "class " << getClassName() << " {\n" + << "public:\n" + << " bool tryCombineAll(\n" + << " GISelChangeObserver &Observer,\n" + << " MachineInstr &MI,\n" + << " MachineIRBuilder &B) const;\n" + << "};\n"; + OS << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_H\n\n"; + + OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_CPP\n" + << "\n" + << "bool " << getClassName() << "::tryCombineAll(\n" + << " GISelChangeObserver &Observer,\n" + << " MachineInstr &MI,\n" + << " MachineIRBuilder &B) const {\n" + << " MachineBasicBlock *MBB = MI.getParent();\n" + << " MachineFunction *MF = MBB->getParent();\n" + << " MachineRegisterInfo &MRI = MF->getRegInfo();\n" + << " (void)MBB; (void)MF; (void)MRI;\n\n"; + OS << "\n return false;\n" + << "}\n" + << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_CPP\n"; +} + +} // end anonymous namespace + +//===----------------------------------------------------------------------===// + +namespace llvm { +void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS) { + CodeGenTarget Target(RK); + emitSourceFileHeader("Global Combiner", OS); + + if (SelectedCombiners.empty()) + PrintFatalError("No combiners selected with -combiners"); + for (const auto &Combiner : SelectedCombiners) { + Record *CombinerDef = RK.getDef(Combiner); + if (!CombinerDef) + PrintFatalError("Could not find " + Combiner); + GICombinerEmitter(RK, Combiner, CombinerDef).run(OS); + } +} + +} // namespace llvm diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp index 29ef46fd7fcc..817663b06e29 100644 --- a/utils/TableGen/TableGen.cpp +++ b/utils/TableGen/TableGen.cpp @@ -49,6 +49,7 @@ enum ActionType { GenAttributes, GenSearchableTables, GenGlobalISel, + GenGICombiner, GenX86EVEX2VEXTables, GenX86FoldTables, GenRegisterBank, @@ -62,75 +63,73 @@ bool TimeRegions = false; } // end namespace llvm namespace { - cl::opt - 
Action(cl::desc("Action to perform:"), - cl::values(clEnumValN(PrintRecords, "print-records", - "Print all records to stdout (default)"), - clEnumValN(DumpJSON, "dump-json", - "Dump all records as machine-readable JSON"), - clEnumValN(GenEmitter, "gen-emitter", - "Generate machine code emitter"), - clEnumValN(GenRegisterInfo, "gen-register-info", - "Generate registers and register classes info"), - clEnumValN(GenInstrInfo, "gen-instr-info", - "Generate instruction descriptions"), - clEnumValN(GenInstrDocs, "gen-instr-docs", - "Generate instruction documentation"), - clEnumValN(GenCallingConv, "gen-callingconv", - "Generate calling convention descriptions"), - clEnumValN(GenAsmWriter, "gen-asm-writer", - "Generate assembly writer"), - clEnumValN(GenDisassembler, "gen-disassembler", - "Generate disassembler"), - clEnumValN(GenPseudoLowering, "gen-pseudo-lowering", - "Generate pseudo instruction lowering"), - clEnumValN(GenCompressInst, "gen-compress-inst-emitter", - "Generate RISCV compressed instructions."), - clEnumValN(GenAsmMatcher, "gen-asm-matcher", - "Generate assembly instruction matcher"), - clEnumValN(GenDAGISel, "gen-dag-isel", - "Generate a DAG instruction selector"), - clEnumValN(GenDFAPacketizer, "gen-dfa-packetizer", - "Generate DFA Packetizer for VLIW targets"), - clEnumValN(GenFastISel, "gen-fast-isel", - "Generate a \"fast\" instruction selector"), - clEnumValN(GenSubtarget, "gen-subtarget", - "Generate subtarget enumerations"), - clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums", - "Generate intrinsic enums"), - clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl", - "Generate intrinsic information"), - clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums", - "Generate target intrinsic enums"), - clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl", - "Generate target intrinsic information"), - clEnumValN(PrintEnums, "print-enums", - "Print enum values for a class"), - clEnumValN(PrintSets, "print-sets", - "Print expanded sets for testing DAG exprs"), - clEnumValN(GenOptParserDefs, "gen-opt-parser-defs", - "Generate option definitions"), - clEnumValN(GenCTags, "gen-ctags", - "Generate ctags-compatible index"), - clEnumValN(GenAttributes, "gen-attrs", - "Generate attributes"), - clEnumValN(GenSearchableTables, "gen-searchable-tables", - "Generate generic binary-searchable table"), - clEnumValN(GenGlobalISel, "gen-global-isel", - "Generate GlobalISel selector"), - clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables", - "Generate X86 EVEX to VEX compress tables"), - clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", - "Generate X86 fold tables"), - clEnumValN(GenRegisterBank, "gen-register-bank", - "Generate registers bank descriptions"), - clEnumValN(GenExegesis, "gen-exegesis", - "Generate llvm-exegesis tables"))); +cl::opt Action( + cl::desc("Action to perform:"), + cl::values( + clEnumValN(PrintRecords, "print-records", + "Print all records to stdout (default)"), + clEnumValN(DumpJSON, "dump-json", + "Dump all records as machine-readable JSON"), + clEnumValN(GenEmitter, "gen-emitter", "Generate machine code emitter"), + clEnumValN(GenRegisterInfo, "gen-register-info", + "Generate registers and register classes info"), + clEnumValN(GenInstrInfo, "gen-instr-info", + "Generate instruction descriptions"), + clEnumValN(GenInstrDocs, "gen-instr-docs", + "Generate instruction documentation"), + clEnumValN(GenCallingConv, "gen-callingconv", + "Generate calling convention descriptions"), + clEnumValN(GenAsmWriter, "gen-asm-writer", "Generate assembly writer"), + 
clEnumValN(GenDisassembler, "gen-disassembler",
+                   "Generate disassembler"),
+        clEnumValN(GenPseudoLowering, "gen-pseudo-lowering",
+                   "Generate pseudo instruction lowering"),
+        clEnumValN(GenCompressInst, "gen-compress-inst-emitter",
+                   "Generate RISCV compressed instructions."),
+        clEnumValN(GenAsmMatcher, "gen-asm-matcher",
+                   "Generate assembly instruction matcher"),
+        clEnumValN(GenDAGISel, "gen-dag-isel",
+                   "Generate a DAG instruction selector"),
+        clEnumValN(GenDFAPacketizer, "gen-dfa-packetizer",
+                   "Generate DFA Packetizer for VLIW targets"),
+        clEnumValN(GenFastISel, "gen-fast-isel",
+                   "Generate a \"fast\" instruction selector"),
+        clEnumValN(GenSubtarget, "gen-subtarget",
+                   "Generate subtarget enumerations"),
+        clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums",
+                   "Generate intrinsic enums"),
+        clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl",
+                   "Generate intrinsic information"),
+        clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums",
+                   "Generate target intrinsic enums"),
+        clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl",
+                   "Generate target intrinsic information"),
+        clEnumValN(PrintEnums, "print-enums", "Print enum values for a class"),
+        clEnumValN(PrintSets, "print-sets",
+                   "Print expanded sets for testing DAG exprs"),
+        clEnumValN(GenOptParserDefs, "gen-opt-parser-defs",
+                   "Generate option definitions"),
+        clEnumValN(GenCTags, "gen-ctags", "Generate ctags-compatible index"),
+        clEnumValN(GenAttributes, "gen-attrs", "Generate attributes"),
+        clEnumValN(GenSearchableTables, "gen-searchable-tables",
+                   "Generate generic binary-searchable table"),
+        clEnumValN(GenGlobalISel, "gen-global-isel",
+                   "Generate GlobalISel selector"),
+        clEnumValN(GenGICombiner, "gen-global-isel-combiner",
+                   "Generate GlobalISel combiner"),
+        clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables",
+                   "Generate X86 EVEX to VEX compress tables"),
+        clEnumValN(GenX86FoldTables, "gen-x86-fold-tables",
+                   "Generate X86 fold tables"),
+        clEnumValN(GenRegisterBank, "gen-register-bank",
+                   "Generate registers bank descriptions"),
+        clEnumValN(GenExegesis, "gen-exegesis",
+                   "Generate llvm-exegesis tables")));
 
-  cl::OptionCategory PrintEnumsCat("Options for -print-enums");
-  cl::opt<std::string>
-  Class("class", cl::desc("Print Enum list for this class"),
-        cl::value_desc("class name"), cl::cat(PrintEnumsCat));
+cl::OptionCategory PrintEnumsCat("Options for -print-enums");
+cl::opt<std::string> Class("class", cl::desc("Print Enum list for this class"),
+                           cl::value_desc("class name"),
+                           cl::cat(PrintEnumsCat));
 
 cl::opt<bool> TimeRegionsOpt("time-regions",
@@ -235,6 +234,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   case GenGlobalISel:
     EmitGlobalISel(Records, OS);
     break;
+  case GenGICombiner:
+    EmitGICombiner(Records, OS);
+    break;
   case GenRegisterBank:
     EmitRegisterBank(Records, OS);
     break;
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index 135ec65c0f95..9c21ef354407 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -85,6 +85,7 @@ void EmitCTags(RecordKeeper &RK, raw_ostream &OS);
 void EmitAttributes(RecordKeeper &RK, raw_ostream &OS);
 void EmitSearchableTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS);
+void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS);
diff --git a/utils/UpdateTestChecks/asm.py b/utils/UpdateTestChecks/asm.py
index 1eb354d8a46a..81556d65802c 100644
--- a/utils/UpdateTestChecks/asm.py
+++ b/utils/UpdateTestChecks/asm.py
@@ -58,6 +58,12 @@ class string:
     # .Lfunc_end0: (mips64 - NewABI)
     flags=(re.M | re.S))
 
+ASM_FUNCTION_MSP430_RE = re.compile(
+    r'^_?(?P<func>[^:]+):[ \t]*;+[ \t]*@(?P=func)\n[^:]*?'
+    r'(?P<body>.*?)\n'
+    r'(\$|\.L)func_end[0-9]+:\n',  # $func_end0:
+    flags=(re.M | re.S))
+
 ASM_FUNCTION_PPC_RE = re.compile(
     r'^_?(?P<func>[^:]+):[ \t]*#+[ \t]*@(?P=func)\n'
     r'.*?'
@@ -231,6 +237,16 @@ def scrub_asm_mips(asm, args):
   asm = common.SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm)
   return asm
 
+def scrub_asm_msp430(asm, args):
+  # Scrub runs of whitespace out of the assembly, but leave the leading
+  # whitespace in place.
+  asm = common.SCRUB_WHITESPACE_RE.sub(r' ', asm)
+  # Expand the tabs used for indentation.
+  asm = string.expandtabs(asm, 2)
+  # Strip trailing whitespace.
+  asm = common.SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm)
+  return asm
+
 def scrub_asm_riscv(asm, args):
   # Scrub runs of whitespace out of the assembly, but leave the leading
   # whitespace in place.
@@ -315,6 +331,7 @@ def build_function_body_dictionary_for_triple(args, raw_tool_output, triple, pre
       'thumbv5-macho': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE),
       'thumbv7-apple-ios' : (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_IOS_RE),
       'mips': (scrub_asm_mips, ASM_FUNCTION_MIPS_RE),
+      'msp430': (scrub_asm_msp430, ASM_FUNCTION_MSP430_RE),
       'ppc32': (scrub_asm_powerpc, ASM_FUNCTION_PPC_RE),
       'powerpc': (scrub_asm_powerpc, ASM_FUNCTION_PPC_RE),
       'riscv32': (scrub_asm_riscv, ASM_FUNCTION_RISCV_RE),
diff --git a/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn b/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
index a467d4947bff..5d36602f97ef 100644
--- a/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
+++ b/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn
@@ -18,6 +18,7 @@ static_library("cppcoreguidelines") {
   sources = [
     "AvoidGotoCheck.cpp",
     "CppCoreGuidelinesTidyModule.cpp",
+    "InitVariablesCheck.cpp",
     "InterfacesGlobalInitCheck.cpp",
     "MacroUsageCheck.cpp",
     "NarrowingConversionsCheck.cpp",
diff --git a/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
index b05e0891d124..0c27c11e2e0b 100644
--- a/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
+++ b/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
@@ -30,6 +30,15 @@ tablegen("AArch64GenGlobalISel") {
   td_file = "AArch64.td"
 }
 
+tablegen("AArch64GenGICombiner") {
+  visibility = [ ":LLVMAArch64CodeGen" ]
+  args = [
+    "-gen-global-isel-combiner",
+    "-combiners=AArch64PreLegalizerCombinerHelper",
+  ]
+  td_file = "AArch64.td"
+}
+
 tablegen("AArch64GenMCPseudoLowering") {
   visibility = [ ":LLVMAArch64CodeGen" ]
   args = [ "-gen-pseudo-lowering" ]
@@ -48,6 +57,7 @@ static_library("LLVMAArch64CodeGen") {
   deps = [
     ":AArch64GenCallingConv",
     ":AArch64GenDAGISel",
     ":AArch64GenFastISel",
+    ":AArch64GenGICombiner",
     ":AArch64GenGlobalISel",
     ":AArch64GenMCPseudoLowering",
     ":AArch64GenRegisterBank",
diff --git a/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
index d6c256b52b43..ec7f3d81cba9 100644
--- a/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
+++ b/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn
@@ -26,5 +26,6 @@ static_library("Instrumentation") {
     "PoisonChecking.cpp",
     "SanitizerCoverage.cpp",
"ThreadSanitizer.cpp", + "ValueProfileCollector.cpp", ] } diff --git a/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn b/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn index 721b4c4be115..ee16b0a3a954 100644 --- a/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn +++ b/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn @@ -1,9 +1,18 @@ +import("//llvm/utils/TableGen/tablegen.gni") + +tablegen("Options") { + visibility = [ ":dsymutil" ] + args = [ "-gen-opt-parser-defs" ] +} + executable("dsymutil") { deps = [ + ":Options", "//llvm/lib/CodeGen/AsmPrinter", "//llvm/lib/DebugInfo/DWARF", "//llvm/lib/MC", "//llvm/lib/Object", + "//llvm/lib/Option", "//llvm/lib/Support", "//llvm/lib/Target", "//llvm/lib/Target:TargetsToBuild", diff --git a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index 01219543d2db..9f5043faeed8 100644 --- a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn +++ b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn @@ -30,6 +30,7 @@ executable("llvm-tblgen") { "ExegesisEmitter.cpp", "FastISelEmitter.cpp", "FixedLenDecoderEmitter.cpp", + "GICombinerEmitter.cpp", "GlobalISelEmitter.cpp", "InfoByHwMode.cpp", "InstrDocsEmitter.cpp", diff --git a/utils/llvm-locstats/CMakeLists.txt b/utils/llvm-locstats/CMakeLists.txt index aa5aeca149f0..a919023e141e 100644 --- a/utils/llvm-locstats/CMakeLists.txt +++ b/utils/llvm-locstats/CMakeLists.txt @@ -1,7 +1,12 @@ if (LLVM_BUILD_UTILS AND LLVM_BUILD_TOOLS) - add_custom_target(llvm-locstats ALL - COMMAND ${CMAKE_COMMAND} -E copy ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + add_custom_command( + OUTPUT ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + DEPENDS ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py + COMMAND ${CMAKE_COMMAND} -E copy ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats COMMENT "Copying llvm-locstats into ${LLVM_TOOLS_BINARY_DIR}" ) + add_custom_target(llvm-locstats ALL + DEPENDS ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats + ) set_target_properties(llvm-locstats PROPERTIES FOLDER "Tools") endif()