From 8db4d72fa53d8f7751ea2ec0b0db84e59e36cd95 Mon Sep 17 00:00:00 2001
From: Piotr Sobczak
Date: Wed, 2 Oct 2019 17:22:36 +0000
Subject: [PATCH 01/82] [AMDGPU] Extend buffer intrinsics with swizzling

Summary:
Extend the cachepolicy operand in the new VMEM buffer intrinsics
to supply information on whether the buffer data is swizzled.
Also propagate this information to MIR.

Intrinsics updated:

int_amdgcn_raw_buffer_load
int_amdgcn_raw_buffer_load_format
int_amdgcn_raw_buffer_store
int_amdgcn_raw_buffer_store_format
int_amdgcn_raw_tbuffer_load
int_amdgcn_raw_tbuffer_store
int_amdgcn_struct_buffer_load
int_amdgcn_struct_buffer_load_format
int_amdgcn_struct_buffer_store
int_amdgcn_struct_buffer_store_format
int_amdgcn_struct_tbuffer_load
int_amdgcn_struct_tbuffer_store

Furthermore, disable merging of VMEM buffer instructions in the
SI Load/Store optimizer if the "swizzled" bit on the instruction
is set. The default value of the bit is 0, meaning that the data
in the buffer is linear and buffer instructions can be merged.

This commit does not change the generated code. In the future,
however, front-ends will be expected to use buffer intrinsics
with the "swizzled" bit set correctly.

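For illustration, a minimal sketch (not part of this patch; the function
name and constants are invented) of a raw buffer load that a front-end
could emit for a swizzled buffer. The final i32 operand packs glc in
bit 0, slc in bit 1, dlc in bit 2 and the new swz in bit 3, so an
auxiliary value of 8 sets only the "swizzled" bit:

  define amdgpu_ps float @swizzled_load(<4 x i32> inreg %rsrc, i32 %voffset) {
    ; aux = 8 -> glc=0, slc=0, dlc=0, swz=1
    %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 0, i32 8)
    ret float %val
  }
  declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32)
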
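Similarly, a sketch of the effect on the SI Load/Store optimizer (names
again invented): the two loads below touch adjacent dwords and would
otherwise be candidates for merging into a single x2 load, but with the
swz bit set they must stay separate; with an auxiliary value of 0 the
pair would remain mergeable.

  define amdgpu_ps float @swizzled_not_merged(<4 x i32> inreg %rsrc) {
    ; Adjacent offsets 0 and 4, both with aux = 8 (swz): no merging.
    %a = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 0, i32 0, i32 8)
    %b = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8)
    %sum = fadd float %a, %b
    ret float %sum
  }
  declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32)
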
Reviewers: arsenm, nhaehnle, tpr

Reviewed By: nhaehnle

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard,
tpr, t-tye, arphaman, jfb, Petar.Avramovic, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68200

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373491 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/IR/IntrinsicsAMDGPU.td           |  40 ++-
 lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp      |  32 +-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |  25 +-
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |   4 +
 lib/Target/AMDGPU/BUFInstructions.td          | 296 +++++++++++-------
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp |   4 +
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h   |   2 +
 lib/Target/AMDGPU/SIFrameLowering.cpp         |   4 +
 lib/Target/AMDGPU/SIISelLowering.cpp          |  16 +-
 lib/Target/AMDGPU/SIInstrInfo.cpp             |   2 +
 lib/Target/AMDGPU/SIInstrInfo.td              |   9 +-
 lib/Target/AMDGPU/SILoadStoreOptimizer.cpp    |   8 +
 lib/Target/AMDGPU/SIRegisterInfo.cpp          |   2 +
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  41 +++
 lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h      |  18 ++
 .../GlobalISel/inst-select-load-private.mir   |  92 +++---
 .../GlobalISel/inst-select-store-private.mir  |  36 +--
 ...llvm.amdgcn.raw.buffer.store.format.f16.ll |  44 +--
 ...llvm.amdgcn.raw.buffer.store.format.f32.ll |  24 +-
 .../llvm.amdgcn.raw.buffer.store.ll           |  66 ++--
 .../AMDGPU/break-vmem-soft-clauses.mir        |  42 +--
 .../AMDGPU/clamp-omod-special-case.mir        |  24 +-
 .../coalescer-extend-pruned-subrange.mir      |   4 +-
 ...scer-subranges-another-copymi-not-live.mir |   2 +-
 ...oalescer-subranges-another-prune-error.mir |   2 +-
 .../AMDGPU/coalescer-subregjoin-fullcopy.mir  |   6 +-
 .../coalescer-with-subregs-bad-identical.mir  |   2 +-
 test/CodeGen/AMDGPU/collapse-endcf.mir        |  64 ++--
 test/CodeGen/AMDGPU/collapse-endcf2.mir       |   8 +-
 .../AMDGPU/constant-fold-imm-immreg.mir       |   8 +-
 .../AMDGPU/couldnt-join-subrange-3.mir        |   2 +-
 .../AMDGPU/extract_subvector_vec4_vec3.ll     |   4 +-
 test/CodeGen/AMDGPU/fold-fi-mubuf.mir         |  24 +-
 test/CodeGen/AMDGPU/fold-imm-copy.mir         |   2 +-
 test/CodeGen/AMDGPU/fold-imm-f16-f32.mir      |  78 ++---
 .../AMDGPU/fold-immediate-output-mods.mir     |  24 +-
 test/CodeGen/AMDGPU/fold-multiple.mir         |   2 +-
 .../AMDGPU/hazard-buffer-store-v-interp.mir   |   2 +-
 test/CodeGen/AMDGPU/hazard-hidden-bundle.mir  |   4 +-
 .../AMDGPU/indirect-addressing-term.ll        |   2 +-
 .../CodeGen/AMDGPU/insert-skips-flat-vmem.mir |   4 +-
 test/CodeGen/AMDGPU/insert-waitcnts-exp.mir   |   8 +-
 test/CodeGen/AMDGPU/inserted-wait-states.mir  |  16 +-
 test/CodeGen/AMDGPU/invert-br-undef-vcc.mir   |   6 +-
 .../CodeGen/AMDGPU/lds-branch-vmem-hazard.mir |  32 +-
 .../AMDGPU/llvm.amdgcn.raw.buffer.load.ll     |  40 +++
 .../AMDGPU/llvm.amdgcn.raw.buffer.store.ll    |  31 ++
 .../memory-legalizer-atomic-insert-end.mir    |   2 +-
 ...egalizer-multiple-mem-operands-atomics.mir |   6 +-
 ...er-multiple-mem-operands-nontemporal-1.mir |   6 +-
 ...er-multiple-mem-operands-nontemporal-2.mir |   6 +-
 test/CodeGen/AMDGPU/memory_clause.mir         |   4 +-
 test/CodeGen/AMDGPU/merge-load-store.mir      |  24 +-
 .../AMDGPU/mubuf-legalize-operands.mir        |  30 +-
 test/CodeGen/AMDGPU/nsa-vmem-hazard.mir       |  10 +-
 test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir |   6 +-
 .../AMDGPU/optimize-if-exec-masking.mir       |  48 +--
 .../AMDGPU/pei-reg-scavenger-position.mir     |   4 +-
 .../CodeGen/AMDGPU/phi-elimination-end-cf.mir |   2 +-
 .../AMDGPU/power-sched-no-instr-sunit.mir     |   2 +-
 .../AMDGPU/regcoal-subrange-join-seg.mir      |   2 +-
 test/CodeGen/AMDGPU/regcoalesce-dbg.mir       |   2 +-
 ...ename-independent-subregs-mac-operands.mir |   8 +-
 ...ssert-dead-def-subreg-use-other-subreg.mir |   4 +-
 test/CodeGen/AMDGPU/sched-crash-dbg-value.mir |   8 +-
 test/CodeGen/AMDGPU/schedule-barrier.mir      |   4 +-
 test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir |  36 +--
 .../AMDGPU/vccz-corrupt-bug-workaround.mir    |  12 +-
 test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir   |  24 +-
 test/CodeGen/AMDGPU/vmem-vcc-hazard.mir       |  20 +-
 .../AMDGPU/waitcnt-loop-irreducible.mir       |   4 +-
 .../MIR/AMDGPU/expected-target-index-name.mir |   2 +-
 .../AMDGPU/invalid-target-index-operand.mir   |   2 +-
 .../CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir |  28 +-
 test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir   |   4 +-
 .../MIR/AMDGPU/parse-order-reserved-regs.mir  |   4 +-
 .../MIR/AMDGPU/target-index-operands.mir      |   4 +-
 77 files changed, 887 insertions(+), 639 deletions(-)

diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 896170b4be57..ab6ee7f92dd1 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -899,7 +899,10 @@ class AMDGPURawBufferLoad : Intrinsic <
   [llvm_v4i32_ty,  // rsrc(SGPR)
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
+   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                   //                                   bit 1 = slc,
+                   //                                   bit 2 = dlc on gfx10+),
+                   //                 swizzled buffer (bit 3 = swz))
   [IntrReadMem, ImmArg<3>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_raw_buffer_load_format : AMDGPURawBufferLoad;
@@ -911,7 +914,10 @@ class AMDGPUStructBufferLoad : Intrinsic <
    llvm_i32_ty,    // vindex(VGPR)
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
+   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                   //                                   bit 1 = slc,
+                   //                                   bit 2 = dlc on gfx10+),
+                   //                 swizzled buffer (bit 3 = swz))
   [IntrReadMem, ImmArg<4>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;
 def int_amdgcn_struct_buffer_load_format : AMDGPUStructBufferLoad;
@@ -923,7 +929,10 @@ class AMDGPURawBufferStore : Intrinsic <
    llvm_v4i32_ty,  // rsrc(SGPR)
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
   llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
+   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                   //                                   bit 1 = slc,
+                   //                                   bit 2 = dlc on gfx10+),
+                   //                 swizzled buffer (bit 3 = swz))
   [IntrWriteMem, ImmArg<4>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_raw_buffer_store_format : AMDGPURawBufferStore;
@@ -936,7 +945,10 @@ class AMDGPUStructBufferStore : Intrinsic <
    llvm_i32_ty,    // vindex(VGPR)
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
-   llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
+   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                   //                                   bit 1 = slc,
+                   //                                   bit 2 = dlc on gfx10+),
+                   //                 swizzled buffer (bit 3 = swz))
   [IntrWriteMem, ImmArg<5>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;
 def int_amdgcn_struct_buffer_store_format : AMDGPUStructBufferStore;
@@ -1050,7 +1062,10 @@ def int_amdgcn_raw_tbuffer_load : Intrinsic <
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-   llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
+   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                   //                                   bit 1 = slc,
+                   //                                   bit 2 = dlc on gfx10+),
+                   //                 swizzled buffer (bit 3 = swz))
   [IntrReadMem, ImmArg<3>, ImmArg<4>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;

@@ -1061,7 +1076,10 @@ def int_amdgcn_raw_tbuffer_store : Intrinsic <
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-   llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
+   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                   //                                   bit 1 = slc,
+                   //                                   bit 2 = dlc on gfx10+),
+                   //                 swizzled buffer (bit 3 = swz))
   [IntrWriteMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;

@@ -1072,7 +1090,10 @@ def int_amdgcn_struct_tbuffer_load : Intrinsic <
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-   llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
+   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                   //                                   bit 1 = slc,
+                   //                                   bit 2 = dlc on gfx10+),
+                   //                 swizzled buffer (bit 3 = swz))
   [IntrReadMem, ImmArg<4>, ImmArg<5>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<0>;

@@ -1084,7 +1105,10 @@ def int_amdgcn_struct_tbuffer_store : Intrinsic <
    llvm_i32_ty,    // offset(VGPR/imm, included in bounds checking and swizzling)
    llvm_i32_ty,    // soffset(SGPR/imm, excluded from bounds checking and swizzling)
    llvm_i32_ty,    // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt)
-   llvm_i32_ty],   // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc on gfx10+)
+   llvm_i32_ty],   // auxiliary data (imm, cachepolicy (bit 0 = glc,
+                   //                                   bit 1 = slc,
+                   //                                   bit 2 = dlc on gfx10+),
+                   //                 swizzled buffer (bit 3 = swz))
   [IntrWriteMem, ImmArg<5>, ImmArg<6>], "", [SDNPMemOperand]>,
   AMDGPURsrcIntrinsic<1>;

diff --git
a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index b2491ebc6f48..c74a361b2c71 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -186,10 +186,11 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectMUBUF(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, - SDValue &SLC, SDValue &TFE, SDValue &DLC) const; + SDValue &SLC, SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const; bool SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &SLC) const; @@ -202,7 +203,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const; + SDValue &TFE, SDValue &DLC, SDValue &SWZ) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const; bool SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, @@ -1313,7 +1314,8 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SDValue &Offset, SDValue &Offen, SDValue &Idxen, SDValue &Addr64, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { // Subtarget prefers to use flat instruction if (Subtarget->useFlatForGlobal()) return false; @@ -1326,6 +1328,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr, SLC = CurDAG->getTargetConstant(0, DL, MVT::i1); TFE = CurDAG->getTargetConstant(0, DL, MVT::i1); DLC = CurDAG->getTargetConstant(0, DL, MVT::i1); + SWZ = CurDAG->getTargetConstant(0, DL, MVT::i1); Idxen = CurDAG->getTargetConstant(0, DL, MVT::i1); Offen = CurDAG->getTargetConstant(0, DL, MVT::i1); @@ -1405,7 +1408,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &VAddr, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, SDValue &TFE, - SDValue &DLC) const { + SDValue &DLC, SDValue &SWZ) const { SDValue Ptr, Offen, Idxen, Addr64; // addr64 bit was removed for volcanic islands. 
@@ -1413,7 +1416,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, return false; if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; ConstantSDNode *C = cast(Addr64); @@ -1435,9 +1438,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc, SDValue &Offset, SDValue &SLC) const { SLC = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i1); - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFAddr64(Addr, SRsrc, VAddr, SOffset, Offset, GLC, SLC, TFE, DLC, SWZ); } static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) { @@ -1562,13 +1565,14 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &SOffset, SDValue &Offset, SDValue &GLC, SDValue &SLC, - SDValue &TFE, SDValue &DLC) const { + SDValue &TFE, SDValue &DLC, + SDValue &SWZ) const { SDValue Ptr, VAddr, Offen, Idxen, Addr64; const SIInstrInfo *TII = static_cast(Subtarget->getInstrInfo()); if (!SelectMUBUF(Addr, Ptr, VAddr, SOffset, Offset, Offen, Idxen, Addr64, - GLC, SLC, TFE, DLC)) + GLC, SLC, TFE, DLC, SWZ)) return false; if (!cast(Offen)->getSExtValue() && @@ -1590,16 +1594,16 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset ) const { - SDValue GLC, SLC, TFE, DLC; + SDValue GLC, SLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } bool AMDGPUDAGToDAGISel::SelectMUBUFOffset(SDValue Addr, SDValue &SRsrc, SDValue &Soffset, SDValue &Offset, SDValue &SLC) const { - SDValue GLC, TFE, DLC; + SDValue GLC, TFE, DLC, SWZ; - return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC); + return SelectMUBUFOffset(Addr, SRsrc, Soffset, Offset, GLC, SLC, TFE, DLC, SWZ); } template diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 5480eb5595a5..c5e60ed77be6 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -762,16 +762,20 @@ static bool isZero(Register Reg, MachineRegisterInfo &MRI) { return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0; } -static unsigned extractGLC(unsigned CachePolicy) { - return CachePolicy & 1; +static unsigned extractGLC(unsigned AuxiliaryData) { + return AuxiliaryData & 1; } -static unsigned extractSLC(unsigned CachePolicy) { - return (CachePolicy >> 1) & 1; +static unsigned extractSLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 1) & 1; } -static unsigned extractDLC(unsigned CachePolicy) { - return (CachePolicy >> 2) & 1; +static unsigned extractDLC(unsigned AuxiliaryData) { + return (AuxiliaryData >> 2) & 1; +} + +static unsigned extractSWZ(unsigned AuxiliaryData) { + return (AuxiliaryData >> 3) & 1; } // Returns Base register, constant offset, and offset def point. 
@@ -970,7 +974,7 @@ bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, Register RSrc = MI.getOperand(2).getReg(); Register VOffset = MI.getOperand(3).getReg(); Register SOffset = MI.getOperand(4).getReg(); - unsigned CachePolicy = MI.getOperand(5).getImm(); + unsigned AuxiliaryData = MI.getOperand(5).getImm(); unsigned ImmOffset; unsigned TotalOffset; @@ -994,10 +998,11 @@ bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI, MIB.addUse(RSrc) .addUse(SOffset) .addImm(ImmOffset) - .addImm(extractGLC(CachePolicy)) - .addImm(extractSLC(CachePolicy)) + .addImm(extractGLC(AuxiliaryData)) + .addImm(extractSLC(AuxiliaryData)) .addImm(0) // tfe: FIXME: Remove from inst - .addImm(extractDLC(CachePolicy)) + .addImm(extractDLC(AuxiliaryData)) + .addImm(extractSWZ(AuxiliaryData)) .addMemOperand(MMO); MI.eraseFromParent(); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index b946c308cf12..94d1d350dfd2 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -143,6 +143,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyDLC, ImmTyGLC, ImmTySLC, + ImmTySWZ, ImmTyTFE, ImmTyD16, ImmTyClampSI, @@ -328,6 +329,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { bool isDLC() const { return isImmTy(ImmTyDLC); } bool isGLC() const { return isImmTy(ImmTyGLC); } bool isSLC() const { return isImmTy(ImmTySLC); } + bool isSWZ() const { return isImmTy(ImmTySWZ); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isD16() const { return isImmTy(ImmTyD16); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<8>(getImm()); } @@ -820,6 +822,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyDLC: OS << "DLC"; break; case ImmTyGLC: OS << "GLC"; break; case ImmTySLC: OS << "SLC"; break; + case ImmTySWZ: OS << "SWZ"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -6037,6 +6040,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = { {"format", AMDGPUOperand::ImmTyFORMAT, false, nullptr}, {"glc", AMDGPUOperand::ImmTyGLC, true, nullptr}, {"slc", AMDGPUOperand::ImmTySLC, true, nullptr}, + {"swz", AMDGPUOperand::ImmTySWZ, true, nullptr}, {"tfe", AMDGPUOperand::ImmTyTFE, true, nullptr}, {"d16", AMDGPUOperand::ImmTyD16, true, nullptr}, {"high", AMDGPUOperand::ImmTyHigh, true, nullptr}, diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td index 40887a3c56eb..c9e8abad7c3c 100644 --- a/lib/Target/AMDGPU/BUFInstructions.td +++ b/lib/Target/AMDGPU/BUFInstructions.td @@ -7,13 +7,13 @@ //===----------------------------------------------------------------------===// def MUBUFAddr32 : ComplexPattern; -def MUBUFAddr64 : ComplexPattern; +def MUBUFAddr64 : ComplexPattern; def MUBUFAddr64Atomic : ComplexPattern; def MUBUFScratchOffen : ComplexPattern; def MUBUFScratchOffset : ComplexPattern; -def MUBUFOffset : ComplexPattern; +def MUBUFOffset : ComplexPattern; def MUBUFOffsetNoGLC : ComplexPattern; def MUBUFOffsetAtomic : ComplexPattern; @@ -54,6 +54,17 @@ class MTBUFAddr64Table { // MTBUF classes //===----------------------------------------------------------------------===// +class MTBUFGetBaseOpcode { + string ret = !subst("FORMAT_XY", "FORMAT_X", + !subst("FORMAT_XYZ", "FORMAT_X", + !subst("FORMAT_XYZW", "FORMAT_X", Op))); +} + +class getMTBUFElements { + int ret = 1; +} + + class MTBUF_Pseudo pattern=[]> : InstSI, @@ -67,6 
+78,9 @@ class MTBUF_Pseudo (NAME); + Instruction BaseOpcode = !cast(MTBUFGetBaseOpcode.ret); + let VM_CNT = 1; let EXP_CNT = 1; let MTBUF = 1; @@ -90,6 +104,7 @@ class MTBUF_Pseudo has_offset = 1; bits<1> has_slc = 1; bits<1> has_tfe = 1; + bits<4> elements = 0; } class MTBUF_Real : @@ -126,17 +141,17 @@ class getMTBUFInsDA vdataList, RegisterClass vaddrClass = !if(!empty(vaddrList), ?, !head(vaddrList)); dag InsNoData = !if(!empty(vaddrList), (ins SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc), + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, - offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc) + offset:$offset, FORMAT:$format, GLC:$glc, SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag InsData = !if(!empty(vaddrList), (ins vdataClass:$vdata, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc), + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz), (ins vdataClass:$vdata, vaddrClass:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, FORMAT:$format, GLC:$glc, - SLC:$slc, TFE:$tfe, DLC:$dlc) + SLC:$slc, TFE:$tfe, DLC:$dlc, SWZ:$swz) ); dag ret = !if(!empty(vdataList), InsNoData, InsData); } @@ -181,51 +196,54 @@ class MTBUF_SetupAddr { class MTBUF_Load_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind> : MTBUF_Pseudo.ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 1; let mayStore = 0; + let elements = elems; } multiclass MTBUF_Pseudo_Loads { - def _OFFSET : MTBUF_Load_Pseudo , + i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Load_Pseudo , + i8:$format, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Load_Pseudo ; - def _IDXEN : MTBUF_Load_Pseudo ; - def _BOTHEN : MTBUF_Load_Pseudo ; + def _OFFEN : MTBUF_Load_Pseudo ; + def _IDXEN : MTBUF_Load_Pseudo ; + def _BOTHEN : MTBUF_Load_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Load_Pseudo ; - def _OFFEN_exact : MTBUF_Load_Pseudo ; - def _IDXEN_exact : MTBUF_Load_Pseudo ; - def _BOTHEN_exact : MTBUF_Load_Pseudo ; + def _OFFSET_exact : MTBUF_Load_Pseudo ; + def _OFFEN_exact : MTBUF_Load_Pseudo ; + def _IDXEN_exact : MTBUF_Load_Pseudo ; + def _BOTHEN_exact : MTBUF_Load_Pseudo ; } } class MTBUF_Store_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, @@ -233,39 +251,40 @@ class MTBUF_Store_Pseudo .ret, - " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMTBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MTBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; let mayLoad = 0; let mayStore = 1; + let elements = elems; } multiclass MTBUF_Pseudo_Stores { - def _OFFSET : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<0, NAME>; - def _ADDR64 : MTBUF_Store_Pseudo , + i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MTBUFAddr64Table<1, NAME>; - def _OFFEN : MTBUF_Store_Pseudo ; - def _IDXEN : MTBUF_Store_Pseudo ; - def _BOTHEN : MTBUF_Store_Pseudo ; + def _OFFEN : MTBUF_Store_Pseudo ; + def _IDXEN : MTBUF_Store_Pseudo ; + def _BOTHEN : MTBUF_Store_Pseudo ; let DisableWQM = 1 in { - def _OFFSET_exact : MTBUF_Store_Pseudo ; - def _OFFEN_exact 
: MTBUF_Store_Pseudo ; - def _IDXEN_exact : MTBUF_Store_Pseudo ; - def _BOTHEN_exact : MTBUF_Store_Pseudo ; + def _OFFSET_exact : MTBUF_Store_Pseudo ; + def _OFFEN_exact : MTBUF_Store_Pseudo ; + def _IDXEN_exact : MTBUF_Store_Pseudo ; + def _BOTHEN_exact : MTBUF_Store_Pseudo ; } } @@ -393,7 +412,7 @@ class getMUBUFInsDA vdataList, ); dag ret = !con( !if(!empty(vdataList), InsNoData, InsData), - !if(isLds, (ins DLC:$dlc), (ins TFE:$tfe, DLC:$dlc)) + !if(isLds, (ins DLC:$dlc, SWZ:$swz), (ins TFE:$tfe, DLC:$dlc,SWZ:$swz)) ); } @@ -465,7 +484,7 @@ class MUBUF_Load_Pseudo .ret, !if(HasTiedDest, (ins getVregSrcForVT.ret:$vdata_in), (ins))), " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc" # - !if(isLds, " lds", "$tfe") # "$dlc", + !if(isLds, " lds", "$tfe") # "$dlc" # "$swz", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # !if(isLds, "_lds", "") # @@ -483,15 +502,15 @@ class MUBUF_Load_Pseudo : Pat < - (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; class MUBUF_Addr64_Load_Pat : Pat < - (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc)) + (load_vt (ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (load_vt (inst i64:$vaddr, v4i32:$srsrc, i32:$soffset, i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz)) >; multiclass MUBUF_Pseudo_Load_Pats { @@ -542,7 +561,7 @@ class MUBUF_Store_Pseudo .ret]>.ret, - " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc", + " $vdata, " # getMUBUFAsmOps.ret # "$glc$slc$tfe$dlc$swz", pattern>, MUBUF_SetupAddr { let PseudoInstr = opName # "_" # getAddrName.ret; @@ -558,12 +577,12 @@ multiclass MUBUF_Pseudo_Stores, + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<0, NAME>; def _ADDR64 : MUBUF_Store_Pseudo , + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))]>, MUBUFAddr64Table<1, NAME>; def _OFFEN : MUBUF_Store_Pseudo ; @@ -581,8 +600,8 @@ multiclass MUBUF_Pseudo_Stores : MUBUF_Pseudo { + (ins SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset, GLC:$glc, SLC:$slc, SWZ:$swz), + " $srsrc, $soffset$offset lds$glc$slc$swz"> { let mayLoad = 0; let mayStore = 1; let maybeAtomic = 1; @@ -1065,35 +1084,35 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN < // MTBUF Instructions //===----------------------------------------------------------------------===// -defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32>; -defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64>; -defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96>; -defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128>; -defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32>; -defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64>; -defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96>; -defm TBUFFER_STORE_FORMAT_XYZW : 
MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128>; +defm TBUFFER_LOAD_FORMAT_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_x", VGPR_32, 1>; +defm TBUFFER_LOAD_FORMAT_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_xy", VReg_64, 2>; +defm TBUFFER_LOAD_FORMAT_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyz", VReg_96, 3>; +defm TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_xyzw", VReg_128, 4>; +defm TBUFFER_STORE_FORMAT_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_x", VGPR_32, 1>; +defm TBUFFER_STORE_FORMAT_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_xy", VReg_64, 2>; +defm TBUFFER_STORE_FORMAT_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyz", VReg_96, 3>; +defm TBUFFER_STORE_FORMAT_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_xyzw", VReg_128, 4>; let SubtargetPredicate = HasUnpackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128>; - defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96>; - defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128>; + defm TBUFFER_LOAD_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VReg_64, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_128, 4>; + defm TBUFFER_STORE_FORMAT_D16_X_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VReg_64, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_96, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW_gfx80 : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_128, 4>; } // End HasUnpackedD16VMem. 
let SubtargetPredicate = HasPackedD16VMem, D16Buf = 1 in { - defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32>; - defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64>; - defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32>; - defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64>; - defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64>; + defm TBUFFER_LOAD_FORMAT_D16_X : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_x", VGPR_32, 1>; + defm TBUFFER_LOAD_FORMAT_D16_XY : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_LOAD_FORMAT_D16_XYZ : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_LOAD_FORMAT_D16_XYZW : MTBUF_Pseudo_Loads <"tbuffer_load_format_d16_xyzw", VReg_64, 4>; + defm TBUFFER_STORE_FORMAT_D16_X : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_x", VGPR_32, 1>; + defm TBUFFER_STORE_FORMAT_D16_XY : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xy", VGPR_32, 2>; + defm TBUFFER_STORE_FORMAT_D16_XYZ : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyz", VReg_64, 3>; + defm TBUFFER_STORE_FORMAT_D16_XYZW : MTBUF_Pseudo_Stores <"tbuffer_store_format_d16_xyzw", VReg_64, 4>; } // End HasPackedD16VMem. let SubtargetPredicate = isGFX7Plus in { @@ -1128,6 +1147,10 @@ def extract_dlc : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8); }]>; +def extract_swz : SDNodeXFormgetTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8); +}]>; + //===----------------------------------------------------------------------===// // buffer_load/store_format patterns //===----------------------------------------------------------------------===// @@ -1136,32 +1159,36 @@ multiclass MUBUF_LoadIntrinsicPat { def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0)), + timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0)), + timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm)), + timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm)), + 
timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1211,35 +1238,39 @@ multiclass MUBUF_StoreIntrinsicPat { def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, 0), + timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, - (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$cachepolicy, timm), + timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), - $rsrc, $soffset, (as_i16imm $offset), (extract_glc $cachepolicy), - (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary), + (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1441,8 +1472,8 @@ def : GCNPat< class MUBUFLoad_PatternADDR64 : GCNPat < (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, - i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc))), - (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc) + i16:$offset, i1:$glc, i1:$slc, i1:$tfe, i1:$dlc, i1:$swz))), + (Instr_ADDR64 $vaddr, $srsrc, $soffset, $offset, $glc, $slc, $tfe, $dlc, $swz) >; multiclass MUBUFLoad_Atomic_Pattern ; def : GCNPat < (vt (atomic_ld (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset))), - (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } @@ -1476,8 +1507,8 @@ multiclass MUBUFLoad_Pattern ; } @@ -1500,12 +1531,12 @@ multiclass MUBUFScratchLoadPat ; def : GCNPat < (vt (ld (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset))), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1515,12 +1546,12 @@ multiclass MUBUFScratchLoadPat_D16 { def : GCNPat < (ld_frag (MUBUFScratchOffen v4i32:$srsrc, i32:$vaddr, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffen $vaddr, $srsrc, $soffset, $offset, 0, 0, 
0, 0, 0, $in) >; def : GCNPat < (ld_frag (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset), vt:$in), - (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, $in) + (InstrOffset $srsrc, $soffset, $offset, 0, 0, 0, 0, 0, $in) >; } @@ -1560,16 +1591,16 @@ defm : MUBUFScratchLoadPat_D16 { - // Store follows atomic op convention so address is forst + // Store follows atomic op convention so address is first def : GCNPat < (atomic_st (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i16:$offset, i1:$slc), vt:$val), - (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0) + (Instr_ADDR64 $val, $vaddr, $srsrc, $soffset, $offset, 0, $slc, 0, 0, 0) >; def : GCNPat < (atomic_st (MUBUFOffsetNoGLC v4i32:$rsrc, i32:$soffset, i16:$offset), vt:$val), - (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0) + (Instr_OFFSET $val, $rsrc, $soffset, (as_i16imm $offset), 0, 0, 0, 0, 0) >; } let SubtargetPredicate = isGFX6GFX7 in { @@ -1583,8 +1614,8 @@ multiclass MUBUFStore_Pattern ; } @@ -1598,13 +1629,13 @@ multiclass MUBUFScratchStorePat ; def : GCNPat < (st vt:$value, (MUBUFScratchOffset v4i32:$srsrc, i32:$soffset, u16imm:$offset)), - (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0) + (InstrOffset rc:$value, $srsrc, $soffset, $offset, 0, 0, 0, 0, 0) >; } @@ -1643,36 +1674,40 @@ multiclass MTBUF_LoadIntrinsicPat { def : GCNPat< (vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0)), + timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm)), + timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0)), + timm:$format, timm:$auxiliary, 0)), (!cast(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm)), + timm:$format, timm:$auxiliary, timm)), (!cast(opcode # _BOTHEN) (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -1701,36 +1736,40 @@ multiclass MTBUF_StoreIntrinsicPat { def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0), + timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm 
$format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, timm), + timm:$format, timm:$auxiliary, timm), (!cast(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset, - timm:$format, timm:$cachepolicy, 0), + timm:$format, timm:$auxiliary, 0), (!cast(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; def : GCNPat< (name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, - timm:$offset, timm:$format, timm:$cachepolicy, timm), + timm:$offset, timm:$format, timm:$auxiliary, timm), (!cast(opcode # _BOTHEN_exact) $vdata, (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1), $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format), - (extract_glc $cachepolicy), (extract_slc $cachepolicy), 0, (extract_dlc $cachepolicy)) + (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary), + (extract_swz $auxiliary)) >; } @@ -2397,3 +2436,22 @@ def getMUBUFInfoFromBaseOpcodeAndElements : SearchIndex { let Table = MUBUFInfoTable; let Key = ["BaseOpcode", "elements"]; } + +def MTBUFInfoTable : GenericTable { + let FilterClass = "MTBUF_Pseudo"; + let CppTypeName = "MTBUFInfo"; + let Fields = ["Opcode", "BaseOpcode", "elements", "has_vaddr", "has_srsrc", "has_soffset"]; + + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getMTBUFOpcodeHelper"; +} + +def getMTBUFInfoFromOpcode : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["Opcode"]; +} + +def getMTBUFInfoFromBaseOpcodeAndElements : SearchIndex { + let Table = MTBUFInfoTable; + let Key = ["BaseOpcode", "elements"]; +} diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index a45162543975..d2ea94548dfe 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -196,6 +196,10 @@ void AMDGPUInstPrinter::printSLC(const MCInst *MI, unsigned OpNo, printNamedBit(MI, OpNo, O, "slc"); } +void AMDGPUInstPrinter::printSWZ(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O) { +} + void AMDGPUInstPrinter::printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { printNamedBit(MI, OpNo, O, "tfe"); diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index 0f62f039763e..66b70831ff9e 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -72,6 +72,8 @@ class AMDGPUInstPrinter : public MCInstPrinter { raw_ostream &O); void printSLC(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printSWZ(const MCInst *MI, unsigned OpNo, 
const MCSubtargetInfo &STI, + raw_ostream &O); void printTFE(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printDMask(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 45c06ebb547a..22f035e7f3e6 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -112,6 +112,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -132,6 +133,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } @@ -157,6 +159,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); return; } @@ -177,6 +180,7 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(MMO); } diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 88dec95177c7..1883b28f657a 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -6271,7 +6271,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(4), // soffset Offsets.second, // offset - Op.getOperand(5), // cachepolicy + Op.getOperand(5), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; @@ -6289,7 +6289,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy + Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; @@ -6338,7 +6338,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(4), // soffset Offsets.second, // offset Op.getOperand(5), // format - Op.getOperand(6), // cachepolicy + Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; @@ -6362,7 +6362,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; @@ -6832,7 +6832,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(6), // soffset Offsets.second, // offset Op.getOperand(7), // format - Op.getOperand(8), // cachepolicy + Op.getOperand(8), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : @@ -6857,7 +6857,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Op.getOperand(5), // soffset Offsets.second, // offset Op.getOperand(6), // format - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idexen }; unsigned Opc = IsD16 ? 
AMDGPUISD::TBUFFER_STORE_FORMAT_D16 : @@ -6931,7 +6931,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(5), // soffset Offsets.second, // offset - Op.getOperand(6), // cachepolicy + Op.getOperand(6), // cachepolicy, swizzled buffer DAG.getTargetConstant(0, DL, MVT::i1), // idxen }; unsigned Opc = @@ -6975,7 +6975,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, Offsets.first, // voffset Op.getOperand(6), // soffset Offsets.second, // offset - Op.getOperand(7), // cachepolicy + Op.getOperand(7), // cachepolicy, swizzled buffer DAG.getTargetConstant(1, DL, MVT::i1), // idxen }; unsigned Opc = IntrinsicID == Intrinsic::amdgcn_struct_buffer_store ? diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index b6a90241d4de..d5f2902f18a8 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -4693,6 +4693,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI, MIB.addImm(TFE->getImm()); } + MIB.addImm(getNamedImmOperand(MI, AMDGPU::OpName::swz)); + MIB.cloneMemRefs(MI); Addr64 = MIB; } else { diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index e1b32c4964c4..7473a0c64b2f 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -84,7 +84,7 @@ def SDTtbuffer_load : SDTypeProfile<1, 8, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -102,7 +102,7 @@ def SDTtbuffer_store : SDTypeProfile<0, 9, SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) SDTCisVT<6, i32>, // format(imm) - SDTCisVT<7, i32>, // cachecontrol(imm) + SDTCisVT<7, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<8, i1> // idxen(imm) ]>; @@ -119,7 +119,7 @@ def SDTBufferLoad : SDTypeProfile<1, 7, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad, @@ -145,7 +145,7 @@ def SDTBufferStore : SDTypeProfile<0, 8, SDTCisVT<3, i32>, // voffset(VGPR) SDTCisVT<4, i32>, // soffset(SGPR) SDTCisVT<5, i32>, // offset(imm) - SDTCisVT<6, i32>, // cachepolicy(imm) + SDTCisVT<6, i32>, // cachepolicy, swizzled buffer(imm) SDTCisVT<7, i1>]>; // idxen(imm) def SIbuffer_store : SDNode <"AMDGPUISD::BUFFER_STORE", SDTBufferStore, @@ -1035,6 +1035,7 @@ def DLC : NamedOperandBit<"DLC", NamedMatchClass<"DLC">>; def GLC : NamedOperandBit<"GLC", NamedMatchClass<"GLC">>; def SLC : NamedOperandBit<"SLC", NamedMatchClass<"SLC">>; def TFE : NamedOperandBit<"TFE", NamedMatchClass<"TFE">>; +def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>; def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>; def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>; def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>; diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index 302b299765ee..a78b62de7151 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -640,6 +640,12 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) { return false; } + // Do not merge VMEM buffer instructions with "swizzled" bit 
set. + int Swizzled = + AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), AMDGPU::OpName::swz); + if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm()) + return false; + for (unsigned i = 0; i < CI.NumAddresses; i++) { // We only ever merge operations with the same base address register, so // don't bother scanning forward if there are no other uses. @@ -998,6 +1004,7 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { .addImm(CI.SLC0) // slc .addImm(0) // tfe .addImm(CI.DLC0) // dlc + .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); @@ -1191,6 +1198,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) { .addImm(CI.SLC0) // slc .addImm(0) // tfe .addImm(CI.DLC0) // dlc + .addImm(0) // swz .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); moveInstsAfter(MIB, CI.InstsToMove); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index 23c357dadde5..f4dd995316dd 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -617,6 +617,7 @@ static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .cloneMemRefs(*MI); const MachineOperand *VDataIn = TII->getNamedOperand(*MI, @@ -737,6 +738,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI, .addImm(0) // slc .addImm(0) // tfe .addImm(0) // dlc + .addImm(0) // swz .addMemOperand(NewMMO); if (!IsStore && TmpReg != AMDGPU::NoRegister) diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index bb4169788f46..afb2fd987afd 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -137,10 +137,51 @@ struct MUBUFInfo { bool has_soffset; }; +struct MTBUFInfo { + uint16_t Opcode; + uint16_t BaseOpcode; + uint8_t elements; + bool has_vaddr; + bool has_srsrc; + bool has_soffset; +}; + +#define GET_MTBUFInfoTable_DECL +#define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL #include "AMDGPUGenSearchableTables.inc" +int getMTBUFBaseOpcode(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFInfoFromOpcode(Opc); + return Info ? Info->BaseOpcode : -1; +} + +int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements) { + const MTBUFInfo *Info = getMTBUFInfoFromBaseOpcodeAndElements(BaseOpc, Elements); + return Info ? Info->Opcode : -1; +} + +int getMTBUFElements(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->elements : 0; +} + +bool getMTBUFHasVAddr(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_vaddr : false; +} + +bool getMTBUFHasSrsrc(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_srsrc : false; +} + +bool getMTBUFHasSoffset(unsigned Opc) { + const MTBUFInfo *Info = getMTBUFOpcodeHelper(Opc); + return Info ? Info->has_soffset : false; +} + int getMUBUFBaseOpcode(unsigned Opc) { const MUBUFInfo *Info = getMUBUFInfoFromOpcode(Opc); return Info ? 
Info->BaseOpcode : -1;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a578fd2bb6a9..f78dadd447ff 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -263,6 +263,24 @@ struct MIMGInfo {
 LLVM_READONLY
 const MIMGInfo *getMIMGInfo(unsigned Opc);
 
+LLVM_READONLY
+int getMTBUFBaseOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getMTBUFOpcode(unsigned BaseOpc, unsigned Elements);
+
+LLVM_READONLY
+int getMTBUFElements(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasVAddr(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasSrsrc(unsigned Opc);
+
+LLVM_READONLY
+bool getMTBUFHasSoffset(unsigned Opc);
+
 LLVM_READONLY
 int getMUBUFBaseOpcode(unsigned Opc);
 
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
index 513f5b08c6a2..65a7fc7f4aa7 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-private.mir
@@ -20,12 +20,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_4
    ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_4
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -51,12 +51,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_2
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_2
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_USHORT_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_USHORT_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 2, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_USHORT_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 2, align 2, addrspace 5)
@@ -82,12 +82,12 @@ body: |
     ; GFX6-LABEL: name: load_private_s32_from_1
     ; GFX6: liveins: $vgpr0
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -208,12 +208,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2047, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2047
@@ -243,14 +243,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
     ; GFX6: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2047_known_bits
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2147483647, implicit $exec
     ; GFX9: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY]], [[V_MOV_B32_e32_]], implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_AND_B32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2047, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2147483647
@@ -283,12 +283,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2048, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_2048
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 2048, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 2048
@@ -318,14 +318,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2047
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965249, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2047
@@ -355,14 +355,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m2048
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294965248, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -2048
@@ -392,12 +392,12 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_4095
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[COPY]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 4095
@@ -427,14 +427,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_4096
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 4096
@@ -464,14 +464,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4095
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963201, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4095
@@ -501,14 +501,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m4096
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294963200, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -4096
@@ -538,14 +538,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_8191
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8191, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 8191
@@ -575,14 +575,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_8192
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 8192, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 8192
@@ -612,14 +612,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8191
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959105, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -8191
@@ -649,14 +649,14 @@ body: |
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_gep_m8192
     ; GFX9: liveins: $vgpr0
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294959104, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(s32) = G_CONSTANT i32 -8192
@@ -681,10 +681,10 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_4_constant_0
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_4_constant_0
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     %0:vgpr(p5) = G_CONSTANT i32 0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -707,10 +707,10 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_4_constant_sgpr_16
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_4_constant_sgpr_16
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 16, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]]
     %0:sgpr(p5) = G_CONSTANT i32 16
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -733,10 +733,10 @@ body: |
   bb.0:
 
     ; GFX6-LABEL: name: load_private_s32_from_1_constant_4095
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
     ; GFX9-LABEL: name: load_private_s32_from_1_constant_4095
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFSET]]
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -760,11 +760,11 @@ body: |
 
     ; GFX6-LABEL: name: load_private_s32_from_1_constant_4096
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_constant_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_LOAD %0 :: (load 1, align 1, addrspace 5)
@@ -789,10 +789,10 @@ body: |
   bb.0:
 
    ; GFX6-LABEL: name: load_private_s32_from_fi
-    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_fi
-    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_LOAD %0 :: (load 4, align 4, addrspace 5)
@@ -820,10 +820,10 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4095
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4095
@@ -853,13 +853,13 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX6: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX6: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     ; GFX9-LABEL: name: load_private_s32_from_1_fi_offset_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
     ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
-    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
+    ; GFX9: [[BUFFER_LOAD_UBYTE_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_UBYTE_OFFEN [[V_ADD_U32_e64_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 1, addrspace 5)
     ; GFX9: $vgpr0 = COPY [[BUFFER_LOAD_UBYTE_OFFEN]]
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4096
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
index 2d8634025c96..ee5ff53ca676 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-private.mir
@@ -21,12 +21,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: store_private_s32_to_4
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -52,12 +52,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX6: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
     ; GFX9-LABEL: name: store_private_s32_to_2
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
+    ; GFX9: BUFFER_STORE_SHORT_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 2, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 2, align 2, addrspace 5)
@@ -83,12 +83,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: store_private_s32_to_1
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(s32) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 1, align 1, addrspace 5)
@@ -114,12 +114,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: store_private_v2s16
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(<2 x s16>) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -145,12 +145,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: store_private_p3
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(p3) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -176,12 +176,12 @@ body: |
     ; GFX6: liveins: $vgpr0, $vgpr1
     ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX6: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     ; GFX9-LABEL: name: store_private_p5
     ; GFX9: liveins: $vgpr0, $vgpr1
     ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
     ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
+    ; GFX9: BUFFER_STORE_DWORD_OFFEN [[COPY]], [[COPY1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 5)
     %0:vgpr(p5) = COPY $vgpr0
     %1:vgpr(p5) = COPY $vgpr1
     G_STORE %0, %1 :: (store 4, align 4, addrspace 5)
@@ -209,10 +209,10 @@ body: |
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4095, implicit $exec
     ; GFX6: %2:vgpr_32, dead %4:sreg_64_xexec = V_ADD_I32_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], 0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_2]], %2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: store_private_s32_to_1_fi_offset_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_FRAME_INDEX %stack.0
     %1:vgpr(s32) = G_CONSTANT i32 4095
     %2:vgpr(p5) = G_GEP %0, %1
@@ -239,10 +239,10 @@ body: |
   bb.0:
     ; GFX6-LABEL: name: store_private_s32_to_1_constant_4095
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: store_private_s32_to_1_constant_4095
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFSET [[V_MOV_B32_e32_]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4095, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4095
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
@@ -268,11 +268,11 @@ body: |
     ; GFX6-LABEL: name: store_private_s32_to_1_constant_4096
     ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX6: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX6: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     ; GFX9-LABEL: name: store_private_s32_to_1_constant_4096
     ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; GFX9: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec
-    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
+    ; GFX9: BUFFER_STORE_BYTE_OFFEN [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 1, addrspace 5)
     %0:vgpr(p5) = G_CONSTANT i32 4096
     %1:vgpr(s32) = G_CONSTANT i32 0
     G_STORE %1, %0 :: (store 1, align 1, addrspace 5)
diff --git a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
index 25f5d873e9d5..0643505bf999 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
@@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec
   ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec
   ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
   ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec
   ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec
   ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095
   ; PACKED: bb.1 (%ir-block.0):
@@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0)
   ret void
@@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec
   ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096
   ; PACKED: bb.1 (%ir-block.0):
@@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0)
   ret void
@@ -312,7 +312,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
   ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16
   ; PACKED: bb.1 (%ir-block.0):
@@ -328,7 +328,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16
   ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   %voffset.add = add i32 %voffset, 16
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -354,7 +354,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
   ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095
   ; PACKED: bb.1 (%ir-block.0):
@@ -370,7 +370,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095
   ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4095
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -396,7 +396,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
   ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY8]], [[COPY4]], implicit $exec
   ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096
   ; PACKED: bb.1 (%ir-block.0):
@@ -412,7 +412,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096
   ; PACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; PACKED: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: S_ENDPGM 0
   %voffset.add = add i32 %voffset, 4096
   call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0)
@@ -459,7 +459,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec
   ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
   ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -500,7 +500,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY10]], implicit $exec
   ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+  ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
   ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec
diff --git a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
index 7de9e4554425..5fa523f7b8da 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
+  ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4)
   ; CHECK: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409
   ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
   ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4)
+  ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4)
   ; CHECK: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0)
   ret void
@@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1
-  ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
+  ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4)
   ; CHECK: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2
-  ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4)
+  ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4)
   ; CHECK: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
   ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3
-  ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4)
+  ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4)
   ; CHECK: S_ENDPGM 0
   call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
   ret void
@@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec
   ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc
   ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3
-  ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4)
+  ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4)
   ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec
   ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc
   ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec
@@ -159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]],
%subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -202,7 +202,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -226,7 +226,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void 
@llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -250,7 +250,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %13:vgpr_32, dead %15:sreg_64_xexec = V_ADD_I32_e64 [[COPY6]], [[COPY8]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %13, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -294,7 +294,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY12]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %15, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 3ed040bdfffc..eea1750d63ee 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ define amdgpu_ps void 
@raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: 
[[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ define amdgpu_ps void 
@raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 
@@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) + ; 
CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], 
[[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void 
@llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -537,7 +537,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -559,7 +559,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -581,7 +581,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -600,7 +600,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void 
@llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -618,7 +618,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -639,7 +639,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 16 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -661,7 +661,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4095 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -683,7 +683,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 4096 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; CHECK: %11:vgpr_32, dead %13:sreg_64_xexec = V_ADD_I32_e64 [[COPY5]], [[COPY7]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 
%voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -722,7 +722,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %11, [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -765,7 +765,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir b/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir index 323396795dd4..4ac48b1133aa 100644 --- a/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir +++ b/test/CodeGen/AMDGPU/break-vmem-soft-clauses.mir @@ -393,12 +393,12 @@ name: trivial_clause_load_mubuf4_x2 body: | bb.0: ; GCN-LABEL: name: trivial_clause_load_mubuf4_x2 - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = 
BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr3 = BUFFER_LOAD_DWORD_OFFEN $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -407,13 +407,13 @@ name: break_clause_simple_load_mubuf_offen_ptr body: | bb.0: ; GCN-LABEL: name: break_clause_simple_load_mubuf_offen_ptr - ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -424,11 +424,11 @@ name: mubuf_load4_overwrite_ptr body: | bb.0: ; GCN-LABEL: name: mubuf_load4_overwrite_ptr - ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 - $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_MOV_B32_e32 0, implicit $exec $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec S_ENDPGM 0 @@ -443,11 +443,11 @@ body: | ; GCN-LABEL: name: break_clause_flat_load_mubuf_load ; GCN: $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr ; XNACK-NEXT: S_NOP 0 - ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + ; GCN-NEXT: $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec ; GCN-NEXT: S_ENDPGM 0 $vgpr0 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr - $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr2 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
 # Break a clause from interference between mubuf and flat instructions
@@ -462,7 +462,7 @@ name: break_clause_mubuf_load_flat_load
 body: |
 bb.0:
- $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr0 = BUFFER_LOAD_DWORD_OFFEN $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
 $vgpr1 = FLAT_LOAD_DWORD $vgpr2_vgpr3, 0, 0, 0, 0, implicit $exec, implicit $flat_scr
 S_ENDPGM 0
@@ -504,12 +504,12 @@ name: break_clause_atomic_rtn_into_ptr_mubuf4
 body: |
 bb.0:
 ; GCN-LABEL: name: break_clause_atomic_rtn_into_ptr_mubuf4
- ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
 ; XNACK-NEXT: S_NOP 0
 ; GCN-NEXT: $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec
 ; GCN-NEXT: S_ENDPGM 0
- $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
 $vgpr2 = BUFFER_ATOMIC_ADD_OFFEN_RTN $vgpr2, $vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec
 S_ENDPGM 0
 ...
@@ -521,11 +521,11 @@ body: |
 bb.0:
 ; GCN-LABEL: name: break_clause_atomic_nortn_ptr_load_mubuf4
 ; GCN: BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec
- ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
 ; GCN-NEXT: S_ENDPGM 0
 BUFFER_ATOMIC_ADD_OFFEN $vgpr0, $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, implicit $exec
- $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr2, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
 S_ENDPGM 0
 ...
 ---
@@ -536,11 +536,11 @@ name: no_break_clause_mubuf_load_novaddr
 body: |
 bb.0:
 ; GCN-LABEL: name: no_break_clause_mubuf_load_novaddr
- ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
- ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+ ; GCN-NEXT: $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
 ; GCN-NEXT: S_ENDPGM 0
- $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
- $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+ $vgpr3 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
 S_ENDPGM 0
 ...
--- diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir index 16021126d1ea..f631bcd25811 100644 --- a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir +++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir @@ -55,10 +55,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -117,10 +117,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -180,10 +180,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -245,10 +245,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... @@ -322,10 +322,10 @@ body: | %26 = V_LSHL_B64 killed %25, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 %18 = COPY %26 - %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit $exec - BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
@@ -387,10 +387,10 @@ body: |
     %26 = V_LSHL_B64 killed %25, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
     %18 = COPY %26
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit $exec
     %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
diff --git a/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir b/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir
index 7839d514a144..599cacb82615 100644
--- a/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir
+++ b/test/CodeGen/AMDGPU/coalescer-extend-pruned-subrange.mir
@@ -30,7 +30,7 @@ body: |
     %14:vgpr_32 = V_AND_B32_e32 1, %13, implicit $exec
     %15:sreg_64_xexec = V_CMP_EQ_U32_e64 0, %14, implicit $exec
     %16:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %15, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4)
+    BUFFER_STORE_DWORD_OFFEN_exact %16, undef %17:vgpr_32, undef %18:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into constant-pool, align 1, addrspace 4)
     S_ENDPGM 0
 
   bb.2:
@@ -78,7 +78,7 @@ body: |
   bb.8:
     successors: %bb.10
 
-    %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4)
+    %31:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %32:vgpr_32, undef %33:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4)
     %34:sreg_64_xexec = V_CMP_NE_U32_e64 0, %31, implicit $exec
     %35:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, %34, implicit $exec
     %28:vgpr_32 = COPY %35
diff --git a/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir b/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
index d7d8b41f6833..bc549f7bb87b 100644
--- a/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
+++ b/test/CodeGen/AMDGPU/coalescer-subranges-another-copymi-not-live.mir
@@ -83,7 +83,7 @@ body: |
   bb.9:
     successors: %bb.10(0x80000000)
 
-    %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
+    %19:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %18, undef %20:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
     %21:sreg_64 = V_CMP_NE_U32_e64 target-flags(amdgpu-gotprel) 0, killed %19.sub0, implicit $exec
     %22:sreg_64 = COPY $exec, implicit-def $exec
     %23:sreg_64 = S_AND_B64 %22, %21, implicit-def dead $scc
diff --git a/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir b/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
index 4dffe32f9b18..67399883ae07 100644
--- a/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
+++ b/test/CodeGen/AMDGPU/coalescer-subranges-another-prune-error.mir
@@ -68,7 +68,7 @@ body: |
     %23:vreg_128 = COPY killed %17
     %24:sreg_64 = COPY killed %16
     %25:vgpr_32 = V_OR_B32_e32 %22, %11, implicit $exec
-    %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
+    %26:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %25, undef %27:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
     %28:vgpr_32 = V_LSHRREV_B32_e32 30, killed %26.sub0, implicit $exec
     %29:vreg_128 = COPY killed %21
     %29.sub0:vreg_128 = COPY %1
diff --git a/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir b/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
index eb3d6169e976..773466af7adb 100644
--- a/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
+++ b/test/CodeGen/AMDGPU/coalescer-subregjoin-fullcopy.mir
@@ -11,7 +11,7 @@
 #
 # GCN-LABEL: bb.6:
 # GCN: successors: %bb.7(0x{{[0-9]+}}), %bb.18(0x{{[0-9]+}})
-# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN: %{{[0-9]+}}:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET %{{[0-9]+}}, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 #
 
 --- |
@@ -69,7 +69,7 @@ body: |
     %10:sreg_64 = COPY killed %5
     undef %11.sub2:sreg_128 = COPY %4
     %11.sub3:sreg_128 = COPY %3
-    %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, implicit $exec
+    %12:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET killed %11, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     undef %13.sub1:vreg_128 = COPY %9.sub1
     %13.sub2:vreg_128 = COPY %9.sub2
     %14:sreg_64 = V_CMP_GT_F32_e64 0, target-flags(amdgpu-rel32-lo) 0, 0, killed %12.sub3, 0, implicit $exec
@@ -161,7 +161,7 @@ body: |
   bb.18:
     successors: %bb.7(0x80000000)
     dead %59:vgpr_32 = V_FMA_F32 0, killed %9.sub2, 0, undef %60:vgpr_32, 0, undef %61:vgpr_32, 0, 0, implicit $exec
-    dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sreg_128, undef %65:sreg_32, 0, 0, 0, 0, 0, implicit $exec
+    dead %62:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN undef %63:vgpr_32, undef %64:sreg_128, undef %65:sreg_32, 0, 0, 0, 0, 0, 0, implicit $exec
     undef %66.sub1:vreg_128 = COPY %13.sub1
     %66.sub2:vreg_128 = COPY %13.sub2
     %67:sreg_64 = V_CMP_NGT_F32_e64 0, 0, 0, undef %68:vgpr_32, 0, implicit $exec
diff --git a/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir b/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
index a01f1e71dace..4c532e89398e 100644
--- a/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
+++ b/test/CodeGen/AMDGPU/coalescer-with-subregs-bad-identical.mir
@@ -148,7 +148,7 @@ body: |
     %43:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %44:sreg_128, 12, 0, 0 :: (dereferenceable invariant load 4)
     %45:vgpr_32 = V_MUL_LO_I32 killed %42, killed %43, implicit $exec
     %46:vgpr_32 = V_LSHLREV_B32_e32 2, killed %45, implicit $exec
-    %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4)
+    %47:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN killed %46, undef %48:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from constant-pool, align 1, addrspace 4)
     %49:sreg_64 = V_CMP_NE_U32_e64 0, killed %47, implicit $exec
     %50:sreg_64 = COPY $exec, implicit-def $exec
     %51:sreg_64 = S_AND_B64 %50, %49, implicit-def dead $scc
diff --git a/test/CodeGen/AMDGPU/collapse-endcf.mir b/test/CodeGen/AMDGPU/collapse-endcf.mir
index 708814e3df45..1a26a507cd9b 100644
--- a/test/CodeGen/AMDGPU/collapse-endcf.mir
+++ b/test/CodeGen/AMDGPU/collapse-endcf.mir
@@ -33,7 +33,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
     ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]]
@@ -44,7 +44,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.4(0x80000000)
     ; GCN: DBG_VALUE
@@ -80,7 +80,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -92,7 +92,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     $exec = S_OR_B64 $exec, %12, implicit-def $scc
@@ -141,7 +141,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
     ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]]
@@ -152,7 +152,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.4(0x80000000)
     ; GCN: bb.4:
@@ -188,7 +188,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -200,7 +200,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     $exec = S_OR_B64 $exec, %12, implicit-def $scc
@@ -249,7 +249,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
     ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]]
@@ -260,7 +260,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.4(0x80000000)
     ; GCN: bb.4:
@@ -297,7 +297,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -309,7 +309,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
    $exec = S_OR_B64 $exec, %12, implicit-def $scc
@@ -358,7 +358,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
@@ -370,7 +370,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.4(0x80000000)
     ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
@@ -408,7 +408,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -420,7 +420,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     %15:sgpr_32 = IMPLICIT_DEF
@@ -471,7 +471,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 $exec, [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
     ; GCN: $exec = S_MOV_B64_term [[S_AND_B64_1]]
@@ -482,7 +482,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.4(0x80000000)
     ; GCN: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
@@ -520,7 +520,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -532,7 +532,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     $exec = S_OR_B64 $exec, %12, implicit-def $scc
@@ -583,7 +583,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
@@ -595,7 +595,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.4(0x80000000)
     ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
@@ -631,7 +631,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -643,7 +643,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     $exec = S_OR_B64 $exec, %12, implicit-def $scc
@@ -691,7 +691,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
@@ -703,7 +703,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.4(0x80000000)
     ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
@@ -739,7 +739,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -751,7 +751,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     $exec = S_OR_B64 $exec, %12, implicit-def $scc
@@ -799,7 +799,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
@@ -811,7 +811,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.5(0x80000000)
     ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
@@ -850,7 +850,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -862,7 +862,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     $exec = S_OR_B64 $exec, %12, implicit-def $scc
diff --git a/test/CodeGen/AMDGPU/collapse-endcf2.mir b/test/CodeGen/AMDGPU/collapse-endcf2.mir
index 44a8e38a5655..9219083bb64c 100644
--- a/test/CodeGen/AMDGPU/collapse-endcf2.mir
+++ b/test/CodeGen/AMDGPU/collapse-endcf2.mir
@@ -42,7 +42,7 @@ body: |
     ; GCN: %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, [[COPY3]], %9, 0, implicit $exec
     ; GCN: %5.sub3:sgpr_128 = S_MOV_B32 61440
     ; GCN: %5.sub2:sgpr_128 = S_MOV_B32 0
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: [[V_CMP_NE_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_NE_U32_e64 2, [[COPY1]], implicit $exec
     ; GCN: [[COPY4:%[0-9]+]]:sreg_64 = COPY $exec, implicit-def $exec
     ; GCN: [[S_AND_B64_1:%[0-9]+]]:sreg_64 = S_AND_B64 [[COPY4]], [[V_CMP_NE_U32_e64_]], implicit-def dead $scc
@@ -54,7 +54,7 @@ body: |
     ; GCN: %5.sub0:sgpr_128 = COPY %5.sub2
     ; GCN: %5.sub1:sgpr_128 = COPY %5.sub2
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    ; GCN: BUFFER_STORE_DWORD_ADDR64 [[V_MOV_B32_e32_]], %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     ; GCN: bb.3:
     ; GCN: successors: %bb.4(0x80000000)
     ; GCN: $exec = S_OR_B64 $exec, [[COPY4]], implicit-def $scc
@@ -91,7 +91,7 @@ body: |
     %8.sub1:vreg_64, dead %10:sreg_64_xexec = V_ADDC_U32_e64 0, %7, %9, 0, implicit $exec
     %5.sub3:sgpr_128 = S_MOV_B32 61440
     %5.sub2:sgpr_128 = S_MOV_B32 0
-    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %6.sub1, %6, %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
     %11:sreg_64 = V_CMP_NE_U32_e64 2, %0, implicit $exec
     %12:sreg_64 = COPY $exec, implicit-def $exec
     %13:sreg_64 = S_AND_B64 %12, %11, implicit-def dead $scc
@@ -103,7 +103,7 @@ body: |
     %5.sub0:sgpr_128 = COPY %5.sub2
     %5.sub1:sgpr_128 = COPY %5.sub2
     %14:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
+    BUFFER_STORE_DWORD_ADDR64 %14, %8, %5, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4, addrspace 1)
 
   bb.3:
     $exec = S_OR_B64 $exec, %12, implicit-def $scc
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index e5ff97a7be3d..92e29f3a5290 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -54,7 +54,7 @@ body: |
     %8 = S_MOV_B32 9999
     %9 = S_AND_B32 killed %7, killed %8, implicit-def dead $scc
     %10 = COPY %9
-    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %10, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -219,7 +219,7 @@ body: |
     %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
     %12 = S_LSHL_B32 killed %5, 12, implicit-def dead $scc
     %13 = COPY %12
-    BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -419,7 +419,7 @@ body: |
     %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
     %12 = S_ASHR_I32 killed %5, 12, implicit-def dead $scc
     %13 = COPY %12
-    BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -627,7 +627,7 @@ body: |
     %10 = REG_SEQUENCE killed %7, 1, killed %6, 2, killed %9, 3, killed %8, 4
     %12 = S_LSHR_B32 killed %5, 12, implicit-def dead $scc
     %13 = COPY %12
-    BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %13, killed %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
diff --git a/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir b/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
index 4679831c786d..bba41584bc97 100644
--- a/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
+++ b/test/CodeGen/AMDGPU/couldnt-join-subrange-3.mir
@@ -291,7 +291,7 @@ body: |
   bb.3..lr.ph3410.preheader:
     successors: %bb.4(0x80000000)
 
-    dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sreg_128, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
+    dead %22:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN killed %53.sub3, undef %24:sreg_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from constant-pool, align 1, addrspace 4)
     dead %60:vgpr_32 = V_MOV_B32_e32 -1, implicit $exec
     %36:sreg_64 = S_AND_B64 $exec, -1, implicit-def dead $scc
     dead %67:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
diff --git a/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
index a39833455a15..70e5df5788ae 100644
--- a/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
+++ b/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll
@@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) {
   ; GCN: [[DEF:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
   ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]]
   ; GCN: [[DEF1:%[0-9]+]]:sreg_128 = IMPLICIT_DEF
-  ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4)
+  ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4)
   ; GCN: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2
   ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1
   ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0
@@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) {
   ; GCN: [[DEF2:%[0-9]+]]:sreg_32_xm0 = IMPLICIT_DEF
   ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]]
   ; GCN: [[DEF3:%[0-9]+]]:sreg_128 = IMPLICIT_DEF
-  ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4)
+  ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4)
   ; GCN: S_ENDPGM 0
 main_body:
   %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0)
diff --git a/test/CodeGen/AMDGPU/fold-fi-mubuf.mir b/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
index a015a1ef4d11..f80176508bef 100644
--- a/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
+++ b/test/CodeGen/AMDGPU/fold-fi-mubuf.mir
@@ -23,13 +23,13 @@ body: |
     ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
     ; GCN: SI_RETURN_TO_EPILOG $vgpr0
     %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     %1:sreg_32_xm0 = S_MOV_B32 0
     %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, %1, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %3
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -57,12 +57,12 @@ body: |
     ; GCN: liveins: $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[COPY:%[0-9]+]]:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[V_MOV_B32_e32_]], [[COPY]], $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]]
     ; GCN: SI_RETURN_TO_EPILOG $vgpr0
     %0:sgpr_128 = COPY $sgpr12_sgpr13_sgpr14_sgpr15
     %2:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %2, %0, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr0 = COPY %3
     SI_RETURN_TO_EPILOG $vgpr0
 
@@ -87,15 +87,15 @@ body: |
     ; GCN-LABEL: name: fold_fi_mubuf_scratch_scratch_wave_offset
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GCN: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
 
-    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
-    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, 0, 0, 0, implicit $exec
 
     $vgpr0 = COPY %2
     S_ENDPGM 0, implicit $vgpr0
@@ -119,15 +119,15 @@ body: |
     ; GCN-LABEL: name: no_fold_fi_mubuf_scratch_sp_offset
     ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
-    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
-    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: BUFFER_STORE_DWORD_OFFEN [[V_MOV_B32_e32_]], %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    ; GCN: [[BUFFER_LOAD_DWORD_OFFEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
     ; GCN: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFEN]]
     ; GCN: S_ENDPGM 0, implicit $vgpr0
     %0:vgpr_32 = V_MOV_B32_e32 %stack.0, implicit $exec
     %1:vgpr_32 = V_MOV_B32_e32 7, implicit $exec
 
-    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
-    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %1:vgpr_32, %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
+    %2:vgpr_32 = BUFFER_LOAD_DWORD_OFFEN %0:vgpr_32, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec
 
     $vgpr0 = COPY %2
     S_ENDPGM 0, implicit $vgpr0
diff --git a/test/CodeGen/AMDGPU/fold-imm-copy.mir b/test/CodeGen/AMDGPU/fold-imm-copy.mir
index 7fe6ce845ab9..f2d423a70785 100644
--- a/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -17,7 +17,7 @@ body: |
     %4:vgpr_32 = V_LSHLREV_B32_e64 killed %3, %0, implicit $exec
     %5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     %6:vreg_64 = REG_SEQUENCE killed %4, %subreg.sub0, killed %5, %subreg.sub1
-    %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %6, %2, 0, 4, 0, 0, 0, 0, 0, implicit $exec
     %8:sreg_32_xm0 = S_MOV_B32 65535
     %9:vgpr_32 = COPY %8
     %10:vgpr_32 = V_AND_B32_e32 %7, %9, implicit $exec
diff --git a/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir b/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
index 3ab99551012f..1e596b79016a 100644
--- a/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
+++ b/test/CodeGen/AMDGPU/fold-imm-f16-f32.mir
@@ -158,10 +158,10 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %12 = V_MOV_B32_e32 1065353216, implicit $exec
     %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -222,13 +222,13 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 1065353216, implicit $exec
     %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $exec
     %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -289,14 +289,14 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 1065353216, implicit $exec
     %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec
     %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -360,16 +360,16 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 1065353216, implicit $exec
     %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec
     %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $exec
     %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -427,13 +427,13 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 1, implicit $exec
     %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit $exec
     %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -494,16 +494,16 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %14 = V_MOV_B32_e32 -2, implicit $exec
     %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit $exec
     %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit $exec
     %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -564,13 +564,13 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 15360, implicit $exec
     %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $exec
     %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
-    BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -631,13 +631,13 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 80886784, implicit $exec
     %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit $exec
     %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $exec
-    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
@@ -697,13 +697,13 @@ body: |
     %8 = S_MOV_B32 61440
     %9 = S_MOV_B32 -1
     %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
-    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
+    %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 2 from `half addrspace(1)* undef`)
     %13 = V_MOV_B32_e32 305413120, implicit $exec
     %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit $exec
     %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
-    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `float addrspace(1)* undef`)
+    BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 2 into `half addrspace(1)* undef`)
     S_ENDPGM 0
 
 ...
diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
index 65b254e7616a..e26f0c934fce 100644
--- a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
+++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
@@ -60,13 +60,13 @@ body: |
     %17 = REG_SEQUENCE killed %6, 17, %13, 18
     %18 = REG_SEQUENCE killed %4, 17, %13, 18
     %20 = COPY %29
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %22 = COPY %29
-    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
     %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec
     %26 = COPY %29
-    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -131,13 +131,13 @@ body: |
     %17 = REG_SEQUENCE killed %6, 17, %13, 18
     %18 = REG_SEQUENCE killed %4, 17, %13, 18
     %20 = COPY %29
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %22 = COPY %29
-    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
     %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit $exec
     %26 = COPY %29
-    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -202,13 +202,13 @@ body: |
     %17 = REG_SEQUENCE killed %6, 17, %13, 18
     %18 = REG_SEQUENCE killed %4, 17, %13, 18
     %20 = COPY %29
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %22 = COPY %29
-    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
     %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit $exec
     %26 = COPY %29
-    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -273,13 +273,13 @@ body: |
     %17 = REG_SEQUENCE killed %6, 17, %13, 18
     %18 = REG_SEQUENCE killed %4, 17, %13, 18
     %20 = COPY %29
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %22 = COPY %29
-    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, implicit $exec
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %23 = V_MOV_B32_e32 1090519040, implicit $exec
     %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit $exec
     %26 = COPY %29
-    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
diff --git a/test/CodeGen/AMDGPU/fold-multiple.mir b/test/CodeGen/AMDGPU/fold-multiple.mir
index ef35b2634579..d8c396c9d4a4 100644
--- a/test/CodeGen/AMDGPU/fold-multiple.mir
+++ b/test/CodeGen/AMDGPU/fold-multiple.mir
@@ -34,7 +34,7 @@ body: |
     %3 = S_LSHL_B32 %1, killed %1, implicit-def dead $scc
     %4 = V_AND_B32_e64 killed %2, killed %3, implicit $exec
     %5 = IMPLICIT_DEF
-    BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed %4, killed %5, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
diff --git a/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir b/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
index 8cc294f57b26..bd6244127e6f 100644
--- a/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
+++ b/test/CodeGen/AMDGPU/hazard-buffer-store-v-interp.mir
@@ -12,7 +12,7 @@ body: |
   bb.0.entry:
     liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr7, $vgpr8, $vgpr9, $vgpr10
 
-    BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORDX4_OFFSET_exact killed $vgpr7_vgpr8_vgpr9_vgpr10, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 96, 0, 0, 0, 0, 0, implicit $exec
     $vgpr7 = V_INTERP_P1_F32 $vgpr0, 0, 0, implicit $m0, implicit $exec
 
     S_ENDPGM 0
diff --git a/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir b/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
index 9ef2431df6ee..d0f32f287473 100644
--- a/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
+++ b/test/CodeGen/AMDGPU/hazard-hidden-bundle.mir
@@ -57,7 +57,7 @@ body: |
     BUNDLE implicit-def $sgpr0_sgpr1, implicit $sgpr10_sgpr11 {
       $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0
     }
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 
 ...
@@ -91,5 +91,5 @@ body: |
     }
 
   bb.2:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-term.ll b/test/CodeGen/AMDGPU/indirect-addressing-term.ll
index 053a68440764..40722ce43741 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-term.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-term.ll
@@ -104,7 +104,7 @@ define amdgpu_kernel void @extract_w_offset_vgpr(i32 addrspace(1)* %out) {
   ; GCN: $exec = S_MOV_B64 renamable $sgpr0_sgpr1
   ; GCN: $vgpr0 = SI_SPILL_V32_RESTORE %stack.8, $sgpr96_sgpr97_sgpr98_sgpr99, $sgpr3, 0, implicit $exec :: (load 4 from %stack.8, addrspace 5)
   ; GCN: $sgpr4_sgpr5_sgpr6_sgpr7 = SI_SPILL_S128_RESTORE %stack.1, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr3, implicit-def dead $m0 :: (load 16 from %stack.1, align 4, addrspace 5)
-  ; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
+  ; GCN: BUFFER_STORE_DWORD_OFFSET renamable $vgpr0, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.load, addrspace 1)
   ; GCN: S_ENDPGM 0
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir b/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
index 09f1ba901060..b305cfddb5a5 100644
--- a/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
+++ b/test/CodeGen/AMDGPU/insert-skips-flat-vmem.mir
@@ -41,7 +41,7 @@ body: |
     ; CHECK: bb.1:
     ; CHECK: successors: %bb.2(0x80000000)
     ; CHECK: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    ; CHECK: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     ; CHECK: bb.2:
     ; CHECK: S_ENDPGM 0
   bb.0:
@@ -51,7 +51,7 @@ body: |
   bb.1:
     successors: %bb.2
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
 
   bb.2:
     S_ENDPGM 0
diff --git a/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir b/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir
index 9f39dc341509..5797bb5cfa29 100644
--- a/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir
+++ b/test/CodeGen/AMDGPU/insert-waitcnts-exp.mir
@@ -49,10 +49,10 @@ body: |
   bb.0 (%ir-block.2):
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
-    $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    $vgpr2 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
+    $vgpr3 = BUFFER_LOAD_DWORD_OFFSET killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 4 from `float addrspace(1)* undef`)
     EXP_DONE 0, killed $vgpr0, killed $vgpr1, killed $vgpr2, killed $vgpr3, -1, -1, 15, implicit $exec
     $vgpr0 = V_MOV_B32_e32 1056964608, implicit $exec
     $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index 6e67f7df30a7..1ab10fa92f7b 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -230,17 +230,17 @@ name: vmem_gt_8dw_store
 body: |
   bb.0:
-    BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORDX3_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORDX4_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_FORMAT_XYZ_OFFSET $vgpr2_vgpr3_vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
-    BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_FORMAT_XYZW_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
     BUFFER_ATOMIC_CMPSWAP_X2_OFFSET $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, implicit $exec
     $vgpr3 = V_MOV_B32_e32 0, implicit $exec
 
@@ -553,10 +553,10 @@ body: |
     dead $sgpr6_sgpr7 = KILL $sgpr4_sgpr5
     $sgpr8 = S_MOV_B32 $sgpr5
     $vgpr0 = V_MOV_B32_e32 killed $sgpr8, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr + 4)
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr + 4)
     $sgpr8 = S_MOV_B32 $sgpr4, implicit killed $sgpr4_sgpr5
     $vgpr0 = V_MOV_B32_e32 killed $sgpr8, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr)
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.A.addr)
     S_ENDPGM 0
 
 ...
diff --git a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
index 48f0be4ff8fd..0a60eaf7c03f 100644
--- a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
+++ b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
@@ -64,7 +64,7 @@ body: |
     liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003

     $vgpr0 = V_MOV_B32_e32 100, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
     S_BRANCH %bb.3
@@ -72,7 +72,7 @@ body: |
     liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003

     $vgpr0 = V_MOV_B32_e32 9, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`)
     $vgpr0 = V_MOV_B32_e32 0, implicit $exec

   bb.3.done:
@@ -80,7 +80,7 @@ body: |
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out)
     S_ENDPGM 0

 ...
diff --git a/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir b/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
index 69c038b976b8..566b1c06fb12 100644
--- a/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
+++ b/test/CodeGen/AMDGPU/lds-branch-vmem-hazard.mir
@@ -12,7 +12,7 @@ body: |
     S_BRANCH %bb.1

   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -24,7 +24,7 @@ name: hazard_buf_branch_lds
 body: |
   bb.0:
     successors: %bb.1
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.1

   bb.1:
@@ -56,11 +56,11 @@ name: no_hazard_buf_branch_buf
 body: |
   bb.0:
     successors: %bb.1
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.1

   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -75,7 +75,7 @@ body: |
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec

   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -87,7 +87,7 @@ name: no_hazard_lds_branch_buf_samebb
 body: |
   bb.0:
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -101,7 +101,7 @@ body: |
   bb.0:
     successors: %bb.0
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.0
 ...
@@ -118,8 +118,8 @@ body: |
     S_BRANCH %bb.1

   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -137,7 +137,7 @@ body: |

   bb.1:
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -150,11 +150,11 @@ body: |
   bb.0:
     successors: %bb.1
     $vgpr1 = DS_READ_B32 undef $vgpr0, 0, 0, implicit $m0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_BRANCH %bb.1

   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -171,7 +171,7 @@ body: |

   bb.1:
     S_WAITCNT_VSCNT undef $sgpr_null, 1
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -189,7 +189,7 @@ body: |

   bb.1:
     S_WAITCNT_VSCNT undef $sgpr_null, 0
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -206,7 +206,7 @@ body: |

   bb.1:
     S_WAITCNT_VSCNT undef $sgpr0, 0
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -223,7 +223,7 @@ body: |
     S_BRANCH %bb.1

   bb.1:
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFEN undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
index 958a72566b5f..9f18f4df40bf 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.ll
@@ -400,6 +400,46 @@ main_body:
   ret void
 }

+;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_merged:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+;CHECK-NEXT: buffer_load_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
+;CHECK: s_waitcnt
+define amdgpu_ps void @raw_buffer_load_x1_offset_merged(<4 x i32> inreg %rsrc) {
+main_body:
+  %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 0)
+  %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 0)
+  %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 0)
+  %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 0)
+  %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 0)
+  %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 0)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_load_x1_offset_swizzled_not_merged:
+;CHECK-NEXT: %bb.
+;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:4
+;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:12
+;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:16
+;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:28
+;CHECK-NEXT: buffer_load_dword v{{[0-9]}}, off, s[0:3], 0 offset:32
+;CHECK: s_waitcnt
+define amdgpu_ps void @raw_buffer_load_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc) {
+main_body:
+  %r1 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 4, i32 0, i32 8)
+  %r2 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 8, i32 0, i32 8)
+  %r3 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 12, i32 0, i32 8)
+  %r4 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 16, i32 0, i32 8)
+  %r5 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 28, i32 0, i32 8)
+  %r6 = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 32, i32 0, i32 8)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r1, float %r2, float %r3, float %r4, i1 true, i1 true)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r5, float %r6, float undef, float undef, i1 true, i1 true)
+  ret void
+}
+
 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32) #0
 declare <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32>, i32, i32, i32) #0
 declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index 7d1a5a3b99a0..1bfe0aa4086e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -276,6 +276,37 @@ main_body:
   ret void
 }

+;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_merged:
+;CHECK-NOT: s_waitcnt
+;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4
+;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28
+define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 0)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 0)
+  ret void
+}
+
+;CHECK-LABEL: {{^}}raw_buffer_store_x1_offset_swizzled_not_merged:
+;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4
+;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8
+;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12
+;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16
+;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28
+;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32
+define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v4, <4 x i32> %rsrc, i32 16, i32 0, i32 8)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v5, <4 x i32> %rsrc, i32 28, i32 0, i32 8)
+  call void @llvm.amdgcn.raw.buffer.store.f32(float %v6, <4 x i32> %rsrc, i32 32, i32 0, i32 8)
+  ret void
+}
+
 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float>, <4 x i32>, i32, i32, i32) #0
 declare void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i32) #0
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir b/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
index f33c2115dcb2..673fff50b39c 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
+++ b/test/CodeGen/AMDGPU/memory-legalizer-atomic-insert-end.mir
@@ -86,7 +86,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 0
     S_WAITCNT 127
-    $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep)
+    $vgpr1_vgpr2 = BUFFER_LOAD_DWORDX2_ADDR64 killed $vgpr1_vgpr2, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile load 8 from %ir.tid.gep)
     $vgpr0 = V_XOR_B32_e32 1, killed $vgpr0, implicit $exec
     V_CMP_NE_U32_e32 0, killed $vgpr0, implicit-def $vcc, implicit $exec
     $sgpr2_sgpr3 = S_AND_SAVEEXEC_B64 killed $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
index 1046b9729df4..99348a57b9f6 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
+++ b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-atomics.mir
@@ -23,13 +23,13 @@ body: |
     $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
     S_WAITCNT 127
     S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 2, implicit $exec
     $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into `i32 addrspace(5)* undef`)
     S_CBRANCH_SCC0 %bb.1, implicit killed $scc

   bb.2:
@@ -55,7 +55,7 @@ body: |
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load syncscope("agent-one-as") unordered 4 from `i32 addrspace(1)* undef`), (load syncscope("workgroup-one-as") seq_cst 4 from `[8192 x i32] addrspace(5)* undef`)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
index bf24ce15acb6..f52275af48c9 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
+++ b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-1.mir
@@ -117,13 +117,13 @@ body: |
     $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
     S_WAITCNT 127
     S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 2, implicit $exec
     $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
     S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc

   bb.2.else:
@@ -149,7 +149,7 @@ body: |
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (non-temporal load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
diff --git a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
index a6088b0677a0..c543b80454b6 100644
--- a/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
+++ b/test/CodeGen/AMDGPU/memory-legalizer-multiple-mem-operands-nontemporal-2.mir
@@ -97,13 +97,13 @@ body: |
     $sgpr10 = S_MOV_B32 4294967295, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $sgpr11 = S_MOV_B32 15204352, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
     $vgpr0 = V_MOV_B32_e32 1, implicit $exec
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr01)
     S_WAITCNT 127
     S_CMP_LG_U32 killed $sgpr2, 0, implicit-def $scc
     S_WAITCNT 3855
     $vgpr0 = V_MOV_B32_e32 2, implicit $exec
     $vgpr1 = V_MOV_B32_e32 32772, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, killed $vgpr1, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.scratchptr12)
     S_CBRANCH_SCC0 %bb.1.if, implicit killed $scc

   bb.2.else:
@@ -129,7 +129,7 @@ body: |
     S_WAITCNT 127
     $sgpr0 = S_LSHL_B32 killed $sgpr0, 2, implicit-def dead $scc
     $vgpr0 = V_ADD_I32_e32 killed $sgpr0, killed $vgpr0, implicit-def dead $vcc, implicit $exec
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFEN killed $vgpr0, killed $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %ir.else_ptr), (non-temporal load 4 from %ir.if_ptr)
     $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr4_sgpr5
     $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit $sgpr4_sgpr5, implicit $exec
     S_WAITCNT 3952
diff --git a/test/CodeGen/AMDGPU/memory_clause.mir b/test/CodeGen/AMDGPU/memory_clause.mir
index ac412a8fc29b..b46cfb16b7ba 100644
--- a/test/CodeGen/AMDGPU/memory_clause.mir
+++ b/test/CodeGen/AMDGPU/memory_clause.mir
@@ -337,7 +337,7 @@ body: |
 # GCN: dead early-clobber %4:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vgpr_32 = BUNDLE %0, %2, %1, implicit $exec {
 # GCN-NEXT: dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, implicit $exec
+# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # GCN-NEXT: }

 ---
@@ -357,7 +357,7 @@ body: |
     %2 = IMPLICIT_DEF
     %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, implicit $exec
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_ADDR64 %0, %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ...

 # GCN-LABEL: {{^}}name: atomic{{$}}
diff --git a/test/CodeGen/AMDGPU/merge-load-store.mir b/test/CodeGen/AMDGPU/merge-load-store.mir
index becd2e1b9c1e..6bff48467b59 100644
--- a/test/CodeGen/AMDGPU/merge-load-store.mir
+++ b/test/CodeGen/AMDGPU/merge-load-store.mir
@@ -169,10 +169,10 @@ body: |
 ---
 # CHECK-LABEL: merge_mmos
 # CHECK: S_BUFFER_LOAD_DWORDX2_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 8, align 4)
-# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4)
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4)
-# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4
-# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4
+# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8, align 4)
+# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8, align 4)
+# CHECK: BUFFER_LOAD_DWORDX2_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 8 from %ir.ptr_addr1 + 64, align 4
+# CHECK: BUFFER_STORE_DWORDX2_OFFSET_exact killed %{{[0-9]+}}, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into %ir.ptr_addr1 + 64, align 4
 name: merge_mmos
 tracksRegLiveness: true
 body: |
@@ -182,14 +182,14 @@ body: |
   bb.0:
     %0:sreg_128 = COPY $sgpr0_sgpr1_sgpr2_sgpr3
     %1:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 0, 0, 0 :: (dereferenceable invariant load 4)
     %2:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %0, 1, 0, 0 :: (dereferenceable invariant load 4)
-    %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
-    %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
-    BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
-    BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
-    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64)
-    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68)
-    BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64)
-    BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68)
+    %3:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
+    %4:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4)
+    BUFFER_STORE_DWORD_OFFSET_exact %3, %0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
+    BUFFER_STORE_DWORD_OFFSET_exact %4, %0, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4)
+    %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 64)
+    %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from %ir.ptr_addr1 + 68)
+    BUFFER_STORE_DWORD_OFFSET_exact %5, %0, 0, 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 64)
+    BUFFER_STORE_DWORD_OFFSET_exact %6, %0, 0, 68, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into %ir.ptr_addr1 + 68)
     S_ENDPGM 0

diff --git a/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir b/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
index 9cfd92f86c4f..ccff6bb51275 100644
--- a/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
+++ b/test/CodeGen/AMDGPU/mubuf-legalize-operands.mir
@@ -25,7 +25,7 @@
 # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec
 # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
 # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 # W64-LABEL bb.2:
@@ -47,7 +47,7 @@
 # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec
 # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
 # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -72,7 +72,7 @@ body: |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
@@ -94,7 +94,7 @@ body: |
 # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec
 # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
 # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 # W64-LABEL bb.2:
@@ -116,7 +116,7 @@ body: |
 # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec
 # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
 # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -141,7 +141,7 @@ body: |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
@@ -163,7 +163,7 @@ body: |
 # W64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec
 # W64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
 # W64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# W64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # W64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
 # W64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 # W64-LABEL bb.2:
@@ -185,7 +185,7 @@ body: |
 # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec
 # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
 # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -210,7 +210,7 @@ body: |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_BOTHEN %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
@@ -226,7 +226,7 @@ body: |
 # ADDR64: %9:vgpr_32, %12:sreg_64_xexec = V_ADD_I32_e64 %14.sub0, %4.sub0, 0, implicit $exec
 # ADDR64: %10:vgpr_32, dead %13:sreg_64_xexec = V_ADDC_U32_e64 %14.sub1, %4.sub1, killed %12, 0, implicit $exec
 # ADDR64: %11:vreg_64 = REG_SEQUENCE %9, %subreg.sub0, %10, %subreg.sub1
-# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, implicit $exec
+# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %11, killed %18, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 ---
 name: addr64
 liveins:
@@ -246,7 +246,7 @@ body: |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 %4, killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
@@ -269,7 +269,7 @@ body: |
 # W64-NO-ADDR64: [[CMP1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec
 # W64-NO-ADDR64: [[CMP:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[CMP0]], [[CMP1]], implicit-def $scc
 # W64-NO-ADDR64: [[TMPEXEC:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# W64-NO-ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # W64-NO-ADDR64: $exec = S_XOR_B64_term $exec, [[TMPEXEC]], implicit-def $scc
 # W64-NO-ADDR64: S_CBRANCH_EXECNZ %bb.1, implicit $exec
 # W64-NO-ADDR64-LABEL bb.2:
@@ -289,7 +289,7 @@ body: |
 # W32: [[CMP1:%[0-9]+]]:sreg_32_xm0_xexec = V_CMP_EQ_U64_e64 [[SRSRC]].sub2_sub3, [[VRSRC]].sub2_sub3, implicit $exec
 # W32: [[CMP:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_B32 [[CMP0]], [[CMP1]], implicit-def $scc
 # W32: [[TMPEXEC:%[0-9]+]]:sreg_32_xm0_xexec = S_AND_SAVEEXEC_B32 killed [[CMP]], implicit-def $exec, implicit-def $scc, implicit $exec
-# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# W32: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed [[SRSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # TODO: S_XOR_B32_term should be `implicit-def $scc`
 # W32: $exec_lo = S_XOR_B32_term $exec_lo, [[TMPEXEC]]
 # W32: S_CBRANCH_EXECNZ %bb.1, implicit $exec
@@ -303,7 +303,7 @@
 # ADDR64: [[RSRCFMTHI:%[0-9]+]]:sgpr_32 = S_MOV_B32 61440
 # ADDR64: [[ZERORSRC:%[0-9]+]]:sreg_128 = REG_SEQUENCE [[ZERO64]], %subreg.sub0_sub1, [[RSRCFMTLO]], %subreg.sub2, [[RSRCFMTHI]], %subreg.sub3
 # ADDR64: [[VADDR64:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[RSRCPTR]].sub0, %subreg.sub0, [[RSRCPTR]].sub1, %subreg.sub1
-# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, implicit $exec
+# ADDR64: {{[0-9]+}}:vgpr_32 = BUFFER_LOAD_FORMAT_X_ADDR64 [[VADDR64]], [[ZERORSRC]], 0, 0, 0, 0, 0, 0, 0, implicit $exec

 ---
 name: offset
@@ -324,7 +324,7 @@ body: |
     %1:vgpr_32 = COPY $vgpr1
     %0:vgpr_32 = COPY $vgpr0
     %6:sreg_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
-    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, implicit $exec
+    %7:vgpr_32 = BUFFER_LOAD_FORMAT_X_OFFSET killed %6, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr30_sgpr31 = COPY %5
     $vgpr0 = COPY %7
     S_SETPC_B64_return $sgpr30_sgpr31, implicit $vgpr0
diff --git a/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir b/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
index 0dd135723d8c..39d3efe2a1de 100644
--- a/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
+++ b/test/CodeGen/AMDGPU/nsa-vmem-hazard.mir
@@ -9,7 +9,7 @@ name: hazard_image_sample_d_buf_off6
 body: |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
 ...

 # GCN-LABEL: name: no_hazard_image_sample_d_buf_off1
@@ -20,7 +20,7 @@ name: no_hazard_image_sample_d_buf_off1
 body: |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 1, 0, 0, 0, 0, 0, implicit $exec
 ...

 # GCN-LABEL: name: no_hazard_image_sample_d_buf_far
@@ -33,7 +33,7 @@ body: |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_D_V4_V9_nsa_gfx10 undef $vgpr3, undef $vgpr8, undef $vgpr7, undef $vgpr5, undef $vgpr4, undef $vgpr6, undef $vgpr0, undef $vgpr2, undef $vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     V_NOP_e32 implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
 ...

 # Non-NSA
 # GCN-LABEL: name: no_hazard_image_sample_v4_v2_buf_off6
@@ -45,7 +45,7 @@ name: no_hazard_image_sample_v4_v2_buf_off6
 body: |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V2_gfx10 undef $vgpr1_vgpr2, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 1, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
 ...

 # Less than 4 dwords
 # GCN-LABEL: name: no_hazard_image_sample_v4_v3_buf_off6
@@ -57,5 +57,5 @@ name: no_hazard_image_sample_v4_v3_buf_off6
 body: |
   bb.0:
     $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_SAMPLE_V4_V3_nsa_gfx10 undef $vgpr1, undef $vgpr2, undef $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, undef $sgpr8_sgpr9_sgpr10_sgpr11, 15, 2, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec
-    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_DWORD_OFFSET undef $sgpr0_sgpr1_sgpr2_sgpr3, undef $sgpr4, 6, 0, 0, 0, 0, 0, implicit $exec
 ...
diff --git a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
index 3af2f0457fbb..0e7708210a90 100644
--- a/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
+++ b/test/CodeGen/AMDGPU/opt-sgpr-to-vgpr-copy.mir
@@ -137,7 +137,7 @@ body: |
     %28 = REG_SEQUENCE %6, 17, killed %27, 18
     %29 = V_MOV_B32_e32 0, implicit $exec
     %30 = COPY %24
-    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.bb2:
     SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
@@ -243,7 +243,7 @@ body: |
     %37 = REG_SEQUENCE %6, 17, killed %36, 18
     %38 = V_MOV_B32_e32 0, implicit $exec
     %39 = COPY %33
-    BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %38, killed %39, killed %37, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.bb2:
     SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
@@ -332,7 +332,7 @@ body: |
     %28 = REG_SEQUENCE %6, 17, killed %27, 18
     %29 = V_MOV_B32_e32 0, implicit $exec
     %30 = COPY %24
-    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 killed %29, killed %30, killed %28, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.bb2:
     SI_END_CF %1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
diff --git a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
index 7da7dc8d3199..53ca546969b9 100644
--- a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
+++ b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
@@ -151,7 +151,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -159,7 +159,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -188,7 +188,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -196,7 +196,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -225,7 +225,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -233,14 +233,14 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
 ---
 # CHECK-LABEL: name: optimize_if_and_saveexec_xor_valu_middle
 # CHECK: $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc
-# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+# CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
 # CHECK-NEXT: $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc
 # CHECK-NEXT: $exec = COPY killed $sgpr2_sgpr3
 # CHECK-NEXT: SI_MASK_BRANCH
@@ -255,7 +255,7 @@ body: |
     $vcc = V_CMP_EQ_I32_e64 0, killed $vgpr0, implicit $exec
     $vgpr0 = V_MOV_B32_e32 4, implicit $exec
     $sgpr2_sgpr3 = S_AND_B64 $sgpr0_sgpr1, killed $vcc, implicit-def $scc
-    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET $vgpr0, undef $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     $sgpr0_sgpr1 = S_XOR_B64 $sgpr2_sgpr3, killed $sgpr0_sgpr1, implicit-def $scc
     $exec = S_MOV_B64_term killed $sgpr2_sgpr3
     SI_MASK_BRANCH %bb.2, implicit $exec
@@ -266,7 +266,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -274,7 +274,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -304,7 +304,7 @@ body: |
   bb.1.if:
     liveins: $sgpr0_sgpr1 , $sgpr4_sgpr5_sgpr6_sgpr7
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1, $sgpr4_sgpr5_sgpr6_sgpr7
@@ -312,7 +312,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -346,7 +346,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -356,7 +356,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr1 = S_MOV_B32 1
     $sgpr2 = S_MOV_B32 -1
     $sgpr3 = S_MOV_B32 61440
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -387,7 +387,7 @@ body: |
     S_SLEEP 0, implicit $sgpr2_sgpr3
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -395,7 +395,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -426,7 +426,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -434,7 +434,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -463,7 +463,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -471,7 +471,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -500,7 +500,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -508,7 +508,7 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -539,7 +539,7 @@ body: |
     $sgpr7 = S_MOV_B32 61440
     $sgpr6 = S_MOV_B32 -1
-    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec

   bb.2.end:
     liveins: $vgpr0, $sgpr0_sgpr1
@@ -547,6 +547,6 @@ body: |
     $exec = S_OR_B64 $exec, killed $sgpr0_sgpr1, implicit-def $scc
     $sgpr3 = S_MOV_B32 61440
     $sgpr2 = S_MOV_B32 -1
-    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
diff --git a/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir b/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
index 7a1cfa32a60c..39915f2755ce 100644
--- a/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
+++ b/test/CodeGen/AMDGPU/pei-reg-scavenger-position.mir
@@ -27,12 +27,12 @@ body: |
   ; CHECK: successors: %bb.1(0x80000000)
   ; CHECK: liveins: $sgpr33, $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc
-  ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
+  ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
   ; CHECK: S_BRANCH %bb.1
   ; CHECK: bb.1:
   ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3
   ; CHECK: $sgpr4 = S_ADD_U32 $sgpr32, 524288, implicit-def $scc
-  ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
+  ; CHECK: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, killed $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec :: (load 4 from %stack.0, align 8192, addrspace 5)
   ; CHECK: S_ENDPGM 0, implicit $vgpr0
   bb.0:
     $vgpr0 = SI_SPILL_V32_RESTORE %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, implicit $exec :: (load 4 from %stack.0, addrspace 5)
diff --git a/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir b/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
index 3c99cc7c19da..807029a92f34 100644
--- a/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
+++ b/test/CodeGen/AMDGPU/phi-elimination-end-cf.mir
@@ -46,7 +46,7 @@ body: |
     %15:sreg_32_xm0 = S_MOV_B32 61440
     %16:sreg_32_xm0 = S_MOV_B32 -1
     %17:sreg_128 = REG_SEQUENCE undef %14:sreg_32_xm0, %subreg.sub0, undef %12:sreg_32_xm0, %subreg.sub1, %16, %subreg.sub2, %15, %subreg.sub3
-    BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
+    BUFFER_STORE_DWORD_OFFSET %4, %17, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`, addrspace 1)
     %19:vgpr_32 = COPY %4
     %20:sreg_64 = SI_IF %0, %bb.2, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
     S_BRANCH %bb.3
diff --git a/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir b/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
index ed648ece0c71..66bd4c163c66 100644
--- a/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
+++ b/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir
@@ -17,6 +17,6 @@ body: |
     S_BARRIER
     $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15_agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 = V_MFMA_F32_32X32X1F32 undef $vgpr0, undef $vgpr0, 0, 0, 0, 2, implicit $exec
     $vgpr0 = V_ACCVGPR_READ_B32 $agpr31, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN killed $vgpr0, undef $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, $sgpr6, 0, 0, 0, 0, 0, 0, implicit $exec
 ...
diff --git a/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir b/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
index 196301f4cb07..1d9ab685c532 100644
--- a/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
+++ b/test/CodeGen/AMDGPU/regcoal-subrange-join-seg.mir
@@ -185,7 +185,7 @@ body: |
   bb.28:
     %9 = S_FF1_I32_B32 undef %10
     %13 = V_MAD_U32_U24 killed %9, 48, 32, 0, implicit $exec
-    %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
+    %45 = BUFFER_LOAD_DWORD_OFFEN killed %13, undef %15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable invariant load 4)
     %46 = V_AND_B32_e32 1, killed %45, implicit $exec
     %21 = S_BUFFER_LOAD_DWORD_SGPR undef %22, undef %23, 0, 0 :: (dereferenceable invariant load 4)
     %25 = V_CMP_GE_F32_e64 0, 0, 0, killed %21, 0, implicit $exec
diff --git a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
index 016c5ad32023..a92fe49e1b73 100644
--- a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
+++ b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
@@ -70,7 +70,7 @@ body: |
     %13.sub2_sub3 = COPY killed %12
     %20 = V_LSHL_B64 killed %19, 2, implicit $exec
     %16 = COPY killed %5
-    BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out)
+    BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out)
     S_ENDPGM 0
 ...
diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
index 0413075dd86c..331cccd853c2 100644
--- a/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
+++ b/test/CodeGen/AMDGPU/rename-independent-subregs-mac-operands.mir
@@ -134,10 +134,10 @@ body: |
     %6.sub2 = COPY %6.sub0

   bb.2:
-    BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %6.sub3, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 12, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %6.sub2, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 8, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %6.sub1, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 4, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %6.sub0, %0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec

     $sgpr30_sgpr31 = COPY %5
     S_SETPC_B64_return $sgpr30_sgpr31
diff --git a/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir b/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
index 915f63dbdd47..0d2f90793fc3 100644
--- a/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
+++ b/test/CodeGen/AMDGPU/sched-assert-dead-def-subreg-use-other-subreg.mir
@@ -28,7 +28,7 @@ body: |
   ; CHECK: [[COPY:%[0-9]+]]:vreg_512 = COPY %0
   ; CHECK: bb.1:
   ; CHECK: successors: %bb.1(0x80000000)
-  ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5)
+  ; CHECK: BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5)
   ; CHECK: dead %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec
   ; CHECK: dead %8:vreg_64 = DS_READ_B64_gfx9 [[V_MOV_B32_e32_]], 0, 0, implicit $exec
   ; CHECK: dead %9:vreg_128 = DS_READ_B128_gfx9 [[V_ADD_U32_e32_]], 0, 0, implicit $exec
@@ -52,7 +52,7 @@ body: |
     %4:vreg_512 = COPY %0

   bb.1:
-    BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5)
+    BUFFER_STORE_DWORD_OFFEN %0.sub3, undef %5:vgpr_32, $sgpr24_sgpr25_sgpr26_sgpr27, $sgpr32, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4, align 8, addrspace 5)
     %6:vgpr_32 = DS_READ_B32_gfx9 undef %7:vgpr_32, 0, 0, implicit $exec
     %8:vreg_64 = DS_READ_B64_gfx9 %1, 0, 0, implicit $exec
     %9:vreg_128 = DS_READ_B128_gfx9 %2, 0, 0, implicit $exec
diff --git a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
index eee471cb073b..b954b778dc65 100644
--- a/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@@ -279,10 +279,10 @@ body: |
     %80:vgpr_32 = IMPLICIT_DEF
     %81:vgpr_32 = IMPLICIT_DEF
     %84:vgpr_32 = IMPLICIT_DEF
-    BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, implicit $exec
-    BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %84, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 108, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %81, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 104, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %80, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 100, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN %78, %stack.0.tmp5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr101, 96, 0, 0, 0, 0, 0, implicit $exec
     %85:vgpr_32 = IMPLICIT_DEF
     %86:vgpr_32 = IMPLICIT_DEF
     %87:vgpr_32 = IMPLICIT_DEF
diff --git a/test/CodeGen/AMDGPU/schedule-barrier.mir b/test/CodeGen/AMDGPU/schedule-barrier.mir
index a72a406ff094..e52b955ffaf7 100644
--- a/test/CodeGen/AMDGPU/schedule-barrier.mir
+++ b/test/CodeGen/AMDGPU/schedule-barrier.mir
@@ -30,14 +30,14 @@ body: |
     %33.sub1:sgpr_128 = V_READFIRSTLANE_B32 %44.sub1, implicit $exec
     %33.sub2:sgpr_128 = V_READFIRSTLANE_B32 %45.sub2, implicit $exec
     %33.sub3:sgpr_128 = V_READFIRSTLANE_B32 %46.sub3, implicit $exec
-    %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, implicit $exec
+    %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %33, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %39:vgpr_32 = V_MUL_LO_U32 %15, %15, implicit $exec

     undef %27.sub0:sgpr_128 = V_READFIRSTLANE_B32 %26.sub0, implicit $exec
     %27.sub1:sgpr_128 = V_READFIRSTLANE_B32 %41.sub1, implicit $exec
     %27.sub2:sgpr_128 = V_READFIRSTLANE_B32 %42.sub2, implicit $exec
     %27.sub3:sgpr_128 = V_READFIRSTLANE_B32 %43.sub3, implicit $exec
-    %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %27, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     %40:vgpr_32 = V_MUL_LO_U32 %19, %19, implicit $exec
     %23:vgpr_32 = V_ADD_U32_e32 %39, %40, implicit $exec
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
index 2aaf7f10b69a..42c42a48a616 100644
--- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -81,11 +81,11 @@ body: |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
     %29, %9 = V_ADD_I32_e64 %19, %17, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -165,11 +165,11 @@ body: |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
     %29, %9 = V_SUB_I32_e64 %19, %17, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -249,11 +249,11 @@ body: |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
     %29, %9 = V_SUBREV_I32_e64 %19, %17, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed %9, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %29, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -332,12 +332,12 @@ body: |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
     %9 = S_MOV_B64 0
     %29, $vcc = V_ADDC_U32_e64 %19, %17, %9, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -417,12 +417,12 @@ body: |
     %13 = REG_SEQUENCE killed %5, 17, %12, 18
     %28 = V_LSHL_B64 killed %27, 2, implicit $exec
     %16 = REG_SEQUENCE killed %4, 17, %12, 18
-    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec
-    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec
     $vcc = S_MOV_B64 0
     %29, $vcc = V_ADDC_U32_e64 %19, %17, $vcc, 0, implicit $exec
     %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec
-    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec
     S_ENDPGM 0
 ...
@@ -502,11 +502,11 @@ body: | %13 = REG_SEQUENCE killed %5, 17, %12, 18 %28 = V_LSHL_B64 killed %27, 2, implicit $exec %16 = REG_SEQUENCE killed %4, 17, %12, 18 - %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, implicit $exec - %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, implicit $exec + %17 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 0, 0, 0, 0, 0, 0, implicit $exec + %19 = BUFFER_LOAD_DWORD_ADDR64 %28, %13, 0, 4, 0, 0, 0, 0, 0, implicit $exec %29, $vcc = V_ADDC_U32_e64 %19, %17, undef $vcc, 0, implicit $exec %24 = V_CNDMASK_B32_e64 0, 0, 0, 1, killed $vcc, implicit $exec - BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %24, %28, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir index 41444b0ef0cd..330426fb0ad1 100644 --- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir +++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir @@ -88,7 +88,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.3 @@ -96,7 +96,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec bb.3.done: @@ -104,7 +104,7 @@ body: | $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... 
@@ -149,7 +149,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 9, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 0, implicit $exec S_BRANCH %bb.3 @@ -157,7 +157,7 @@ body: | liveins: $sgpr6, $sgpr7, $sgpr0_sgpr1_sgpr2_sgpr3:0x00000003 $vgpr0 = V_MOV_B32_e32 100, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (volatile store 4 into `i32 addrspace(1)* undef`) $vgpr0 = V_MOV_B32_e32 1, implicit $exec bb.3.done: @@ -165,7 +165,7 @@ body: | $sgpr3 = S_MOV_B32 61440 $sgpr2 = S_MOV_B32 -1 - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, killed $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out) S_ENDPGM 0 ... diff --git a/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir index 9d45c5b19e65..10ed241acb58 100644 --- a/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir +++ b/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir @@ -11,7 +11,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0 = S_MOV_B32 0 ... # GCN-LABEL: name: vmem_smem_write_sgpr @@ -25,7 +25,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $sgpr0 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0, 0 ... # GCN-LABEL: name: vmem_snop_write_sgpr @@ -40,7 +40,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_NOP 0 $sgpr0 = S_MOV_B32 0 ... @@ -55,7 +55,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $exec $sgpr0 = S_MOV_B32 0 ... 
@@ -70,7 +70,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_WAITCNT 0 $sgpr0 = S_MOV_B32 0 ... @@ -86,7 +86,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_WAITCNT 1 $sgpr0 = S_MOV_B32 0 ... @@ -101,7 +101,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec $exec = S_MOV_B64 7 ... # GCN-LABEL: name: vmem_write_exec_expread @@ -114,7 +114,7 @@ body: | bb.0: $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $exec_lo, 0, 0, 0, 0, 0, 0, implicit $exec $exec = S_MOV_B64 7 ... # GCN-LABEL: name: ds_write_m0 @@ -143,7 +143,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec bb.1: $sgpr0 = S_MOV_B32 0 @@ -161,7 +161,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.1 bb.1: @@ -181,7 +181,7 @@ body: | $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF $sgpr4 = IMPLICIT_DEF $vgpr0 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.2 bb.1: @@ -206,7 +206,7 @@ body: | $sgpr0 = S_MOV_B32 0 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, 0, 0, 0, 0, 0, 0, implicit $exec S_BRANCH %bb.0 ... # GCN-LABEL: name: ds_write_exec diff --git a/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir b/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir index 644651ded335..5dbe5d58d9bc 100644 --- a/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir +++ b/test/CodeGen/AMDGPU/vmem-vcc-hazard.mir @@ -19,7 +19,7 @@ body: | $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: name: vmem_vcc_branch_to_next # GCN: bb.1: @@ -40,7 +40,7 @@ body: | S_BRANCH %bb.1 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_too_far # GCN: bb.1: @@ -61,7 +61,7 @@ body: | $sgpr0 = S_MOV_B32 0 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_fallthrough_no_hazard_nops # GCN: bb.1: @@ -78,7 +78,7 @@ body: | S_NOP 4 bb.1: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_branch_around # GCN: bb.2: @@ -107,7 +107,7 @@ body: | S_NOP 0 bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_branch_backedge # GCN: S_NOP @@ -123,7 +123,7 @@ body: | $vgpr0 = IMPLICIT_DEF $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec bb.1: $vgpr0 = IMPLICIT_DEF @@ -156,7 +156,7 @@ body: | $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec bb.2: - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: name: vmem_vcc_self_loop # GCN: S_NOP @@ -172,7 +172,7 @@ body: | $vgpr0 = IMPLICIT_DEF $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.0 ... @@ -198,7 +198,7 @@ body: | successors: %bb.1 $sgpr0 = S_MOV_B32 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.1 ... @@ -224,7 +224,7 @@ body: | successors: %bb.1 $sgpr0 = S_MOV_B32 0 - $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, implicit $exec + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec $vgpr1 = V_ADDC_U32_e32 $vgpr1, $vgpr1, implicit-def $vcc, implicit $vcc, implicit $exec S_BRANCH %bb.1 ... 
diff --git a/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir b/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir index d6e983ae5904..bfd92347d92b 100644 --- a/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir +++ b/test/CodeGen/AMDGPU/waitcnt-loop-irreducible.mir @@ -78,7 +78,7 @@ body: | bb.1: successors: %bb.2 - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, renamable $vgpr2, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec bb.2: successors: %bb.3, %bb.6 @@ -86,7 +86,7 @@ body: | bb.3: successors: %bb.4, %bb.5 - BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFEN_exact killed renamable $vgpr3, killed renamable $vgpr2, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_CBRANCH_VCCNZ %bb.5, implicit $vcc bb.4: diff --git a/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir b/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir index a93af1d54d39..c92d35f92391 100644 --- a/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir +++ b/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir @@ -44,6 +44,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir b/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir index 31efedd37960..5a0e0309e417 100644 --- a/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir +++ b/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir @@ -44,6 +44,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir b/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir index e3bcac22f136..8ad50b72c284 100644 --- a/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir +++ b/test/CodeGen/MIR/AMDGPU/load-store-opt-dlc.mir @@ -32,7 +32,7 @@ } ... -# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test1 liveins: @@ -56,14 +56,14 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... 
-# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) --- name: test2 liveins: @@ -87,14 +87,14 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) -# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) +# CHECK: BUFFER_STORE_DWORD_OFFSET %{{[0-9]+}}, %{{[0-9]+}}, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) --- name: test3 liveins: @@ -118,13 +118,13 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 0, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... -# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) +# CHECK: BUFFER_STORE_DWORDX2_OFFSET killed %{{[0-9]+}}, %{{[0-9]+}}, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 8 into %ir.out.gep.1, align 4, addrspace 1) --- name: test4 liveins: @@ -148,8 +148,8 @@ body: | %5:vgpr_32 = COPY $vgpr0 %6:vgpr_32 = COPY $vgpr1 - BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) - BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %5, %4, 0, 4, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) + BUFFER_STORE_DWORD_OFFSET %6, %4, 0, 8, 0, 0, 0, 1, 0, implicit $exec :: (store 4 into %ir.out.gep.1, addrspace 1) S_ENDPGM 0 ... 
diff --git a/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir b/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir index 005014d5e83f..67bf92b60814 100644 --- a/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir +++ b/test/CodeGen/MIR/AMDGPU/mir-canon-multi.mir @@ -7,7 +7,7 @@ # CHECK-NEXT: %namedVReg1353:vreg_64 = REG_SEQUENCE %namedVReg4354, %subreg.sub0, %namedVReg1352, %subreg.sub1 # CHECK-NEXT: %namedVReg1354:sgpr_128 = REG_SEQUENCE %namedVReg4354, %subreg.sub0, %namedVReg1352, %subreg.sub1, %namedVReg1358, %subreg.sub2, %namedVReg1359, %subreg.sub3 # This tests for the itereator invalidation fix (reviews.llvm.org/D62713) -# CHECK-NEXT: BUFFER_STORE_DWORD_ADDR64 %namedVReg1352, %namedVReg1353, %namedVReg1354, 0, 0, 0, 0, 0, 0, implicit $exec +# CHECK-NEXT: BUFFER_STORE_DWORD_ADDR64 %namedVReg1352, %namedVReg1353, %namedVReg1354, 0, 0, 0, 0, 0, 0, 0, implicit $exec ... --- name: foo @@ -27,7 +27,7 @@ body: | %vreg123_3:vgpr_32 = COPY %5 %16:sgpr_128 = REG_SEQUENCE killed %vreg123_0, %subreg.sub0, %vreg123_1, %subreg.sub1, %vreg123_2, %subreg.sub2, %vreg123_3, %subreg.sub3 - BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_ADDR64 %vreg123_1, %27, killed %16, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... diff --git a/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir b/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir index 629f7aefd6af..c48b13e46e20 100644 --- a/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir +++ b/test/CodeGen/MIR/AMDGPU/parse-order-reserved-regs.mir @@ -12,7 +12,7 @@ # CHECK: scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' # CHECK: scratchWaveOffsetReg: '$sgpr50' # CHECK: frameOffsetReg: '$sgpr50' -# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) +# CHECK: renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) name: reserve_correct_register tracksRegLiveness: true machineFunctionInfo: @@ -25,6 +25,6 @@ stack: body: | bb.0: - renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) + renamable $vgpr0 = BUFFER_LOAD_DWORD_OFFEN %stack.0, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr50, 4, 0, 0, 0, 0, 0, implicit $exec :: (load 4, addrspace 5) S_ENDPGM 0 ... diff --git a/test/CodeGen/MIR/AMDGPU/target-index-operands.mir b/test/CodeGen/MIR/AMDGPU/target-index-operands.mir index b43705eaf8c0..1864f1cbec2f 100644 --- a/test/CodeGen/MIR/AMDGPU/target-index-operands.mir +++ b/test/CodeGen/MIR/AMDGPU/target-index-operands.mir @@ -52,7 +52,7 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... --- @@ -82,6 +82,6 @@ body: | $sgpr7 = S_MOV_B32 61440 $sgpr6 = S_MOV_B32 -1 $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec - BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, implicit $exec + BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, 0, 0, implicit $exec S_ENDPGM 0 ... 
From 0312be25f1c7f0dc99c444e7f507cb68afbc595f Mon Sep 17 00:00:00 2001 From: GN Sync Bot Date: Wed, 2 Oct 2019 17:23:41 +0000 Subject: [PATCH 02/82] gn build: Merge r373489 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373492 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn b/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn index a467d4947bff..5d36602f97ef 100644 --- a/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn +++ b/utils/gn/secondary/clang-tools-extra/clang-tidy/cppcoreguidelines/BUILD.gn @@ -18,6 +18,7 @@ static_library("cppcoreguidelines") { sources = [ "AvoidGotoCheck.cpp", "CppCoreGuidelinesTidyModule.cpp", + "InitVariablesCheck.cpp", "InterfacesGlobalInitCheck.cpp", "MacroUsageCheck.cpp", "NarrowingConversionsCheck.cpp", From e16224de2dd418fca8d6851fb8355bfdb24328bc Mon Sep 17 00:00:00 2001 From: Thomas Lively Date: Wed, 2 Oct 2019 17:34:44 +0000 Subject: [PATCH 03/82] [WebAssembly] Error when using wasm64 for ISel Summary: 64-bit WebAssembly (wasm64) is not specified and not supported in the WebAssembly backend. We do have support for it in clang, however, and we would like to keep that support because we expect wasm64 to be specified and supported in the future. For now add an error when trying to use wasm64 from the backend to minimize user confusion from unexplained crashes. Reviewers: aheejin, dschuff, sunfish Subscribers: sbc100, jgravelle-google, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D68254 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373493 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp | 6 ++++++ test/CodeGen/WebAssembly/cpus.ll | 9 +++++---- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp index b0c03c13fe60..f83a8a984ae0 100644 --- a/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp +++ b/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp @@ -54,6 +54,12 @@ class WebAssemblyDAGToDAGISel final : public SelectionDAGISel { ForCodeSize = MF.getFunction().hasOptSize(); Subtarget = &MF.getSubtarget(); + + // Wasm64 is not fully supported right now (and is not specified) + if (Subtarget->hasAddr64()) + report_fatal_error( + "64-bit WebAssembly (wasm64) is not currently supported"); + return SelectionDAGISel::runOnMachineFunction(MF); } diff --git a/test/CodeGen/WebAssembly/cpus.ll b/test/CodeGen/WebAssembly/cpus.ll index 8ede6cbb5a71..01964e9c85ab 100644 --- a/test/CodeGen/WebAssembly/cpus.ll +++ b/test/CodeGen/WebAssembly/cpus.ll @@ -1,16 +1,17 @@ ; This tests that llc accepts all valid WebAssembly CPUs. 
; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=mvp 2>&1 | FileCheck %s
+; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=mvp 2>&1 | FileCheck %s --check-prefix=WASM64
 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=generic 2>&1 | FileCheck %s
+; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=generic 2>&1 | FileCheck %s --check-prefix=WASM64
 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=bleeding-edge 2>&1 | FileCheck %s
+; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=bleeding-edge 2>&1 | FileCheck %s --check-prefix=WASM64
 ; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
+; RUN: not llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=WASM64

 ; CHECK-NOT: is not a recognized processor for this target
 ; INVALID: {{.+}} is not a recognized processor for this target
+; WASM64: 64-bit WebAssembly (wasm64) is not currently supported

 define i32 @f(i32 %i_like_the_web) {
   ret i32 %i_like_the_web

From 71048f045e6f7b0932741c32eb28b4face99f1f9 Mon Sep 17 00:00:00 2001
From: Aditya Kumar
Date: Wed, 2 Oct 2019 17:35:06 +0000
Subject: [PATCH 04/82] Fix: Actually erase the removed elements from
 AssumeHandles

Reviewers: sdmitriev, tejohnson

Reviewed By: tejohnson

Subscribers: llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68318

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373494 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/AssumptionCache.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp
index 7d6429a0fec1..129944743c5e 100644
--- a/lib/Analysis/AssumptionCache.cpp
+++ b/lib/Analysis/AssumptionCache.cpp
@@ -130,7 +130,10 @@ void AssumptionCache::unregisterAssumption(CallInst *CI) {
     if (AVI != AffectedValues.end())
       AffectedValues.erase(AVI);
   }
-  remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; });
+
+  AssumeHandles.erase(
+      remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; }),
+      AssumeHandles.end());
 }

 void AssumptionCache::AffectedValueCallbackVH::deleted() {
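The change above is the standard C++ erase-remove idiom: remove_if (LLVM's range-based wrapper around std::remove_if) only moves the surviving elements to the front of the container and returns an iterator to the new logical end; the container keeps its old size until erase() trims the tail, which is why the stale handles were never actually dropped before this fix. A minimal stand-alone sketch of the same pattern, using a plain std::vector and illustrative names rather than LLVM's own types:

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<int> Handles = {1, 2, 3, 2, 4};

  // Buggy version (what the old code did, modulo types): remove_if
  // partitions the kept elements to the front and returns the new logical
  // end, but it never shrinks the vector, so size() would still be 5.
  //   std::remove_if(Handles.begin(), Handles.end(),
  //                  [](int H) { return H == 2; });

  // Fixed version (erase-remove idiom): erase the leftover tail.
  Handles.erase(std::remove_if(Handles.begin(), Handles.end(),
                               [](int H) { return H == 2; }),
                Handles.end());

  assert(Handles.size() == 3); // Handles is now {1, 3, 4}
  return 0;
}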
From ce5532153cd839ddb1f29c97ef093513dc7bcef9 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 2 Oct 2019 17:47:09 +0000
Subject: [PATCH 05/82] [X86] Rewrite the vXi1 subvector insertion code to not
 rely on the value of bits that might be undef

The previous code tried to do a trick where we would extract the subvector
from the location we were inserting, xor that with the new value, clear out
the bits above the subvector size, shift the xored subvector to the insert
location, and finally xor that with the original vector. Since the old
subvector was used in both xors, this would leave just the new subvector at
the inserted location. Since the surrounding bits had been zeroed, no other
bits of the original vector would be modified.

Unfortunately, if the old subvector came from undef we might aggressively
propagate the undef. Then we end up with the XORs not cancelling because
they aren't using the same value for the two uses of the old subvector.
@bkramer gave me a case that demonstrated this, but we haven't reduced it
enough to make it easily readable to see what's happening.

This patch uses a safer, but more costly, approach. It isolates the bits
above the insertion point and the bits below it, and ORs those together,
leaving 0s at the insertion location. It then widens the subvector with 0s
in the upper bits, shifts it into position with 0s in the lower bits, and
does another OR.

Differential Revision: https://reviews.llvm.org/D68311

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373495 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp        |   40 +-
 test/CodeGen/X86/avx512-calling-conv.ll   | 2995 +++++++++++++--------
 test/CodeGen/X86/avx512-ext.ll            | 1514 ++++++-----
 test/CodeGen/X86/avx512-insert-extract.ll |   90 +-
 test/CodeGen/X86/avx512-mask-op.ll        | 1693 +++++++-----
 test/CodeGen/X86/masked_store.ll          |  100 +-
 test/CodeGen/X86/vec_smulo.ll             |  155 +-
 test/CodeGen/X86/vec_umulo.ll             |  123 +-
 8 files changed, 4003 insertions(+), 2707 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 8c837dfb6af5..466a33cb6c1e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -5769,23 +5769,35 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   // Widen the vector if needed.
   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
-  // Move the current value of the bit to be replace to the lsbs.
-  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
-                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
-  // Xor with the new bit.
-  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
-  // Shift to MSB, filling bottom bits with 0.
+
+  // Clear the upper bits of the subvector and move it to its insert position.
   unsigned ShiftLeft = NumElems - SubVecNumElems;
-  Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
-                   DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
-  // Shift to the final position, filling upper bits with 0.
+  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
   unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
-  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
-                   DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
-  // Xor with original vector leaving the new value.
-  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
+  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+
+  // Isolate the bits below the insertion point.
+  unsigned LowShift = NumElems - IdxVal;
+  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
+                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
+  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
+                    DAG.getTargetConstant(LowShift, dl, MVT::i8));
+
+  // Isolate the bits after the last inserted bit.
+  unsigned HighShift = IdxVal + SubVecNumElems;
+  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
+                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
+  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
+                     DAG.getTargetConstant(HighShift, dl, MVT::i8));
+
+  // Now OR all 3 pieces together.
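+  // Low | High reassembles the original vector with zeros in the window
+  // [IdxVal, IdxVal + SubVecNumElems); ORing the shifted SubVec into that
+  // window then completes the insert without ever reading the old,
+  // possibly undef, contents of the window.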
+ Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High); + SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec); + // Reduce to original width if needed. - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG, diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll index 5fb114b3523a..b13c27e0d470 100644 --- a/test/CodeGen/X86/avx512-calling-conv.ll +++ b/test/CodeGen/X86/avx512-calling-conv.ll @@ -531,211 +531,256 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: pushq %r12 ; KNL-NEXT: pushq %rbx ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, 
%k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 -; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kmovw %esi, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, 
%k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k6 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k5 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k4 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte 
Spill +; KNL-NEXT: kmovw %edx, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw %esi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftlw $1, %k0, %k0 -; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: korw %k2, %k0, %k0 -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: korw %k2, %k5, %k2 ; KNL-NEXT: xorl %ecx, %ecx ; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp) ; KNL-NEXT: movl $65535, %edx ## imm = 0xFFFF ; KNL-NEXT: movl $0, %esi ; KNL-NEXT: cmovnel %edx, %esi -; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: korw %k2, %k4, %k2 ; KNL-NEXT: testb $1, {{[0-9]+}}(%rsp) +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: korw %k2, %k3, %k2 ; KNL-NEXT: cmovnel %edx, %ecx +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 ## 2-byte Reload +; KNL-NEXT: kandw %k2, %k0, %k0 ; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kandw %k1, %k2, %k1 ; KNL-NEXT: kmovw %k1, %r8d @@ -832,193 +877,294 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: pushq %r13 ; SKX-NEXT: pushq %r12 ; SKX-NEXT: pushq %rbx -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: movq %rdi, %rax -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k0, %k0 -; SKX-NEXT: kshiftrd $30, %k0, %k0 -; SKX-NEXT: kxord %k0, %k2, %k2 -; SKX-NEXT: kshiftrd $2, %k2, %k3 -; SKX-NEXT: kxord %k1, %k3, %k1 -; SKX-NEXT: kshiftld $31, %k1, %k1 -; SKX-NEXT: kshiftrd $29, %k1, %k1 -; SKX-NEXT: kxord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $31, %k0, %k1 +; SKX-NEXT: kshiftld $2, %k0, %k0 +; SKX-NEXT: kord %k0, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 ; SKX-NEXT: kshiftrd $3, %k1, %k2 +; SKX-NEXT: kshiftld $3, %k2, %k2 +; SKX-NEXT: kshiftld $30, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $29, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $4, %k1, %k2 +; SKX-NEXT: kshiftld $4, %k2, %k2 +; SKX-NEXT: kshiftld $29, %k1, %k1 +; SKX-NEXT: kshiftrd $29, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: 
kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $28, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $5, %k1, %k2 +; SKX-NEXT: kshiftld $5, %k2, %k2 +; SKX-NEXT: kshiftld $28, %k1, %k1 +; SKX-NEXT: kshiftrd $28, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $4, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $27, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $5, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $6, %k1, %k2 +; SKX-NEXT: kshiftld $6, %k2, %k2 +; SKX-NEXT: kshiftld $27, %k1, %k1 +; SKX-NEXT: kshiftrd $27, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $26, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $6, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $7, %k1, %k2 +; SKX-NEXT: kshiftld $7, %k2, %k2 +; SKX-NEXT: kshiftld $26, %k1, %k1 +; SKX-NEXT: kshiftrd $26, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $25, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kshiftld $8, %k2, %k2 +; SKX-NEXT: kshiftld $25, %k1, %k1 +; SKX-NEXT: kshiftrd $25, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $7, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $24, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $8, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kshiftld $9, %k2, %k2 +; SKX-NEXT: kshiftld $24, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $24, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $23, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $9, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $10, %k1, %k2 +; SKX-NEXT: kshiftld $10, %k2, %k2 +; SKX-NEXT: kshiftld $23, %k1, %k1 +; SKX-NEXT: kshiftrd $23, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $22, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $11, %k1, %k2 +; SKX-NEXT: kshiftld $11, %k2, %k2 +; SKX-NEXT: kshiftld $22, %k1, %k1 +; SKX-NEXT: kshiftrd $22, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $10, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $21, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $11, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $12, %k1, %k2 +; SKX-NEXT: kshiftld $12, %k2, %k2 +; SKX-NEXT: kshiftld $21, %k1, %k1 +; SKX-NEXT: kshiftrd $21, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb 
{{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $20, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $12, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $13, %k1, %k2 +; SKX-NEXT: kshiftld $13, %k2, %k2 +; SKX-NEXT: kshiftld $20, %k1, %k1 +; SKX-NEXT: kshiftrd $20, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $19, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kshiftld $14, %k2, %k2 +; SKX-NEXT: kshiftld $19, %k1, %k1 +; SKX-NEXT: kshiftrd $19, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $13, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $18, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $14, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kshiftld $15, %k2, %k2 +; SKX-NEXT: kshiftld $18, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $18, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $17, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftrd $15, %k1, %k2 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k2 +; SKX-NEXT: kshiftld $16, %k2, %k2 +; SKX-NEXT: kshiftld $17, %k1, %k1 +; SKX-NEXT: kshiftrd $17, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kshiftld $31, %k3, %k2 ; SKX-NEXT: kshiftrd $16, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kshiftrd $17, %k1, %k2 +; SKX-NEXT: kshiftld $17, %k2, %k2 +; SKX-NEXT: kshiftld $16, %k1, %k1 +; SKX-NEXT: kshiftrd $16, %k1, %k1 +; SKX-NEXT: kord %k2, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $16, %k1, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $15, %k2, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k2, %k1 +; SKX-NEXT: kmovd %esi, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $31, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kmovd %edx, %k2 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $30, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $3, %k0, %k2 +; SKX-NEXT: kshiftld $3, %k2, %k2 +; SKX-NEXT: kshiftld $30, %k0, %k0 +; SKX-NEXT: kshiftrd $30, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovd %ecx, %k2 -; SKX-NEXT: kmovd %esi, %k3 -; SKX-NEXT: kxord %k0, %k3, %k0 -; SKX-NEXT: kshiftrd $2, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $29, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $3, %k0, %k2 -; SKX-NEXT: kmovd %r8d, %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $4, %k0, %k2 +; SKX-NEXT: kshiftld $4, %k2, %k2 +; SKX-NEXT: kshiftld $29, %k0, %k0 +; SKX-NEXT: kshiftrd $29, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovd %r8d, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $28, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $4, %k0, %k2 -; 
SKX-NEXT: kmovd %r9d, %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $5, %k0, %k2 +; SKX-NEXT: kshiftld $5, %k2, %k2 +; SKX-NEXT: kshiftld $28, %k0, %k0 +; SKX-NEXT: kshiftrd $28, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovd %r9d, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $27, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $6, %k0, %k2 +; SKX-NEXT: kshiftld $6, %k2, %k2 +; SKX-NEXT: kshiftld $27, %k0, %k0 +; SKX-NEXT: kshiftrd $27, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $5, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $26, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $6, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $25, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $7, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $7, %k2, %k2 +; SKX-NEXT: kshiftld $26, %k0, %k0 +; SKX-NEXT: kshiftrd $26, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $24, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $25, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $8, %k0, %k2 +; SKX-NEXT: kshiftld $8, %k2, %k2 +; SKX-NEXT: kshiftld $25, %k0, %k0 +; SKX-NEXT: kshiftrd $25, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $8, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $23, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $24, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $9, %k0, %k2 +; SKX-NEXT: kshiftld $9, %k2, %k2 +; SKX-NEXT: kshiftld $24, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $22, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $24, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $23, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $10, %k0, %k2 +; SKX-NEXT: kshiftld $10, %k2, %k2 +; SKX-NEXT: kshiftld $23, %k0, %k0 +; SKX-NEXT: kshiftrd $23, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $22, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $11, %k0, %k2 +; SKX-NEXT: kshiftld $11, %k2, %k2 +; SKX-NEXT: kshiftld $22, %k0, %k0 +; SKX-NEXT: kshiftrd $22, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $21, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $12, %k0, %k2 +; SKX-NEXT: kshiftld $12, %k2, %k2 +; SKX-NEXT: kshiftld $21, %k0, %k0 +; SKX-NEXT: kshiftrd $21, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $11, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $20, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 -; SKX-NEXT: kshiftrd $12, %k0, 
%k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $19, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $13, %k0, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kshiftld $13, %k2, %k2 +; SKX-NEXT: kshiftld $20, %k0, %k0 +; SKX-NEXT: kshiftrd $20, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $18, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $19, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $14, %k0, %k2 +; SKX-NEXT: kshiftld $14, %k2, %k2 +; SKX-NEXT: kshiftld $19, %k0, %k0 +; SKX-NEXT: kshiftrd $19, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kshiftrd $14, %k0, %k3 -; SKX-NEXT: kxord %k2, %k3, %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $17, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $18, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $15, %k0, %k2 +; SKX-NEXT: kshiftld $15, %k2, %k2 +; SKX-NEXT: kshiftld $18, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 -; SKX-NEXT: kshiftld $31, %k2, %k2 -; SKX-NEXT: kshiftrd $16, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kshiftrd $18, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $17, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k2 +; SKX-NEXT: kshiftld $16, %k2, %k2 +; SKX-NEXT: kshiftld $17, %k0, %k0 +; SKX-NEXT: kshiftrd $17, %k0, %k0 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxord %k3, %k2, %k2 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k3, %k2 +; SKX-NEXT: kshiftrd $16, %k2, %k2 +; SKX-NEXT: kord %k0, %k2, %k0 +; SKX-NEXT: kshiftrd $17, %k0, %k2 +; SKX-NEXT: kshiftld $17, %k2, %k2 +; SKX-NEXT: kshiftld $16, %k0, %k0 +; SKX-NEXT: kshiftrd $16, %k0, %k0 +; SKX-NEXT: kord %k2, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftld $31, %k2, %k2 ; SKX-NEXT: kshiftrd $15, %k2, %k2 -; SKX-NEXT: kxord %k2, %k0, %k0 +; SKX-NEXT: kord %k0, %k2, %k0 ; SKX-NEXT: kandd %k1, %k0, %k0 ; SKX-NEXT: kshiftrd $16, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r8d @@ -1113,215 +1259,262 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: pushl %ebx ; KNL_X32-NEXT: pushl %edi ; KNL_X32-NEXT: pushl %esi +; KNL_X32-NEXT: subl $20, %esp ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; 
KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 ; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $3, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $4, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $5, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $6, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $7, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $8, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $9, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k2 +; KNL_X32-NEXT: kmovw %k2, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $10, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k6 +; KNL_X32-NEXT: korw %k1, %k6, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $11, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k5 +; KNL_X32-NEXT: korw %k1, %k5, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 ; KNL_X32-NEXT: 
movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k4 +; KNL_X32-NEXT: korw %k1, %k4, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k1 +; KNL_X32-NEXT: kshiftlw $13, %k1, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 +; KNL_X32-NEXT: korw %k0, %k1, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: kmovw %k0, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $8, %k0, %k2 +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k2 ## 2-byte Reload +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $9, %k0, %k2 +; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $10, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $11, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, 
%k0 -; KNL_X32-NEXT: kshiftrw $12, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $14, %k0, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $2, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 ; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k1, %k3, %k1 -; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; 
KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $8, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $8, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $7, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $9, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $9, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $9, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $6, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $10, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $8, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $8, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $8, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $5, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $11, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $9, %k2, %k2 +; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k7 ## 2-byte Reload +; KNL_X32-NEXT: korw %k2, %k7, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $4, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $12, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k6, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $3, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $13, %k1, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k5, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k2 +; KNL_X32-NEXT: kmovw 
%eax, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k4, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 -; KNL_X32-NEXT: kshiftlw $1, %k1, %k1 -; KNL_X32-NEXT: kshiftrw $1, %k1, %k1 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k2, %k3, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k2, %k2 ; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: xorl %eax, %eax ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: movl $65535, %ecx ## imm = 0xFFFF ; KNL_X32-NEXT: movl $0, %edx ; KNL_X32-NEXT: cmovnel %ecx, %edx +; KNL_X32-NEXT: kshiftlw $2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $2, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $1, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %bl +; KNL_X32-NEXT: kmovw %ebx, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: kmovw %edx, %k1 ; KNL_X32-NEXT: testb $1, {{[0-9]+}}(%esp) ; KNL_X32-NEXT: cmovnel %ecx, %eax -; KNL_X32-NEXT: kandw %k0, %k1, %k0 -; KNL_X32-NEXT: kmovw %edx, %k1 +; KNL_X32-NEXT: kmovw (%esp), %k2 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k2, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kandw %k1, %k2, %k1 ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1403,6 +1596,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: orl %esi, %ecx ; KNL_X32-NEXT: orl %edx, %ecx ; KNL_X32-NEXT: movw %cx, (%eax) +; KNL_X32-NEXT: addl $20, %esp ; KNL_X32-NEXT: popl %esi ; KNL_X32-NEXT: popl %edi ; KNL_X32-NEXT: popl %ebx @@ -1416,356 +1610,550 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL-LABEL: test17: ; KNL: ## %bb.0: ; KNL-NEXT: movq %rdi, %rax -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: 
kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: 
kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $14, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $3, %k3, %k3 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $4, %k3, %k3 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $5, %k3, %k3 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw 
$10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k3 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 +; KNL-NEXT: kshiftlw $6, %k3, %k3 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 +; KNL-NEXT: kshiftlw $15, %k3, %k3 +; KNL-NEXT: kshiftrw $10, %k3, %k3 +; KNL-NEXT: korw %k0, %k3, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k3 +; KNL-NEXT: kshiftlw $7, %k3, %k3 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: kshiftlw $15, %k3, %k3 ; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k0, %k0 -; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k4 -; KNL-NEXT: kxorw %k1, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: korw %k0, %k3, %k3 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $14, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $3, %k4, %k4 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $4, %k4, %k4 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $5, %k4, %k4 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, 
%k4 +; KNL-NEXT: kshiftlw $15, %k4, %k4 +; KNL-NEXT: kshiftrw $10, %k4, %k4 +; KNL-NEXT: korw %k0, %k4, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: kshiftlw $7, %k4, %k4 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k4 ; KNL-NEXT: kshiftlw $15, %k4, %k4 ; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k5 -; KNL-NEXT: kxorw %k1, %k5, %k5 -; KNL-NEXT: kshiftrw $2, %k5, %k6 -; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: korw %k0, %k4, %k4 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $14, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $3, %k5, %k5 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $12, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $4, %k5, %k5 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $11, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $5, %k5, %k5 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 -; KNL-NEXT: kshiftrw $10, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k6, %k5, %k5 +; KNL-NEXT: kshiftlw $6, %k5, %k5 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k0, %k5, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k5 +; KNL-NEXT: kshiftlw $7, %k5, %k5 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k5 ; KNL-NEXT: kshiftlw $15, %k5, %k5 ; KNL-NEXT: kshiftrw $9, %k5, %k5 -; KNL-NEXT: kxorw %k5, %k0, %k5 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, 
%k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k6 -; KNL-NEXT: kxorw %k1, %k6, %k6 -; KNL-NEXT: kshiftrw $2, %k6, %k7 -; KNL-NEXT: kxorw %k0, %k7, %k0 +; KNL-NEXT: korw %k0, %k5, %k5 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $14, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $3, %k6, %k6 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $12, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $4, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $4, %k6, %k6 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $11, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $5, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $5, %k6, %k6 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 -; KNL-NEXT: kshiftrw $10, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 ; KNL-NEXT: kshiftrw $6, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k6, %k6 +; KNL-NEXT: kshiftlw $6, %k6, %k6 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 +; KNL-NEXT: kshiftlw $15, %k6, %k6 +; KNL-NEXT: kshiftrw $10, %k6, %k6 +; KNL-NEXT: korw %k0, %k6, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k6 +; KNL-NEXT: kshiftlw $7, %k6, %k6 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k6 ; KNL-NEXT: kshiftlw $15, %k6, %k6 ; KNL-NEXT: kshiftrw $9, %k6, %k6 -; KNL-NEXT: kxorw %k6, %k0, %k6 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: korw %k0, %k6, %k6 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil ; KNL-NEXT: kmovw %edi, %k0 -; KNL-NEXT: kxorw %k1, %k0, %k0 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kshiftrw $2, %k0, %k2 -; KNL-NEXT: kxorw %k7, %k2, %k2 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $14, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k7 +; 
KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $13, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $12, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $11, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $10, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dil +; KNL-NEXT: kmovw %edi, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $9, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k7 +; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %edx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: movb {{[0-9]+}}(%rsp), %dl -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: kmovw %edx, %k7 -; KNL-NEXT: kxorw %k7, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k7 -; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftrw $2, %k0, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw 
$4, %k0, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r8d, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %r9d, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $5, %k0, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $6, %k0, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl +; KNL-NEXT: kmovw %ecx, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $15, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $3, %k1, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kshiftrw $14, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl ; KNL-NEXT: kmovw %ecx, %k2 -; KNL-NEXT: kshiftrw $2, %k1, %k3 -; KNL-NEXT: kxorw %k2, %k3, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $4, %k1, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftrw $13, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $5, %k1, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftrw $12, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: 
kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftrw $11, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftrw $10, %k1, %k1 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %cl -; KNL-NEXT: kshiftrw $6, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 +; KNL-NEXT: kmovw %ecx, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kandw %k7, %k0, %k0 ; KNL-NEXT: kandw %k6, %k0, %k0 ; KNL-NEXT: kandw %k5, %k0, %k0 ; KNL-NEXT: kandw %k4, %k0, %k0 -; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload -; KNL-NEXT: kandw %k1, %k0, %k0 +; KNL-NEXT: kandw %k3, %k0, %k0 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload ; KNL-NEXT: kandw %k1, %k0, %k0 ; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload @@ -1808,300 +2196,488 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; SKX-LABEL: test17: ; SKX: ## %bb.0: ; SKX-NEXT: movq %rdi, %rax -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k1 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k0 ; SKX-NEXT: kshiftlb $7, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k0 -; SKX-NEXT: kxorb %k0, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k3 -; SKX-NEXT: kxorb %k1, %k3, %k1 -; SKX-NEXT: kshiftlb $7, %k1, %k1 -; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kxorb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $7, %k0, %k1 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: korb %k0, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $3, %k1, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kshiftlb $6, %k1, %k1 +; SKX-NEXT: kshiftrb $6, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $4, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $4, %k1, %k2 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftlb $5, %k1, %k1 +; SKX-NEXT: kshiftrb $5, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $3, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftlb $7, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k3, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kshiftrb $6, %k1, %k2 +; SKX-NEXT: kshiftlb $6, 
%k2, %k2 +; SKX-NEXT: kshiftlb $3, %k1, %k1 +; SKX-NEXT: kshiftrb $3, %k1, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kshiftlb $7, %k3, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: korb %k1, %k2, %k1 +; SKX-NEXT: kshiftrb $7, %k1, %k2 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftlb $2, %k1, %k1 +; SKX-NEXT: kshiftrb $2, %k1, %k1 +; SKX-NEXT: korb %k2, %k1, %k1 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 ; SKX-NEXT: kshiftlb $7, %k2, %k2 ; SKX-NEXT: kshiftrb $1, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: korb %k1, %k2, %k1 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k2 -; SKX-NEXT: kxorb %k0, %k2, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k2, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k2 +; SKX-NEXT: korb %k0, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k3 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $4, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $4, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $3, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $5, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $2, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $6, %k2, %k3 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftlb $2, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 ; SKX-NEXT: kshiftrb $1, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kandb %k1, %k2, %k1 -; SKX-NEXT: kxorb %k0, %k4, %k2 -; SKX-NEXT: kshiftrb $2, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftrb $7, %k2, %k2 +; 
SKX-NEXT: korb %k0, %k2, %k2 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kshiftrb $3, %k2, %k3 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftlb $6, %k2, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k3 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $4, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $5, %k2, %k3 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftlb $4, %k2, %k2 +; SKX-NEXT: kshiftrb $4, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kshiftrb $4, %k2, %k4 -; SKX-NEXT: kxorb %k3, %k4, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $3, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftrb $5, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $6, %k2, %k3 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftlb $3, %k2, %k2 +; SKX-NEXT: kshiftrb $3, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $2, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 -; SKX-NEXT: kshiftrb $6, %k2, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k2, %k3, %k2 +; SKX-NEXT: kshiftrb $7, %k2, %k3 +; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: kshiftlb $2, %k2, %k2 +; SKX-NEXT: kshiftrb $2, %k2, %k2 +; SKX-NEXT: korb %k3, %k2, %k2 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 ; SKX-NEXT: kshiftrb $1, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k2, %k2 +; SKX-NEXT: korb %k2, %k3, %k2 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k3, %k5, %k3 ; SKX-NEXT: kshiftlb $7, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k3 +; SKX-NEXT: korb %k0, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $3, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 -; SKX-NEXT: kshiftrb $4, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k4 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 ; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftlb $5, %k3, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: 
kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k4 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $2, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftlb $2, %k3, %k3 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $6, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftrb $1, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 +; SKX-NEXT: korb %k3, %k4, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 -; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kshiftrb $5, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: 
kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kandb %k3, %k4, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kandb %k2, %k3, %k2 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k3, %k5, %k3 -; SKX-NEXT: kshiftlb $7, %k3, %k3 +; SKX-NEXT: korb %k0, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k4 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftlb $6, %k3, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k4 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftlb $5, %k3, %k3 ; SKX-NEXT: kshiftrb $5, %k3, %k3 -; SKX-NEXT: kxorb %k3, %k4, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $3, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $4, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $5, %k3, %k4 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k3, %k3 +; SKX-NEXT: kshiftrb $4, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kshiftrb $5, %k3, %k4 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $6, %k3, %k4 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftlb $3, %k3, %k3 +; SKX-NEXT: kshiftrb $3, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 ; SKX-NEXT: kshiftrb $2, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kshiftrb $6, %k3, %k5 -; SKX-NEXT: kxorb %k4, %k5, %k4 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftrb $7, %k3, %k4 ; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftlb $2, %k3, %k3 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $2, %k3, %k3 +; SKX-NEXT: korb %k4, %k3, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftrb $1, %k4, %k4 -; SKX-NEXT: kxorb %k4, %k3, %k3 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 -; SKX-NEXT: kxorb %k0, %k4, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 +; SKX-NEXT: korb %k3, %k4, %k3 +; SKX-NEXT: kshiftlb $7, %k5, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k5, %k4, %k4 +; 
SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $4, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 -; SKX-NEXT: kshiftrb $5, %k4, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k7 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kandb %k3, %k4, %k3 -; SKX-NEXT: kxorb %k0, %k7, %k4 -; SKX-NEXT: kshiftrb $2, %k4, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k4 +; SKX-NEXT: kshiftlb $7, %k4, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k4 +; SKX-NEXT: korb %k0, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $3, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k4, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftrb $5, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $4, %k4, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 -; SKX-NEXT: kshiftlb $7, %k5, %k5 -; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 +; SKX-NEXT: kshiftrb $5, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, %k6, %k5 +; SKX-NEXT: kshiftrb $4, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 ; SKX-NEXT: kshiftrb $5, %k4, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k4, %k4 +; SKX-NEXT: kshiftrb $4, %k4, %k4 ; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kshiftlb $7, 
%k6, %k5 +; SKX-NEXT: kshiftrb $3, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $6, %k4, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k4, %k4 +; SKX-NEXT: kshiftrb $3, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 -; SKX-NEXT: kshiftrb $6, %k4, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kshiftrb $7, %k4, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k4, %k4 +; SKX-NEXT: kshiftrb $2, %k4, %k4 +; SKX-NEXT: korb %k5, %k4, %k4 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k4, %k4 +; SKX-NEXT: korb %k4, %k5, %k4 +; SKX-NEXT: kmovd %esi, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $7, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kmovd %edx, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftrb $6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k5 +; SKX-NEXT: kshiftlb $3, %k5, %k5 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 ; SKX-NEXT: kmovd %ecx, %k5 -; SKX-NEXT: kmovd %esi, %k6 -; SKX-NEXT: kxorb %k0, %k6, %k0 -; SKX-NEXT: kshiftrb $2, %k0, %k6 -; SKX-NEXT: kxorb %k5, %k6, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $5, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $3, %k0, %k5 -; SKX-NEXT: kmovd %r8d, %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k5 +; SKX-NEXT: kshiftlb $4, %k5, %k5 +; SKX-NEXT: kshiftlb $5, %k0, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovd %r8d, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $4, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $4, %k0, %k5 -; SKX-NEXT: kmovd %r9d, %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $5, %k0, %k5 +; SKX-NEXT: kshiftlb $5, %k5, %k5 +; SKX-NEXT: kshiftlb $4, %k0, %k0 +; SKX-NEXT: kshiftrb $4, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovd %r9d, %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $3, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $5, %k0, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k5 +; SKX-NEXT: kshiftlb $6, %k5, %k5 +; SKX-NEXT: kshiftlb $3, %k0, %k0 +; SKX-NEXT: kshiftrb $3, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $2, %k5, %k5 -; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k6 -; SKX-NEXT: kxorb %k5, %k0, %k0 -; SKX-NEXT: kshiftrb $6, %k0, %k5 -; SKX-NEXT: kxorb %k6, %k5, %k5 +; SKX-NEXT: korb %k0, %k5, %k0 +; SKX-NEXT: kshiftrb $7, %k0, %k5 +; SKX-NEXT: kshiftlb $7, %k5, %k5 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 +; SKX-NEXT: korb %k5, %k0, %k0 +; SKX-NEXT: kmovb {{[0-9]+}}(%rsp), %k5 ; SKX-NEXT: kshiftlb $7, %k5, %k5 ; SKX-NEXT: kshiftrb $1, %k5, %k5 -; SKX-NEXT: kxorb %k5, %k0, %k0 +; SKX-NEXT: korb %k0, %k5, %k0 ; SKX-NEXT: kandb %k4, %k0, %k0 ; SKX-NEXT: kandb %k3, %k0, %k0 ; SKX-NEXT: kandb %k2, %k0, %k0 @@ -2144,362 +2720,557 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> 
%c, <7 x i1> %d, <7 x ; KNL_X32-LABEL: test17: ; KNL_X32: ## %bb.0: ; KNL_X32-NEXT: pushl %ebx -; KNL_X32-NEXT: subl $8, %esp +; KNL_X32-NEXT: pushl %eax ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $2, %k0, %k1 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k1 -; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: 
kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kxorw %k1, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k2, %k3 -; KNL_X32-NEXT: kxorw %k0, %k3, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kmovw %k0, (%esp) ## 2-byte Spill ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k1, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $2, %k3, %k4 -; KNL_X32-NEXT: kxorw %k0, %k4, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw 
%k0, %k3, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $14, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $3, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $4, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $5, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 -; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $6, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k4, %k3, %k3 +; KNL_X32-NEXT: kmovw %eax, %k3 +; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 +; KNL_X32-NEXT: kshiftrw $10, %k3, %k3 +; KNL_X32-NEXT: korw %k0, %k3, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k3 +; KNL_X32-NEXT: kshiftlw $7, %k3, %k3 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k3, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k3 ; KNL_X32-NEXT: kshiftlw $15, %k3, %k3 ; KNL_X32-NEXT: kshiftrw $9, %k3, %k3 -; KNL_X32-NEXT: kxorw %k3, %k0, %k0 -; KNL_X32-NEXT: kmovw %k0, {{[-0-9]+}}(%e{{[sb]}}p) ## 2-byte Spill +; KNL_X32-NEXT: korw %k0, %k3, %k3 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k4 -; KNL_X32-NEXT: kxorw %k1, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $2, %k4, %k5 -; KNL_X32-NEXT: kxorw %k0, %k5, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $14, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $3, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; 
KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $4, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $5, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 -; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $6, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k5, %k4, %k4 +; KNL_X32-NEXT: kmovw %eax, %k4 +; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 +; KNL_X32-NEXT: kshiftrw $10, %k4, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k4 +; KNL_X32-NEXT: kshiftlw $7, %k4, %k4 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k4, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k4 ; KNL_X32-NEXT: kshiftlw $15, %k4, %k4 ; KNL_X32-NEXT: kshiftrw $9, %k4, %k4 -; KNL_X32-NEXT: kxorw %k4, %k0, %k4 +; KNL_X32-NEXT: korw %k0, %k4, %k4 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k5 -; KNL_X32-NEXT: kxorw %k1, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $2, %k5, %k6 -; KNL_X32-NEXT: kxorw %k0, %k6, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $14, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $3, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $4, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb 
{{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $11, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $5, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 -; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $6, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k6, %k5, %k5 +; KNL_X32-NEXT: kmovw %eax, %k5 +; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 +; KNL_X32-NEXT: kshiftrw $10, %k5, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k5 +; KNL_X32-NEXT: kshiftlw $7, %k5, %k5 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k5, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k5 ; KNL_X32-NEXT: kshiftlw $15, %k5, %k5 ; KNL_X32-NEXT: kshiftrw $9, %k5, %k5 -; KNL_X32-NEXT: kxorw %k5, %k0, %k5 +; KNL_X32-NEXT: korw %k0, %k5, %k5 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k6 -; KNL_X32-NEXT: kxorw %k1, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $2, %k6, %k7 -; KNL_X32-NEXT: kxorw %k0, %k7, %k0 -; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 -; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 -; KNL_X32-NEXT: kxorw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $14, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $3, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $3, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $4, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $4, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $5, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $5, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 
+; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 -; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 ; KNL_X32-NEXT: kshiftrw $6, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $6, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k6, %k6 +; KNL_X32-NEXT: kmovw %eax, %k6 +; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 +; KNL_X32-NEXT: kshiftrw $10, %k6, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k6 +; KNL_X32-NEXT: kshiftlw $7, %k6, %k6 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k6, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k6 ; KNL_X32-NEXT: kshiftlw $15, %k6, %k6 ; KNL_X32-NEXT: kshiftrw $9, %k6, %k6 -; KNL_X32-NEXT: kxorw %k6, %k0, %k6 +; KNL_X32-NEXT: korw %k0, %k6, %k6 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kxorw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kshiftrw $2, %k0, %k2 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $14, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $3, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $13, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $4, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $12, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $5, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 -; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, 
%k0 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $11, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $6, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k7 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $10, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k7 +; KNL_X32-NEXT: kshiftlw $7, %k7, %k7 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k7, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 ; KNL_X32-NEXT: kmovw %eax, %k7 -; KNL_X32-NEXT: kxorw %k7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k7, %k7 +; KNL_X32-NEXT: kshiftrw $9, %k7, %k7 +; KNL_X32-NEXT: korw %k0, %k7, %k7 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k0 +; KNL_X32-NEXT: kshiftlw $15, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $15, %k0, %k0 +; KNL_X32-NEXT: korw %k1, %k0, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 -; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k7 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $14, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %cl -; KNL_X32-NEXT: kmovw %ecx, %k0 -; KNL_X32-NEXT: kxorw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k0, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $13, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $12, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $11, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 
-; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: kshiftrw $7, %k0, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k0, %k0 +; KNL_X32-NEXT: kshiftrw $10, %k0, %k0 +; KNL_X32-NEXT: korw %k2, %k0, %k0 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k0, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k0, %k0 +; KNL_X32-NEXT: korw %k0, %k2, %k0 +; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al +; KNL_X32-NEXT: kmovw %eax, %k2 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $15, %k2, %k2 +; KNL_X32-NEXT: korw %k1, %k2, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kxorw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 +; KNL_X32-NEXT: kshiftrw $14, %k2, %k2 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $3, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $14, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $14, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al ; KNL_X32-NEXT: kmovw %eax, %k2 -; KNL_X32-NEXT: kshiftrw $2, %k1, %k3 -; KNL_X32-NEXT: kxorw %k2, %k3, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $13, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $4, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $13, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $13, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $3, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $12, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $5, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $12, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $12, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $4, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $11, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $6, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $11, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $11, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $5, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw $10, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 +; KNL_X32-NEXT: kshiftrw $7, %k1, %k2 +; KNL_X32-NEXT: kshiftlw $7, %k2, %k2 +; KNL_X32-NEXT: kshiftlw $10, %k1, %k1 +; KNL_X32-NEXT: kshiftrw $10, %k1, %k1 +; KNL_X32-NEXT: korw %k2, %k1, %k1 ; KNL_X32-NEXT: movb {{[0-9]+}}(%esp), %al -; KNL_X32-NEXT: kshiftrw $6, %k1, %k2 -; KNL_X32-NEXT: kmovw %eax, %k3 -; KNL_X32-NEXT: kxorw %k3, %k2, %k2 +; KNL_X32-NEXT: kmovw %eax, %k2 ; KNL_X32-NEXT: kshiftlw $15, %k2, %k2 ; KNL_X32-NEXT: kshiftrw 
$9, %k2, %k2 -; KNL_X32-NEXT: kxorw %k2, %k1, %k1 +; KNL_X32-NEXT: korw %k1, %k2, %k1 ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: kandw %k7, %k0, %k0 ; KNL_X32-NEXT: kandw %k6, %k0, %k0 ; KNL_X32-NEXT: kandw %k5, %k0, %k0 ; KNL_X32-NEXT: kandw %k4, %k0, %k0 -; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload -; KNL_X32-NEXT: kandw %k1, %k0, %k0 -; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload +; KNL_X32-NEXT: kandw %k3, %k0, %k0 +; KNL_X32-NEXT: kmovw (%esp), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 ; KNL_X32-NEXT: kmovw {{[-0-9]+}}(%e{{[sb]}}p), %k1 ## 2-byte Reload ; KNL_X32-NEXT: kandw %k1, %k0, %k0 @@ -2537,7 +3308,7 @@ define <7 x i1> @test17(<7 x i1> %a, <7 x i1> %b, <7 x i1> %c, <7 x i1> %d, <7 x ; KNL_X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; KNL_X32-NEXT: andb $127, %cl ; KNL_X32-NEXT: movb %cl, (%eax) -; KNL_X32-NEXT: addl $8, %esp +; KNL_X32-NEXT: addl $4, %esp ; KNL_X32-NEXT: popl %ebx ; KNL_X32-NEXT: retl $4 %j = and <7 x i1> %a, %b diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll index 20af81948256..fcb07a504067 100644 --- a/test/CodeGen/X86/avx512-ext.ll +++ b/test/CodeGen/X86/avx512-ext.ll @@ -1886,410 +1886,495 @@ define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) { define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; KNL-LABEL: test21: ; KNL: # %bb.0: -; KNL-NEXT: kmovw %edx, %k1 -; KNL-NEXT: kmovw %edi, %k2 +; KNL-NEXT: kmovw %edi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: kmovw %esi, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 ; KNL-NEXT: kshiftrw $14, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k2 -; KNL-NEXT: kmovw %ecx, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k2 +; KNL-NEXT: kshiftlw $3, %k0, %k3 +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k4 +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k5 +; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k6 +; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 
+; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k7 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: 
kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k0, %k2, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 -; KNL-NEXT: kshiftrw $1, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k2, %k4, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, 
%k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw 
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $3, %k0, %k0
+; KNL-NEXT: kshiftrw $3, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftlw $15, %k3, %k3
-; KNL-NEXT: kshiftrw $4, %k3, %k3
-; KNL-NEXT: kxorw %k3, %k2, %k2
-; KNL-NEXT: kshiftrw $12, %k2, %k3
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $13, %k1, %k1
+; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT: korw %k1, %k2, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $2, %k0, %k0
+; KNL-NEXT: kshiftrw $2, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftlw $15, %k3, %k3
-; KNL-NEXT: kshiftrw $3, %k3, %k3
-; KNL-NEXT: kxorw %k3, %k2, %k2
-; KNL-NEXT: kshiftrw $13, %k2, %k3
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $14, %k1, %k1
+; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT: korw %k1, %k2, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $1, %k0, %k0
+; KNL-NEXT: kshiftrw $1, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftlw $15, %k3, %k3
-; KNL-NEXT: kshiftrw $2, %k3, %k3
-; KNL-NEXT: kxorw %k3, %k2, %k2
-; KNL-NEXT: kshiftrw $14, %k2, %k3
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $15, %k1, %k1
+; KNL-NEXT: korw %k1, %k0, %k1
+; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftlw $14, %k3, %k3
-; KNL-NEXT: kxorw %k3, %k2, %k2
-; KNL-NEXT: kshiftlw $1, %k2, %k2
-; KNL-NEXT: kshiftrw $1, %k2, %k2
+; KNL-NEXT: kmovw %eax, %k0
+; KNL-NEXT: kshiftlw $15, %k0, %k0
+; KNL-NEXT: kshiftrw $15, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k3
-; KNL-NEXT: kshiftlw $15, %k3, %k3
-; KNL-NEXT: korw %k3, %k2, %k2
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $1, %k1, %k1
+; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT: korw %k1, %k2, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $14, %k0, %k0
+; KNL-NEXT: kshiftrw $14, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k3
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $2, %k1, %k1
+; KNL-NEXT: korw %k1, %k3, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $13, %k0, %k0
+; KNL-NEXT: kshiftrw $13, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k4
-; KNL-NEXT: kxorw %k0, %k4, %k4
-; KNL-NEXT: kshiftrw $2, %k4, %k5
-; KNL-NEXT: kxorw %k3, %k5, %k3
-; KNL-NEXT: kshiftlw $15, %k3, %k3
-; KNL-NEXT: kshiftrw $13, %k3, %k3
-; KNL-NEXT: kxorw %k3, %k4, %k3
-; KNL-NEXT: kshiftrw $3, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $3, %k1, %k1
+; KNL-NEXT: korw %k1, %k4, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $12, %k0, %k0
+; KNL-NEXT: kshiftrw $12, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $12, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftrw $4, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $4, %k1, %k1
+; KNL-NEXT: korw %k1, %k5, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $11, %k0, %k0
+; KNL-NEXT: kshiftrw $11, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $11, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftrw $5, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $5, %k1, %k1
+; KNL-NEXT: korw %k1, %k6, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $10, %k0, %k0
+; KNL-NEXT: kshiftrw $10, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $10, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftrw $6, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $6, %k1, %k1
+; KNL-NEXT: korw %k1, %k7, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $9, %k0, %k0
+; KNL-NEXT: kshiftrw $9, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $9, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftrw $7, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $7, %k1, %k1
+; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload
+; KNL-NEXT: korw %k1, %k2, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $8, %k0, %k0
+; KNL-NEXT: kshiftrw $8, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $8, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftrw $8, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $8, %k1, %k1
+; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload
+; KNL-NEXT: korw %k1, %k3, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $7, %k0, %k0
+; KNL-NEXT: kshiftrw $7, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $7, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftrw $9, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $9, %k1, %k1
+; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; KNL-NEXT: korw %k1, %k4, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $6, %k0, %k0
+; KNL-NEXT: kshiftrw $6, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $6, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftrw $10, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $10, %k1, %k1
+; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; KNL-NEXT: korw %k1, %k5, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $5, %k0, %k0
+; KNL-NEXT: kshiftrw $5, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $5, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: kshiftrw $11, %k3, %k4
+; KNL-NEXT: kmovw %eax, %k1
+; KNL-NEXT: kshiftlw $11, %k1, %k1
+; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; KNL-NEXT: korw %k1, %k7, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $4, %k0, %k0
+; KNL-NEXT: kshiftrw $4, %k0, %k0
 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al
-; KNL-NEXT: kmovw %eax, %k5
-; KNL-NEXT: kxorw %k5, %k4, %k4
-; KNL-NEXT: kshiftlw $15, %k4, %k4
-; KNL-NEXT: kshiftrw $4, %k4, %k4
-; KNL-NEXT: kxorw %k4, %k3, %k3
-; KNL-NEXT: 
kshiftrw $12, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k1 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $15, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $15, %k7, %k7 +; KNL-NEXT: korw %k0, %k7, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k0, %k5, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k5 -; KNL-NEXT: kxorw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $2, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; 
KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: korw %k7, %k6, %k7 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: korw %k7, %k2, %k7 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $8, %k7, %k7 +; KNL-NEXT: korw %k7, %k3, %k7 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $9, %k7, %k7 +; KNL-NEXT: korw %k7, %k4, %k7 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 +; KNL-NEXT: korw %k7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k5, %k6 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: korw %k6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k6 +; KNL-NEXT: kshiftlw $11, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k6, %k2, %k5 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 +; KNL-NEXT: korw %k5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: 
kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k4 +; KNL-NEXT: kshiftlw $12, %k5, %k5 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k5, %k2, %k4 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 +; KNL-NEXT: korw %k4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k4 +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $13, %k4, %k4 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k4, %k2, %k3 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 +; KNL-NEXT: korw %k3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $14, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 +; KNL-NEXT: korw %k2, %k0, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k0, %k4 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k4} {z} +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k2 +; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k1} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm6, %zmm6, %zmm6 {%k1} {z} +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; KNL-NEXT: vpternlogd $255, %zmm7, %zmm7, %zmm7 {%k1} {z} ; KNL-NEXT: vpmovdw %zmm4, %ymm4 ; KNL-NEXT: vpand %ymm1, %ymm4, %ymm1 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k3} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm5, %ymm4 ; KNL-NEXT: vpand %ymm2, %ymm4, %ymm2 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm6, %ymm4 ; KNL-NEXT: vpand %ymm3, %ymm4, %ymm3 -; KNL-NEXT: vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k1} {z} -; KNL-NEXT: vpmovdw %zmm4, %ymm4 +; KNL-NEXT: vpmovdw %zmm7, %ymm4 ; KNL-NEXT: vpand %ymm0, %ymm4, %ymm0 ; KNL-NEXT: retq ; @@ -2304,410 +2389,495 @@ define <64 x i16> @test21(<64 x i16> %x , <64 x i1> %mask) nounwind readnone { ; ; AVX512DQNOBW-LABEL: test21: ; AVX512DQNOBW: # %bb.0: -; AVX512DQNOBW-NEXT: kmovw %edx, %k0 -; AVX512DQNOBW-NEXT: kmovw %edi, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQNOBW-NEXT: kxorw %k1, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQNOBW-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQNOBW-NEXT: kmovw %edi, %k0 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: kmovw %esi, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; 
AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k3 +; AVX512DQNOBW-NEXT: kmovw %edx, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k3, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 -; AVX512DQNOBW-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %ecx, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %r8d, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k2 -; AVX512DQNOBW-NEXT: kmovw %r9d, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k4 +; AVX512DQNOBW-NEXT: kmovw %ecx, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k4, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k5 +; AVX512DQNOBW-NEXT: kmovw %r8d, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k5, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k6 +; AVX512DQNOBW-NEXT: kmovw %r9d, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k6, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k7 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k7, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: 
kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw 
%eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k2 +; AVX512DQNOBW-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 +; AVX512DQNOBW-NEXT: korw %k1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %eax, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQNOBW-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k0, %k1, %k0 +; AVX512DQNOBW-NEXT: korw %k0, %k2, %k0 ; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 ; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kxorw %k1, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQNOBW-NEXT: kxorw %k2, %k4, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQNOBW-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw 
%k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k6, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw 
%eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kxorw %k1, %k4, %k4 -; AVX512DQNOBW-NEXT: 
kshiftrw $2, %k4, %k5 -; AVX512DQNOBW-NEXT: kxorw %k3, %k5, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQNOBW-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k6, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: 
kshiftrw $7, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k3, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k4, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k5, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k0, %k0 +; 
AVX512DQNOBW-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftlw $1, %k3, %k3 -; AVX512DQNOBW-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQNOBW-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k0 +; AVX512DQNOBW-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k1, %k5, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k1, %k5 -; AVX512DQNOBW-NEXT: kxorw %k4, %k5, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k2, %k0, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k7, %k7 +; AVX512DQNOBW-NEXT: kshiftrw $15, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k2, %k7, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k7, %k7 +; AVX512DQNOBW-NEXT: 
kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k6, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k7, %k7 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k7, %k0, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k1, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k3, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $9, %k7, %k7 +; 
AVX512DQNOBW-NEXT: korw %k7, %k4, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k7, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k7 +; AVX512DQNOBW-NEXT: kshiftlw $10, %k7, %k7 +; AVX512DQNOBW-NEXT: korw %k7, %k5, %k6 +; AVX512DQNOBW-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k6, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQNOBW-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQNOBW-NEXT: kmovw %eax, %k6 +; AVX512DQNOBW-NEXT: kshiftlw $11, %k6, %k6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k6, %k1, %k5 +; AVX512DQNOBW-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k5, %k2, %k2 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQNOBW-NEXT: kmovw %eax, %k5 -; AVX512DQNOBW-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQNOBW-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $12, %k5, %k5 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k5, %k1, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k4, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k4 +; AVX512DQNOBW-NEXT: kshiftlw $13, %k4, %k4 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k4, %k1, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k3, %k2, %k2 +; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQNOBW-NEXT: kmovw %eax, %k3 +; AVX512DQNOBW-NEXT: kshiftlw $14, %k3, %k3 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQNOBW-NEXT: korw %k3, %k1, %k1 +; AVX512DQNOBW-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k1, %k2, %k1 ; AVX512DQNOBW-NEXT: kshiftlw $1, %k1, %k1 ; AVX512DQNOBW-NEXT: kshiftrw $1, %k1, %k1 ; AVX512DQNOBW-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQNOBW-NEXT: kmovw %eax, %k4 -; AVX512DQNOBW-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQNOBW-NEXT: korw %k4, %k1, %k1 +; AVX512DQNOBW-NEXT: kmovw %eax, %k2 +; AVX512DQNOBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQNOBW-NEXT: korw %k2, %k1, %k1 ; AVX512DQNOBW-NEXT: vpmovm2d %k1, %zmm4 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm5 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm6 +; AVX512DQNOBW-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 # 2-byte Reload +; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm7 ; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm1, %ymm4, %ymm1 -; AVX512DQNOBW-NEXT: vpmovm2d %k3, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 
+; AVX512DQNOBW-NEXT: vpmovdw %zmm5, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm2, %ymm4, %ymm2 -; AVX512DQNOBW-NEXT: vpmovm2d %k2, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm6, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm3, %ymm4, %ymm3 -; AVX512DQNOBW-NEXT: vpmovm2d %k0, %zmm4 -; AVX512DQNOBW-NEXT: vpmovdw %zmm4, %ymm4 +; AVX512DQNOBW-NEXT: vpmovdw %zmm7, %ymm4 ; AVX512DQNOBW-NEXT: vpand %ymm0, %ymm4, %ymm0 ; AVX512DQNOBW-NEXT: retq %ret = select <64 x i1> %mask, <64 x i16> %x, <64 x i16> zeroinitializer diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll index d37220222ce7..6e36bd1bb0eb 100644 --- a/test/CodeGen/X86/avx512-insert-extract.ll +++ b/test/CodeGen/X86/avx512-insert-extract.ll @@ -302,12 +302,15 @@ define i16 @test16(i1 *%addr, i16 %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movb (%rdi), %al ; KNL-NEXT: kmovw %esi, %k0 -; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $10, %k0, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $ax killed $ax killed $eax ; KNL-NEXT: retq @@ -316,11 +319,14 @@ define i16 @test16(i1 *%addr, i16 %a) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrw $10, %k1, %k2 -; SKX-NEXT: kxorw %k0, %k2, %k0 +; SKX-NEXT: kshiftrw $11, %k1, %k2 +; SKX-NEXT: kshiftlw $11, %k2, %k2 +; SKX-NEXT: kshiftlw $6, %k1, %k1 +; SKX-NEXT: kshiftrw $6, %k1, %k1 ; SKX-NEXT: kshiftlw $15, %k0, %k0 ; SKX-NEXT: kshiftrw $5, %k0, %k0 -; SKX-NEXT: kxorw %k0, %k1, %k0 +; SKX-NEXT: korw %k0, %k2, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $ax killed $ax killed $eax ; SKX-NEXT: retq @@ -336,12 +342,15 @@ define i8 @test17(i1 *%addr, i8 %a) { ; KNL: ## %bb.0: ; KNL-NEXT: movb (%rdi), %al ; KNL-NEXT: kmovw %esi, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k1 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kxorw %k1, %k2, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $al killed $al killed $eax ; KNL-NEXT: retq @@ -350,11 +359,14 @@ define i8 @test17(i1 *%addr, i8 %a) { ; SKX: ## %bb.0: ; SKX-NEXT: kmovb (%rdi), %k0 ; SKX-NEXT: kmovd %esi, %k1 -; SKX-NEXT: kshiftrb $4, %k1, %k2 -; SKX-NEXT: kxorb %k0, %k2, %k0 +; SKX-NEXT: kshiftrb $5, %k1, %k2 +; SKX-NEXT: kshiftlb $5, %k2, %k2 +; SKX-NEXT: kshiftlb $4, %k1, %k1 +; SKX-NEXT: kshiftrb $4, %k1, %k1 ; SKX-NEXT: kshiftlb $7, %k0, %k0 ; SKX-NEXT: kshiftrb $3, %k0, %k0 -; SKX-NEXT: kxorb %k0, %k1, %k0 +; SKX-NEXT: korb %k0, %k2, %k0 +; SKX-NEXT: korb %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $al killed $al killed $eax ; SKX-NEXT: retq @@ -790,12 +802,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; KNL-NEXT: kmovw %k0, %ecx ; KNL-NEXT: shll $16, %ecx ; KNL-NEXT: vpcmpltud %zmm2, 
%zmm0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k1 +; KNL-NEXT: kshiftrw $5, %k0, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 -; KNL-NEXT: kshiftlw $15, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: orl %ecx, %eax ; KNL-NEXT: vzeroupper @@ -808,12 +823,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32> ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1 ; SKX-NEXT: kunpckwd %k0, %k1, %k0 -; SKX-NEXT: kshiftrd $4, %k0, %k1 +; SKX-NEXT: kshiftrd $5, %k0, %k1 +; SKX-NEXT: kshiftld $5, %k1, %k1 +; SKX-NEXT: kshiftld $28, %k0, %k0 +; SKX-NEXT: kshiftrd $28, %k0, %k0 ; SKX-NEXT: kmovd %eax, %k2 -; SKX-NEXT: kxord %k2, %k1, %k1 -; SKX-NEXT: kshiftld $31, %k1, %k1 -; SKX-NEXT: kshiftrd $27, %k1, %k1 -; SKX-NEXT: kxord %k1, %k0, %k0 +; SKX-NEXT: kshiftld $31, %k2, %k2 +; SKX-NEXT: kshiftrd $27, %k2, %k2 +; SKX-NEXT: kord %k2, %k1, %k1 +; SKX-NEXT: kord %k1, %k0, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: vzeroupper ; SKX-NEXT: retq @@ -832,12 +850,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; KNL-NEXT: cmpl %esi, %edi ; KNL-NEXT: setb %al ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0 -; KNL-NEXT: kshiftrw $2, %k0, %k1 -; KNL-NEXT: kmovw %eax, %k2 -; KNL-NEXT: kxorw %k2, %k1, %k1 +; KNL-NEXT: kshiftrw $3, %k0, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 ; KNL-NEXT: kshiftlw $15, %k1, %k1 ; KNL-NEXT: kshiftrw $13, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k0, %k1, %k0 ; KNL-NEXT: kmovw %k0, %eax ; KNL-NEXT: ## kill: def $al killed $al killed $eax ; KNL-NEXT: vzeroupper @@ -848,12 +869,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y) ; SKX-NEXT: cmpl %esi, %edi ; SKX-NEXT: setb %al ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0 -; SKX-NEXT: kshiftrb $2, %k0, %k1 -; SKX-NEXT: kmovd %eax, %k2 -; SKX-NEXT: kxorb %k2, %k1, %k1 +; SKX-NEXT: kshiftrb $3, %k0, %k1 +; SKX-NEXT: kshiftlb $3, %k1, %k1 +; SKX-NEXT: kshiftlb $6, %k0, %k0 +; SKX-NEXT: kshiftrb $6, %k0, %k0 +; SKX-NEXT: korw %k1, %k0, %k0 +; SKX-NEXT: kmovd %eax, %k1 ; SKX-NEXT: kshiftlb $7, %k1, %k1 ; SKX-NEXT: kshiftrb $5, %k1, %k1 -; SKX-NEXT: kxorw %k1, %k0, %k0 +; SKX-NEXT: korw %k0, %k1, %k0 ; SKX-NEXT: kmovd %k0, %eax ; SKX-NEXT: ## kill: def $al killed $al killed $eax ; SKX-NEXT: retq diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll index ea9742a57621..19f9a2a2bd54 100644 --- a/test/CodeGen/X86/avx512-mask-op.ll +++ b/test/CodeGen/X86/avx512-mask-op.ll @@ -1069,12 +1069,16 @@ define <64 x i8> @test16(i64 %x) { ; KNL-NEXT: kmovw %ecx, %k1 ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kmovw %edi, %k3 -; KNL-NEXT: kshiftrw $5, %k0, %k4 -; KNL-NEXT: kxnorw %k0, %k0, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: movb $1, %al +; KNL-NEXT: kmovw %eax, %k5 +; KNL-NEXT: 
kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k5, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -1091,24 +1095,32 @@ define <64 x i8> @test16(i64 %x) { ; SKX-LABEL: test16: ; SKX: ## %bb.0: ; SKX-NEXT: kmovq %rdi, %k0 -; SKX-NEXT: kxnorw %k0, %k0, %k1 -; SKX-NEXT: kshiftrq $5, %k0, %k2 -; SKX-NEXT: kxorq %k1, %k2, %k1 -; SKX-NEXT: kshiftlq $63, %k1, %k1 -; SKX-NEXT: kshiftrq $58, %k1, %k1 -; SKX-NEXT: kxorq %k1, %k0, %k0 +; SKX-NEXT: kshiftrq $6, %k0, %k1 +; SKX-NEXT: kshiftlq $6, %k1, %k1 +; SKX-NEXT: kshiftlq $59, %k0, %k0 +; SKX-NEXT: kshiftrq $59, %k0, %k0 +; SKX-NEXT: movb $1, %al +; SKX-NEXT: kmovd %eax, %k2 +; SKX-NEXT: kshiftlq $63, %k2, %k2 +; SKX-NEXT: kshiftrq $58, %k2, %k2 +; SKX-NEXT: korq %k2, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq ; ; AVX512BW-LABEL: test16: ; AVX512BW: ## %bb.0: ; AVX512BW-NEXT: kmovq %rdi, %k0 -; AVX512BW-NEXT: kxnorw %k0, %k0, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kxorq %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 -; AVX512BW-NEXT: kxorq %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kshiftlq $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlq $59, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $59, %k0, %k0 +; AVX512BW-NEXT: movb $1, %al +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kshiftlq $63, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k2, %k2 +; AVX512BW-NEXT: korq %k2, %k1, %k1 +; AVX512BW-NEXT: korq %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1116,27 +1128,31 @@ define <64 x i8> @test16(i64 %x) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: movl %edi, %ecx -; AVX512DQ-NEXT: kmovw %edi, %k0 +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: shrq $32, %rdi ; AVX512DQ-NEXT: shrq $48, %rax ; AVX512DQ-NEXT: shrl $16, %ecx -; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4 -; AVX512DQ-NEXT: kxnorw %k0, %k0, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $6, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 +; AVX512DQ-NEXT: movb $1, %al +; AVX512DQ-NEXT: kmovw %eax, %k5 +; AVX512DQ-NEXT: kshiftlw $15, %k5, %k5 +; AVX512DQ-NEXT: kshiftrw $10, %k5, %k5 +; AVX512DQ-NEXT: korw %k5, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1145,12 +1161,16 @@ define <64 x i8> @test16(i64 %x) { ; X86-LABEL: test16: ; X86: ## %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 -; X86-NEXT: kshiftrq $5, %k0, %k1 -; X86-NEXT: kxnorw %k0, 
%k0, %k2 -; X86-NEXT: kxorq %k2, %k1, %k1 -; X86-NEXT: kshiftlq $63, %k1, %k1 -; X86-NEXT: kshiftrq $58, %k1, %k1 -; X86-NEXT: kxorq %k1, %k0, %k0 +; X86-NEXT: kshiftrq $6, %k0, %k1 +; X86-NEXT: kshiftlq $6, %k1, %k1 +; X86-NEXT: kshiftlq $59, %k0, %k0 +; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: movb $1, %al +; X86-NEXT: kmovd %eax, %k2 +; X86-NEXT: kshiftlq $63, %k2, %k2 +; X86-NEXT: kshiftrq $58, %k2, %k2 +; X86-NEXT: korq %k2, %k1, %k1 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1174,12 +1194,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; KNL-NEXT: kmovw %edi, %k3 ; KNL-NEXT: cmpl %edx, %esi ; KNL-NEXT: setg %al -; KNL-NEXT: kshiftrw $5, %k0, %k4 +; KNL-NEXT: kshiftrw $6, %k0, %k4 +; KNL-NEXT: kshiftlw $6, %k4, %k4 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k0, %k4 +; KNL-NEXT: kshiftlw $15, %k5, %k5 +; KNL-NEXT: kshiftrw $10, %k5, %k5 +; KNL-NEXT: korw %k5, %k4, %k4 +; KNL-NEXT: korw %k4, %k0, %k4 ; KNL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k3} {z} ; KNL-NEXT: vpmovdb %zmm0, %xmm0 ; KNL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z} @@ -1198,12 +1221,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; SKX-NEXT: kmovq %rdi, %k0 ; SKX-NEXT: cmpl %edx, %esi ; SKX-NEXT: setg %al -; SKX-NEXT: kmovd %eax, %k1 -; SKX-NEXT: kshiftrq $5, %k0, %k2 -; SKX-NEXT: kxorq %k1, %k2, %k1 -; SKX-NEXT: kshiftlq $63, %k1, %k1 -; SKX-NEXT: kshiftrq $58, %k1, %k1 -; SKX-NEXT: kxorq %k1, %k0, %k0 +; SKX-NEXT: kshiftrq $6, %k0, %k1 +; SKX-NEXT: kshiftlq $6, %k1, %k1 +; SKX-NEXT: kshiftlq $59, %k0, %k0 +; SKX-NEXT: kshiftrq $59, %k0, %k0 +; SKX-NEXT: kmovd %eax, %k2 +; SKX-NEXT: kshiftlq $63, %k2, %k2 +; SKX-NEXT: kshiftrq $58, %k2, %k2 +; SKX-NEXT: korq %k2, %k1, %k1 +; SKX-NEXT: korq %k1, %k0, %k0 ; SKX-NEXT: vpmovm2b %k0, %zmm0 ; SKX-NEXT: retq ; @@ -1212,12 +1238,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512BW-NEXT: kmovq %rdi, %k0 ; AVX512BW-NEXT: cmpl %edx, %esi ; AVX512BW-NEXT: setg %al -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: kshiftrq $5, %k0, %k2 -; AVX512BW-NEXT: kxorq %k1, %k2, %k1 -; AVX512BW-NEXT: kshiftlq $63, %k1, %k1 -; AVX512BW-NEXT: kshiftrq $58, %k1, %k1 -; AVX512BW-NEXT: kxorq %k1, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $6, %k0, %k1 +; AVX512BW-NEXT: kshiftlq $6, %k1, %k1 +; AVX512BW-NEXT: kshiftlq $59, %k0, %k0 +; AVX512BW-NEXT: kshiftrq $59, %k0, %k0 +; AVX512BW-NEXT: kmovd %eax, %k2 +; AVX512BW-NEXT: kshiftlq $63, %k2, %k2 +; AVX512BW-NEXT: kshiftrq $58, %k2, %k2 +; AVX512BW-NEXT: korq %k2, %k1, %k1 +; AVX512BW-NEXT: korq %k1, %k0, %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-NEXT: retq ; @@ -1225,29 +1254,32 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; AVX512DQ: ## %bb.0: ; AVX512DQ-NEXT: movq %rdi, %rax ; AVX512DQ-NEXT: movl %edi, %ecx -; AVX512DQ-NEXT: kmovw %edi, %k0 +; AVX512DQ-NEXT: kmovw %edi, %k1 ; AVX512DQ-NEXT: shrq $32, %rdi ; AVX512DQ-NEXT: shrq $48, %rax ; AVX512DQ-NEXT: shrl $16, %ecx -; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kmovw %ecx, %k0 ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kmovw %edi, %k3 ; AVX512DQ-NEXT: cmpl %edx, %esi ; AVX512DQ-NEXT: setg %al -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k4 +; AVX512DQ-NEXT: kshiftrw $6, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $6, %k4, %k4 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; 
AVX512DQ-NEXT: kshiftrw $11, %k1, %k1 ; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k5, %k5 +; AVX512DQ-NEXT: kshiftrw $10, %k5, %k5 +; AVX512DQ-NEXT: korw %k5, %k4, %k4 +; AVX512DQ-NEXT: korw %k4, %k1, %k1 ; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 ; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512DQ-NEXT: vpmovm2d %k2, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 ; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm1 +; AVX512DQ-NEXT: vpmovm2d %k1, %zmm1 ; AVX512DQ-NEXT: vpmovdb %zmm1, %xmm1 -; AVX512DQ-NEXT: vpmovm2d %k1, %zmm2 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 ; AVX512DQ-NEXT: vpmovdb %zmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -1259,12 +1291,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) { ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k0 ; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: setg %al -; X86-NEXT: kmovd %eax, %k1 -; X86-NEXT: kshiftrq $5, %k0, %k2 -; X86-NEXT: kxorq %k1, %k2, %k1 -; X86-NEXT: kshiftlq $63, %k1, %k1 -; X86-NEXT: kshiftrq $58, %k1, %k1 -; X86-NEXT: kxorq %k1, %k0, %k0 +; X86-NEXT: kshiftrq $6, %k0, %k1 +; X86-NEXT: kshiftlq $6, %k1, %k1 +; X86-NEXT: kshiftlq $59, %k0, %k0 +; X86-NEXT: kshiftrq $59, %k0, %k0 +; X86-NEXT: kmovd %eax, %k2 +; X86-NEXT: kshiftlq $63, %k2, %k2 +; X86-NEXT: kshiftrq $58, %k2, %k2 +; X86-NEXT: korq %k2, %k1, %k1 +; X86-NEXT: korq %k1, %k0, %k0 ; X86-NEXT: vpmovm2b %k0, %zmm0 ; X86-NEXT: retl %a = bitcast i64 %x to <64 x i1> @@ -1281,10 +1316,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; KNL-NEXT: kmovw %esi, %k1 ; KNL-NEXT: kshiftrw $8, %k1, %k2 ; KNL-NEXT: kshiftrw $9, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k0, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k3 ; KNL-NEXT: kshiftlw $6, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k0, %k0 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 ; KNL-NEXT: kshiftlw $9, %k0, %k0 ; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: kshiftlw $7, %k2, %k1 @@ -1301,10 +1338,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; SKX-NEXT: kmovd %esi, %k1 ; SKX-NEXT: kshiftrw $8, %k1, %k2 ; SKX-NEXT: kshiftrw $9, %k1, %k1 -; SKX-NEXT: kshiftrb $6, %k0, %k3 -; SKX-NEXT: kxorb %k1, %k3, %k1 +; SKX-NEXT: kshiftlb $2, %k0, %k0 +; SKX-NEXT: kshiftrb $2, %k0, %k0 +; SKX-NEXT: kshiftlb $7, %k0, %k3 ; SKX-NEXT: kshiftlb $6, %k1, %k1 -; SKX-NEXT: kxorb %k1, %k0, %k0 +; SKX-NEXT: korb %k1, %k3, %k1 +; SKX-NEXT: korb %k1, %k0, %k0 ; SKX-NEXT: kshiftlb $1, %k0, %k0 ; SKX-NEXT: kshiftrb $1, %k0, %k0 ; SKX-NEXT: kshiftlb $7, %k2, %k1 @@ -1318,10 +1357,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512BW-NEXT: kmovd %esi, %k1 ; AVX512BW-NEXT: kshiftrw $8, %k1, %k2 ; AVX512BW-NEXT: kshiftrw $9, %k1, %k1 -; AVX512BW-NEXT: kshiftrw $6, %k0, %k3 -; AVX512BW-NEXT: kxorw %k1, %k3, %k1 +; AVX512BW-NEXT: kshiftlw $10, %k0, %k0 +; AVX512BW-NEXT: kshiftrw $10, %k0, %k0 +; AVX512BW-NEXT: kshiftlw $7, %k0, %k3 ; AVX512BW-NEXT: kshiftlw $6, %k1, %k1 -; AVX512BW-NEXT: kxorw %k1, %k0, %k0 +; AVX512BW-NEXT: korw %k1, %k3, %k1 +; AVX512BW-NEXT: korw %k1, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $9, %k0, %k0 ; AVX512BW-NEXT: kshiftlw $7, %k2, %k1 @@ -1337,10 +1378,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; AVX512DQ-NEXT: 
kmovw %esi, %k1 ; AVX512DQ-NEXT: kshiftrw $8, %k1, %k2 ; AVX512DQ-NEXT: kshiftrw $9, %k1, %k1 -; AVX512DQ-NEXT: kshiftrb $6, %k0, %k3 -; AVX512DQ-NEXT: kxorb %k1, %k3, %k1 +; AVX512DQ-NEXT: kshiftlb $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrb $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlb $7, %k0, %k3 ; AVX512DQ-NEXT: kshiftlb $6, %k1, %k1 -; AVX512DQ-NEXT: kxorb %k1, %k0, %k0 +; AVX512DQ-NEXT: korb %k1, %k3, %k1 +; AVX512DQ-NEXT: korb %k1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrb $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftlb $7, %k2, %k1 @@ -1357,10 +1400,12 @@ define <8 x i1> @test18(i8 %a, i16 %y) { ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 ; X86-NEXT: kshiftrw $8, %k1, %k2 ; X86-NEXT: kshiftrw $9, %k1, %k1 -; X86-NEXT: kshiftrb $6, %k0, %k3 -; X86-NEXT: kxorb %k1, %k3, %k1 +; X86-NEXT: kshiftlb $7, %k0, %k3 +; X86-NEXT: kshiftlb $2, %k0, %k0 +; X86-NEXT: kshiftrb $2, %k0, %k0 ; X86-NEXT: kshiftlb $6, %k1, %k1 -; X86-NEXT: kxorb %k1, %k0, %k0 +; X86-NEXT: korb %k1, %k3, %k1 +; X86-NEXT: korb %k1, %k0, %k0 ; X86-NEXT: kshiftlb $1, %k0, %k0 ; X86-NEXT: kshiftrb $1, %k0, %k0 ; X86-NEXT: kshiftlb $7, %k2, %k1 @@ -2748,403 +2793,488 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; KNL-LABEL: store_64i1: ; KNL: ## %bb.0: -; KNL-NEXT: kmovw %ecx, %k0 -; KNL-NEXT: kmovw %esi, %k2 -; KNL-NEXT: kshiftlw $15, %k0, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k1 -; KNL-NEXT: kxorw %k1, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k3 -; KNL-NEXT: kxorw %k0, %k3, %k0 +; KNL-NEXT: kmovw %esi, %k0 ; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: kmovw %edx, %k1 +; KNL-NEXT: kshiftlw $1, %k1, %k1 +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k3 +; KNL-NEXT: kmovw %ecx, %k1 +; KNL-NEXT: kshiftlw $2, %k1, %k1 +; KNL-NEXT: korw %k1, %k3, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 ; KNL-NEXT: kshiftrw $13, %k0, %k0 -; KNL-NEXT: kxorw %k0, %k2, %k0 -; KNL-NEXT: kshiftrw $3, %k0, %k2 -; KNL-NEXT: kmovw %r8d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $4, %k0, %k2 -; KNL-NEXT: kmovw %r9d, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $5, %k0, %k2 +; KNL-NEXT: kshiftlw $4, %k0, %k4 +; KNL-NEXT: kmovw %r8d, %k1 +; KNL-NEXT: kshiftlw $3, %k1, %k1 +; KNL-NEXT: korw %k1, %k4, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k5 +; KNL-NEXT: kmovw %r9d, %k1 +; KNL-NEXT: kshiftlw $4, %k1, %k1 +; KNL-NEXT: korw %k1, %k5, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k6 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $6, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $5, %k1, %k1 +; KNL-NEXT: korw %k1, %k6, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 +; 
KNL-NEXT: kshiftlw $7, %k0, %k7 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $7, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $6, %k1, %k1 +; KNL-NEXT: korw %k1, %k7, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $8, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $7, %k1, %k1 +; KNL-NEXT: kshiftlw $8, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $9, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $8, %k1, %k1 +; KNL-NEXT: kshiftlw $9, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $10, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $9, %k1, %k1 +; KNL-NEXT: kshiftlw $10, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $11, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $10, %k1, %k1 +; KNL-NEXT: kshiftlw $11, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $12, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $11, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $13, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $12, %k1, %k1 +; KNL-NEXT: kshiftlw $13, %k0, %k2 +; KNL-NEXT: kmovw %k2, 
{{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $2, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 -; KNL-NEXT: kshiftrw $14, %k0, %k2 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $13, %k1, %k1 +; KNL-NEXT: kshiftlw $14, %k0, %k2 +; KNL-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k1, %k2, %k1 +; KNL-NEXT: korw %k1, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $14, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k0, %k0 +; KNL-NEXT: kmovw %eax, %k1 +; KNL-NEXT: kshiftlw $14, %k1, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k1 +; KNL-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k0, %k1, %k0 +; KNL-NEXT: korw %k0, %k2, %k0 ; KNL-NEXT: kshiftlw $1, %k0, %k0 ; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 ; KNL-NEXT: kshiftlw $15, %k2, %k2 ; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kxorw %k1, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k4 -; KNL-NEXT: kxorw %k2, %k4, %k2 -; KNL-NEXT: kshiftlw $15, %k2, %k2 -; KNL-NEXT: kshiftrw $13, %k2, %k2 -; KNL-NEXT: kxorw %k2, %k3, %k2 -; KNL-NEXT: kshiftrw $3, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $4, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $5, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; 
KNL-NEXT: kshiftrw $10, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $6, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $7, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $8, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $9, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $10, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $11, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $12, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: 
kshiftrw $13, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $2, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftrw $14, %k2, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $14, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k2, %k2 -; KNL-NEXT: kshiftlw $1, %k2, %k2 -; KNL-NEXT: kshiftrw $1, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kmovw %eax, %k0 +; KNL-NEXT: kshiftlw $15, %k0, %k0 +; KNL-NEXT: kshiftrw $15, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kxorw %k1, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k5 -; KNL-NEXT: kxorw %k3, %k5, %k3 -; KNL-NEXT: kshiftlw $15, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k3 -; KNL-NEXT: kxorw %k3, %k4, %k3 -; KNL-NEXT: kshiftrw $3, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $14, %k0, %k0 +; KNL-NEXT: kshiftrw $14, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $4, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $13, %k0, %k0 +; KNL-NEXT: kshiftrw $13, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $5, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $12, %k0, %k0 +; KNL-NEXT: kshiftrw $12, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $6, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: korw %k2, 
%k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $11, %k0, %k0 +; KNL-NEXT: kshiftrw $11, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $7, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $5, %k2, %k2 +; KNL-NEXT: korw %k2, %k6, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $10, %k0, %k0 +; KNL-NEXT: kshiftrw $10, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $8, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $9, %k0, %k0 +; KNL-NEXT: kshiftrw $9, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $9, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k1, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $8, %k0, %k0 +; KNL-NEXT: kshiftrw $8, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $10, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k3, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $7, %k0, %k0 +; KNL-NEXT: kshiftrw $7, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $11, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k4, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $6, %k0, %k0 +; KNL-NEXT: kshiftrw $6, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $12, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k5, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $5, %k0, %k0 +; KNL-NEXT: kshiftrw $5, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $13, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $4, %k0, %k0 +; KNL-NEXT: kshiftrw $4, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), 
%al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftrw $14, %k3, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $3, %k0, %k0 +; KNL-NEXT: kshiftrw $3, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k3, %k3 -; KNL-NEXT: kshiftlw $1, %k3, %k3 -; KNL-NEXT: kshiftrw $1, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $2, %k0, %k0 +; KNL-NEXT: kshiftrw $2, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k3, %k3 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k7, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kshiftlw $1, %k0, %k0 +; KNL-NEXT: kshiftrw $1, %k0, %k0 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k0, %k0 +; KNL-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k1, %k5, %k1 -; KNL-NEXT: kshiftrw $2, %k1, %k5 -; KNL-NEXT: kxorw %k4, %k5, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $13, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $3, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $1, %k2, %k2 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k2, %k0, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $12, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $4, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $15, %k7, %k7 +; KNL-NEXT: kshiftrw $15, %k7, %k7 +; KNL-NEXT: korw %k2, %k7, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $11, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $5, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $2, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $14, %k2, %k2 +; KNL-NEXT: kshiftrw $14, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $10, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $6, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $3, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $13, %k2, %k2 +; KNL-NEXT: kshiftrw $13, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw 
%k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $9, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $7, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $4, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $12, %k2, %k2 +; KNL-NEXT: kshiftrw $12, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $8, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $8, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $5, %k7, %k7 +; KNL-NEXT: korw %k7, %k6, %k7 +; KNL-NEXT: kshiftlw $11, %k2, %k2 +; KNL-NEXT: kshiftrw $11, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $7, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $9, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $6, %k7, %k7 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: korw %k7, %k0, %k7 +; KNL-NEXT: kshiftlw $10, %k2, %k2 +; KNL-NEXT: kshiftrw $10, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $6, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $10, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $7, %k7, %k7 +; KNL-NEXT: korw %k7, %k1, %k7 +; KNL-NEXT: kshiftlw $9, %k2, %k2 +; KNL-NEXT: kshiftrw $9, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $5, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $11, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $8, %k7, %k7 +; KNL-NEXT: korw %k7, %k3, %k7 +; KNL-NEXT: kshiftlw $8, %k2, %k2 +; KNL-NEXT: kshiftrw $8, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $4, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $12, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $9, %k7, %k7 +; KNL-NEXT: korw %k7, %k4, %k7 +; KNL-NEXT: kshiftlw $7, %k2, %k2 +; KNL-NEXT: kshiftrw $7, %k2, %k2 +; KNL-NEXT: korw %k7, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $3, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $13, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k7 +; KNL-NEXT: kshiftlw $10, %k7, %k7 +; KNL-NEXT: korw %k7, %k5, %k6 +; KNL-NEXT: kshiftlw $6, %k2, %k2 +; KNL-NEXT: kshiftrw $6, %k2, %k2 +; KNL-NEXT: korw %k6, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: kshiftrw $2, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 -; KNL-NEXT: kshiftrw $14, %k1, %k4 +; KNL-NEXT: kmovw %eax, %k6 +; KNL-NEXT: kshiftlw $11, %k6, %k6 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k6, %k1, %k5 +; KNL-NEXT: kshiftlw $5, 
%k2, %k2 +; KNL-NEXT: kshiftrw $5, %k2, %k2 +; KNL-NEXT: korw %k5, %k2, %k2 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al ; KNL-NEXT: kmovw %eax, %k5 -; KNL-NEXT: kxorw %k5, %k4, %k4 -; KNL-NEXT: kshiftlw $14, %k4, %k4 -; KNL-NEXT: kxorw %k4, %k1, %k1 +; KNL-NEXT: kshiftlw $12, %k5, %k5 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k5, %k1, %k4 +; KNL-NEXT: kshiftlw $4, %k2, %k2 +; KNL-NEXT: kshiftrw $4, %k2, %k2 +; KNL-NEXT: korw %k4, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k4 +; KNL-NEXT: kshiftlw $13, %k4, %k4 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k4, %k1, %k3 +; KNL-NEXT: kshiftlw $3, %k2, %k2 +; KNL-NEXT: kshiftrw $3, %k2, %k2 +; KNL-NEXT: korw %k3, %k2, %k2 +; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al +; KNL-NEXT: kmovw %eax, %k3 +; KNL-NEXT: kshiftlw $14, %k3, %k3 +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; KNL-NEXT: korw %k3, %k1, %k1 +; KNL-NEXT: kshiftlw $2, %k2, %k2 +; KNL-NEXT: kshiftrw $2, %k2, %k2 +; KNL-NEXT: korw %k1, %k2, %k1 ; KNL-NEXT: kshiftlw $1, %k1, %k1 ; KNL-NEXT: kshiftrw $1, %k1, %k1 ; KNL-NEXT: movb {{[0-9]+}}(%rsp), %al -; KNL-NEXT: kmovw %eax, %k4 -; KNL-NEXT: kshiftlw $15, %k4, %k4 -; KNL-NEXT: korw %k4, %k1, %k1 +; KNL-NEXT: kmovw %eax, %k2 +; KNL-NEXT: kshiftlw $15, %k2, %k2 +; KNL-NEXT: korw %k2, %k1, %k1 ; KNL-NEXT: kmovw %k1, 6(%rdi) -; KNL-NEXT: kmovw %k3, 4(%rdi) -; KNL-NEXT: kmovw %k2, 2(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: kmovw %k0, 4(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; KNL-NEXT: kmovw %k0, 2(%rdi) +; KNL-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; KNL-NEXT: kmovw %k0, (%rdi) ; KNL-NEXT: retq ; @@ -3166,403 +3296,488 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) { ; ; AVX512DQ-LABEL: store_64i1: ; AVX512DQ: ## %bb.0: -; AVX512DQ-NEXT: kmovw %ecx, %k0 -; AVX512DQ-NEXT: kmovw %esi, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k1 -; AVX512DQ-NEXT: kxorw %k1, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k3 -; AVX512DQ-NEXT: kxorw %k0, %k3, %k0 +; AVX512DQ-NEXT: kmovw %esi, %k0 ; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: kmovw %edx, %k1 +; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k3 +; AVX512DQ-NEXT: kmovw %ecx, %k1 +; AVX512DQ-NEXT: kshiftlw $2, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k3, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 -; AVX512DQ-NEXT: kxorw %k0, %k2, %k0 -; AVX512DQ-NEXT: kshiftrw $3, %k0, %k2 -; AVX512DQ-NEXT: kmovw %r8d, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $4, %k0, %k2 -; AVX512DQ-NEXT: kmovw %r9d, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $5, %k0, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k4 +; AVX512DQ-NEXT: kmovw %r8d, %k1 +; 
AVX512DQ-NEXT: kshiftlw $3, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k4, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k5 +; AVX512DQ-NEXT: kmovw %r9d, %k1 +; AVX512DQ-NEXT: kshiftlw $4, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k5, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k6 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $6, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $5, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k6, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k7 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $7, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $6, %k1, %k1 +; AVX512DQ-NEXT: korw %k1, %k7, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $7, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $8, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $9, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; 
AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $10, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $11, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 -; AVX512DQ-NEXT: kshiftrw $14, %k0, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $13, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k2 +; AVX512DQ-NEXT: kmovw %k2, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k1, %k2, %k1 +; AVX512DQ-NEXT: korw %k1, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %eax, %k1 +; AVX512DQ-NEXT: kshiftlw $14, %k1, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k1 +; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k0, %k1, %k0 +; AVX512DQ-NEXT: korw %k0, %k2, %k0 ; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 ; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 ; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 ; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw 
$14, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kxorw %k1, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k4 -; AVX512DQ-NEXT: kxorw %k2, %k4, %k2 -; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 -; AVX512DQ-NEXT: kxorw %k2, %k3, %k2 -; AVX512DQ-NEXT: kshiftrw $3, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k3, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $12, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $4, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $11, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $5, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $10, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $6, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k6, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $9, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $7, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $8, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $8, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; 
AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $7, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $9, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $6, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $10, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $11, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $4, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $12, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $3, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $13, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $2, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $14, %k2, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $14, %k3, %k3 -; AVX512DQ-NEXT: kxorw 
%k3, %k2, %k2 -; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 -; AVX512DQ-NEXT: kshiftrw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k0 +; AVX512DQ-NEXT: kshiftlw $15, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $15, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kxorw %k1, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k5 -; AVX512DQ-NEXT: kxorw %k3, %k5, %k3 -; AVX512DQ-NEXT: kshiftlw $15, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k3 -; AVX512DQ-NEXT: kxorw %k3, %k4, %k3 -; AVX512DQ-NEXT: kshiftrw $3, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $14, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $14, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $4, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $13, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $13, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $5, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $12, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $12, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $6, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $11, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $11, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $7, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k6, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $10, 
%k0, %k0 +; AVX512DQ-NEXT: kshiftrw $10, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $8, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $9, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $9, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $9, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k1, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $8, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $8, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $10, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k3 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k3, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $7, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $7, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $11, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k4, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $6, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $6, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $12, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k5, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $5, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $5, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $13, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $4, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $4, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; 
AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $14, %k3, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $3, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $3, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k3, %k3 -; AVX512DQ-NEXT: kshiftlw $1, %k3, %k3 -; AVX512DQ-NEXT: kshiftrw $1, %k3, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $2, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $2, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: korw %k4, %k3, %k3 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k7, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kshiftlw $1, %k0, %k0 +; AVX512DQ-NEXT: kshiftrw $1, %k0, %k0 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k0, %k0 +; AVX512DQ-NEXT: kmovw %k0, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k1, %k5, %k1 -; AVX512DQ-NEXT: kshiftrw $2, %k1, %k5 -; AVX512DQ-NEXT: kxorw %k4, %k5, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $13, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $3, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $1, %k2, %k2 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k2, %k0, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $12, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $4, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $15, %k7, %k7 +; AVX512DQ-NEXT: kshiftrw $15, %k7, %k7 +; AVX512DQ-NEXT: korw %k2, %k7, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $11, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $5, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $2, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $14, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $14, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $10, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw 
$6, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $3, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $13, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $13, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $9, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $7, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $4, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $12, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $12, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $8, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $8, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $5, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k6, %k7 +; AVX512DQ-NEXT: kshiftlw $11, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $11, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $7, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $9, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $6, %k7, %k7 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k7, %k0, %k7 +; AVX512DQ-NEXT: kshiftlw $10, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $10, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $6, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $10, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $7, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k1, %k7 +; AVX512DQ-NEXT: kshiftlw $9, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $9, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $5, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $11, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $8, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k3, %k7 +; AVX512DQ-NEXT: kshiftlw $8, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $8, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $4, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $12, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $9, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k4, %k7 +; AVX512DQ-NEXT: kshiftlw $7, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $7, %k2, %k2 +; AVX512DQ-NEXT: korw %k7, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw 
%eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $3, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $13, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k7 +; AVX512DQ-NEXT: kshiftlw $10, %k7, %k7 +; AVX512DQ-NEXT: korw %k7, %k5, %k6 +; AVX512DQ-NEXT: kshiftlw $6, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $6, %k2, %k2 +; AVX512DQ-NEXT: korw %k6, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: kshiftrw $2, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 -; AVX512DQ-NEXT: kshiftrw $14, %k1, %k4 +; AVX512DQ-NEXT: kmovw %eax, %k6 +; AVX512DQ-NEXT: kshiftlw $11, %k6, %k6 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k6, %k1, %k5 +; AVX512DQ-NEXT: kshiftlw $5, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $5, %k2, %k2 +; AVX512DQ-NEXT: korw %k5, %k2, %k2 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al ; AVX512DQ-NEXT: kmovw %eax, %k5 -; AVX512DQ-NEXT: kxorw %k5, %k4, %k4 -; AVX512DQ-NEXT: kshiftlw $14, %k4, %k4 -; AVX512DQ-NEXT: kxorw %k4, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $12, %k5, %k5 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k5, %k1, %k4 +; AVX512DQ-NEXT: kshiftlw $4, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $4, %k2, %k2 +; AVX512DQ-NEXT: korw %k4, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k4 +; AVX512DQ-NEXT: kshiftlw $13, %k4, %k4 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k4, %k1, %k3 +; AVX512DQ-NEXT: kshiftlw $3, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $3, %k2, %k2 +; AVX512DQ-NEXT: korw %k3, %k2, %k2 +; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al +; AVX512DQ-NEXT: kmovw %eax, %k3 +; AVX512DQ-NEXT: kshiftlw $14, %k3, %k3 +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 ## 2-byte Reload +; AVX512DQ-NEXT: korw %k3, %k1, %k1 +; AVX512DQ-NEXT: kshiftlw $2, %k2, %k2 +; AVX512DQ-NEXT: kshiftrw $2, %k2, %k2 +; AVX512DQ-NEXT: korw %k1, %k2, %k1 ; AVX512DQ-NEXT: kshiftlw $1, %k1, %k1 ; AVX512DQ-NEXT: kshiftrw $1, %k1, %k1 ; AVX512DQ-NEXT: movb {{[0-9]+}}(%rsp), %al -; AVX512DQ-NEXT: kmovw %eax, %k4 -; AVX512DQ-NEXT: kshiftlw $15, %k4, %k4 -; AVX512DQ-NEXT: korw %k4, %k1, %k1 +; AVX512DQ-NEXT: kmovw %eax, %k2 +; AVX512DQ-NEXT: kshiftlw $15, %k2, %k2 +; AVX512DQ-NEXT: korw %k2, %k1, %k1 ; AVX512DQ-NEXT: kmovw %k1, 6(%rdi) -; AVX512DQ-NEXT: kmovw %k3, 4(%rdi) -; AVX512DQ-NEXT: kmovw %k2, 2(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: kmovw %k0, 4(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload +; AVX512DQ-NEXT: kmovw %k0, 2(%rdi) +; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k0 ## 2-byte Reload ; AVX512DQ-NEXT: kmovw %k0, (%rdi) ; AVX512DQ-NEXT: retq ; diff --git a/test/CodeGen/X86/masked_store.ll b/test/CodeGen/X86/masked_store.ll index 8c7232cf950e..164826d87155 100644 --- a/test/CodeGen/X86/masked_store.ll +++ b/test/CodeGen/X86/masked_store.ll @@ -4913,24 +4913,30 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; AVX512F-LABEL: widen_masked_store: ; AVX512F: ## %bb.0: ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: kmovw %edx, %k0 ; AVX512F-NEXT: andl $1, %esi -; AVX512F-NEXT: kmovw %esi, %k1 -; AVX512F-NEXT: kxorw %k0, %k0, %k2 -; AVX512F-NEXT: kshiftrw $1, %k2, %k2 
-; AVX512F-NEXT: kshiftlw $1, %k2, %k2 -; AVX512F-NEXT: korw %k1, %k2, %k1 -; AVX512F-NEXT: kshiftrw $1, %k1, %k2 -; AVX512F-NEXT: kxorw %k0, %k2, %k0 +; AVX512F-NEXT: kmovw %esi, %k0 +; AVX512F-NEXT: kxorw %k0, %k0, %k1 +; AVX512F-NEXT: kshiftrw $1, %k1, %k1 +; AVX512F-NEXT: kshiftlw $1, %k1, %k1 +; AVX512F-NEXT: korw %k0, %k1, %k0 +; AVX512F-NEXT: kshiftrw $2, %k0, %k1 +; AVX512F-NEXT: kshiftlw $2, %k1, %k1 ; AVX512F-NEXT: kshiftlw $15, %k0, %k0 +; AVX512F-NEXT: kshiftrw $15, %k0, %k0 +; AVX512F-NEXT: kmovw %edx, %k2 +; AVX512F-NEXT: kshiftlw $15, %k2, %k2 +; AVX512F-NEXT: kshiftrw $14, %k2, %k2 +; AVX512F-NEXT: korw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kshiftrw $3, %k0, %k1 +; AVX512F-NEXT: kshiftlw $3, %k1, %k1 +; AVX512F-NEXT: kshiftlw $14, %k0, %k0 ; AVX512F-NEXT: kshiftrw $14, %k0, %k0 -; AVX512F-NEXT: kxorw %k0, %k1, %k0 -; AVX512F-NEXT: kshiftrw $2, %k0, %k1 -; AVX512F-NEXT: kmovw %ecx, %k2 -; AVX512F-NEXT: kxorw %k2, %k1, %k1 +; AVX512F-NEXT: korw %k1, %k0, %k0 +; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1 -; AVX512F-NEXT: kxorw %k1, %k0, %k0 +; AVX512F-NEXT: korw %k0, %k1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} @@ -4939,48 +4945,60 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) { ; ; AVX512VLDQ-LABEL: widen_masked_store: ; AVX512VLDQ: ## %bb.0: -; AVX512VLDQ-NEXT: kmovw %edx, %k0 -; AVX512VLDQ-NEXT: kmovw %esi, %k1 -; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1 -; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k2 -; AVX512VLDQ-NEXT: kshiftrb $1, %k2, %k2 -; AVX512VLDQ-NEXT: kshiftlb $1, %k2, %k2 -; AVX512VLDQ-NEXT: korb %k1, %k2, %k1 -; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k2 -; AVX512VLDQ-NEXT: kxorb %k0, %k2, %k0 +; AVX512VLDQ-NEXT: kmovw %esi, %k0 ; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 -; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 -; AVX512VLDQ-NEXT: kxorb %k0, %k1, %k0 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k1 +; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $1, %k1, %k1 +; AVX512VLDQ-NEXT: korb %k0, %k1, %k0 ; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1 -; AVX512VLDQ-NEXT: kmovw %ecx, %k2 -; AVX512VLDQ-NEXT: kxorb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $2, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %edx, %k2 +; AVX512VLDQ-NEXT: kshiftlb $7, %k2, %k2 +; AVX512VLDQ-NEXT: kshiftrb $6, %k2, %k2 +; AVX512VLDQ-NEXT: korb %k2, %k1, %k1 +; AVX512VLDQ-NEXT: korb %k1, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k1 +; AVX512VLDQ-NEXT: kshiftlb $3, %k1, %k1 +; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0 +; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0 +; AVX512VLDQ-NEXT: korw %k1, %k0, %k0 +; AVX512VLDQ-NEXT: kmovw %ecx, %k1 ; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1 ; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1 -; AVX512VLDQ-NEXT: kxorw %k1, %k0, %k1 +; AVX512VLDQ-NEXT: korw %k0, %k1, %k1 ; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLDQ-NEXT: retq ; ; AVX512VLBW-LABEL: widen_masked_store: ; AVX512VLBW: ## %bb.0: -; AVX512VLBW-NEXT: kmovd %edx, %k0 ; AVX512VLBW-NEXT: andl $1, %esi -; AVX512VLBW-NEXT: kmovw %esi, %k1 -; AVX512VLBW-NEXT: kxorw %k0, %k0, %k2 -; AVX512VLBW-NEXT: kshiftrw $1, %k2, %k2 -; AVX512VLBW-NEXT: kshiftlw $1, %k2, %k2 -; AVX512VLBW-NEXT: korw %k1, %k2, %k1 -; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k2 -; 
AVX512VLBW-NEXT: kxorw %k0, %k2, %k0 +; AVX512VLBW-NEXT: kmovw %esi, %k0 +; AVX512VLBW-NEXT: kxorw %k0, %k0, %k1 +; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $1, %k1, %k1 +; AVX512VLBW-NEXT: korw %k0, %k1, %k0 +; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 +; AVX512VLBW-NEXT: kshiftlw $2, %k1, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $15, %k0, %k0 +; AVX512VLBW-NEXT: kmovd %edx, %k2 +; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2 +; AVX512VLBW-NEXT: kshiftrw $14, %k2, %k2 +; AVX512VLBW-NEXT: korw %k2, %k1, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; AVX512VLBW-NEXT: kshiftrw $3, %k0, %k1 +; AVX512VLBW-NEXT: kshiftlw $3, %k1, %k1 +; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0 ; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k0 -; AVX512VLBW-NEXT: kxorw %k0, %k1, %k0 -; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1 -; AVX512VLBW-NEXT: kmovd %ecx, %k2 -; AVX512VLBW-NEXT: kxorw %k2, %k1, %k1 +; AVX512VLBW-NEXT: korw %k1, %k0, %k0 +; AVX512VLBW-NEXT: kmovd %ecx, %k1 ; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1 ; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1 -; AVX512VLBW-NEXT: kxorw %k1, %k0, %k1 +; AVX512VLBW-NEXT: korw %k0, %k1, %k1 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1} ; AVX512VLBW-NEXT: retq call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask) diff --git a/test/CodeGen/X86/vec_smulo.ll b/test/CodeGen/X86/vec_smulo.ll index b809e55ce5e2..436e48729f9f 100644 --- a/test/CodeGen/X86/vec_smulo.ll +++ b/test/CodeGen/X86/vec_smulo.ll @@ -1730,20 +1730,26 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: smulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm1, %rax -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx -; AVX512-NEXT: vpextrq $1, %xmm0, %rsi +; AVX512-NEXT: vpextrq $1, %xmm1, %rax +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vmovq %xmm1, %rdx +; AVX512-NEXT: vmovq %xmm0, %rsi ; AVX512-NEXT: imulq %rdx, %rsi -; AVX512-NEXT: vmovq %rsi, %xmm0 +; AVX512-NEXT: seto %dl ; AVX512-NEXT: imulq %rax, %rcx -; AVX512-NEXT: vmovq %rcx, %xmm1 +; AVX512-NEXT: vmovq %rcx, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %edx, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -2197,46 +2203,76 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: smulo_v4i1: ; AVX512: # %bb.0: -; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 -; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 +; AVX512-NEXT: pushq %rbx +; AVX512-NEXT: vpslld $31, %xmm1, %xmm1 +; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 ; AVX512-NEXT: kmovd %k1, %r9d ; AVX512-NEXT: andb $1, %r9b ; AVX512-NEXT: negb %r9b -; AVX512-NEXT: vpslld $31, %xmm1, %xmm0 +; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1 ; AVX512-NEXT: kshiftrw $3, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; 
AVX512-NEXT: negb %r10b ; AVX512-NEXT: kshiftrw $2, %k1, %k2 -; AVX512-NEXT: kmovd %k1, %ecx -; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: negb %cl -; AVX512-NEXT: kshiftrw $2, %k0, %k1 -; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: andb $1, %r11b +; AVX512-NEXT: negb %r11b +; AVX512-NEXT: kshiftrw $2, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: negb %bl +; AVX512-NEXT: kshiftrw $1, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil ; AVX512-NEXT: negb %sil -; AVX512-NEXT: kmovd %k1, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: negb %al +; AVX512-NEXT: kshiftrw $1, %k1, %k2 ; AVX512-NEXT: kmovd %k2, %edx ; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: negb %dl +; AVX512-NEXT: kmovd %k1, %eax +; AVX512-NEXT: andb $1, %al +; AVX512-NEXT: negb %al +; AVX512-NEXT: kmovd %k0, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: imulb %dl +; AVX512-NEXT: imulb %cl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al -; AVX512-NEXT: movl %r8d, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl -; AVX512-NEXT: cmpb %r8b, %dl -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %al, %dl +; AVX512-NEXT: movl %r8d, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: cmpb %r8b, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: imulb %cl +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k0 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: imulb %sil +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: seto %al +; AVX512-NEXT: movl %edx, %ecx +; AVX512-NEXT: andb $1, %cl +; AVX512-NEXT: negb %cl +; AVX512-NEXT: cmpb %dl, %cl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kshiftlw $1, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k1, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k0, %k2 +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: imulb %bl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: movl %esi, %ecx @@ -2246,26 +2282,22 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k2 -; AVX512-NEXT: kshiftrw $2, %k2, %k3 -; AVX512-NEXT: kxorw %k1, %k3, %k1 -; AVX512-NEXT: kshiftlw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kmovd %eax, %k3 +; AVX512-NEXT: kshiftlw $2, %k3, %k3 +; AVX512-NEXT: korw %k3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: kshiftlw $13, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: movl %r9d, %eax -; AVX512-NEXT: imulb %r10b +; AVX512-NEXT: movl %r10d, %eax +; AVX512-NEXT: imulb %r9b ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl -; AVX512-NEXT: movl %eax, %edx -; AVX512-NEXT: andb $1, %dl -; AVX512-NEXT: negb %dl -; AVX512-NEXT: cmpb %al, %dl -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %cl, %dl +; AVX512-NEXT: movl %eax, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: 
negb %bl +; AVX512-NEXT: cmpb %al, %bl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %cl, %bl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k2 ; AVX512-NEXT: kshiftlw $3, %k2, %k2 @@ -2273,21 +2305,34 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: kmovd %r8d, %k1 -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $14, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kshiftlw $4, %k1, %k1 +; AVX512-NEXT: kshiftlw $13, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $12, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 diff --git a/test/CodeGen/X86/vec_umulo.ll b/test/CodeGen/X86/vec_umulo.ll index 07899d0dddf6..c859ce7b74bb 100644 --- a/test/CodeGen/X86/vec_umulo.ll +++ b/test/CodeGen/X86/vec_umulo.ll @@ -1532,21 +1532,28 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun ; ; AVX512-LABEL: umulo_v2i64: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovq %xmm0, %rcx -; AVX512-NEXT: vmovq %xmm1, %rsi -; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vpextrq $1, %xmm1, %rdx +; AVX512-NEXT: vpextrq $1, %xmm0, %rcx +; AVX512-NEXT: vpextrq $1, %xmm1, %r8 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: vmovq %xmm1, %rdx ; AVX512-NEXT: mulq %rdx -; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: movq %rax, %rsi +; AVX512-NEXT: seto %r9b ; AVX512-NEXT: movq %rcx, %rax -; AVX512-NEXT: mulq %rsi -; AVX512-NEXT: vmovq %rax, %xmm1 +; AVX512-NEXT: mulq %r8 +; AVX512-NEXT: vmovq %rax, %xmm0 +; AVX512-NEXT: vmovq %rsi, %xmm1 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; AVX512-NEXT: seto %al ; AVX512-NEXT: kmovd %eax, %k0 -; AVX512-NEXT: kshiftlw $15, %k0, %k1 -; AVX512-NEXT: kshiftrw $14, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k1 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %r9d, %k1 +; AVX512-NEXT: kshiftlw $15, %k1, %k1 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k1 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: vmovdqa %xmm1, (%rdi) @@ -1945,6 +1952,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> 
%a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; ; AVX512-LABEL: umulo_v4i1: ; AVX512: # %bb.0: +; AVX512-NEXT: pushq %rbx ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 @@ -1956,40 +1964,60 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: kmovd %k2, %r10d ; AVX512-NEXT: andb $1, %r10b ; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kmovd %k0, %esi +; AVX512-NEXT: kmovd %k2, %r11d +; AVX512-NEXT: andb $1, %r11b +; AVX512-NEXT: kshiftrw $2, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %ebx +; AVX512-NEXT: andb $1, %bl +; AVX512-NEXT: kshiftrw $1, %k0, %k2 +; AVX512-NEXT: kmovd %k2, %edx +; AVX512-NEXT: andb $1, %dl +; AVX512-NEXT: kshiftrw $1, %k1, %k2 +; AVX512-NEXT: kmovd %k2, %esi ; AVX512-NEXT: andb $1, %sil -; AVX512-NEXT: kshiftrw $2, %k1, %k0 +; AVX512-NEXT: kmovd %k0, %eax +; AVX512-NEXT: andb $1, %al ; AVX512-NEXT: kmovd %k1, %ecx ; AVX512-NEXT: andb $1, %cl -; AVX512-NEXT: kmovd %k2, %eax -; AVX512-NEXT: andb $1, %al -; AVX512-NEXT: kmovd %k0, %edx -; AVX512-NEXT: andb $1, %dl ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: mulb %dl +; AVX512-NEXT: mulb %cl ; AVX512-NEXT: movl %eax, %r8d ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %r8b -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %al, %dl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k1 -; AVX512-NEXT: movl %esi, %eax -; AVX512-NEXT: mulb %cl +; AVX512-NEXT: kmovd %eax, %k0 +; AVX512-NEXT: kshiftlw $15, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k0, %k1 +; AVX512-NEXT: kshiftlw $2, %k0, %k0 +; AVX512-NEXT: movl %edx, %eax +; AVX512-NEXT: mulb %sil +; AVX512-NEXT: movl %eax, %edx +; AVX512-NEXT: seto %al +; AVX512-NEXT: testb $-2, %dl +; AVX512-NEXT: setne %cl +; AVX512-NEXT: orb %al, %cl +; AVX512-NEXT: setne %al +; AVX512-NEXT: kmovd %eax, %k2 +; AVX512-NEXT: kshiftlw $1, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k1, %k1 +; AVX512-NEXT: kshiftrw $14, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k0, %k2 +; AVX512-NEXT: movl %r11d, %eax +; AVX512-NEXT: mulb %bl ; AVX512-NEXT: movl %eax, %esi ; AVX512-NEXT: seto %al ; AVX512-NEXT: testb $-2, %sil ; AVX512-NEXT: setne %cl ; AVX512-NEXT: orb %al, %cl ; AVX512-NEXT: setne %al -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kshiftlw $15, %k0, %k0 -; AVX512-NEXT: kshiftrw $14, %k0, %k0 -; AVX512-NEXT: kxorw %k0, %k2, %k2 -; AVX512-NEXT: kshiftrw $2, %k2, %k3 -; AVX512-NEXT: kxorw %k1, %k3, %k1 -; AVX512-NEXT: kshiftlw $2, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k2, %k1 +; AVX512-NEXT: kmovd %eax, %k3 +; AVX512-NEXT: kshiftlw $2, %k3, %k3 +; AVX512-NEXT: korw %k3, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 ; AVX512-NEXT: kshiftlw $13, %k1, %k1 ; AVX512-NEXT: kshiftrw $13, %k1, %k1 ; AVX512-NEXT: movl %r9d, %eax @@ -1997,8 +2025,8 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: # kill: def $al killed $al def $eax ; AVX512-NEXT: seto %cl ; AVX512-NEXT: testb $-2, %al -; AVX512-NEXT: setne %dl -; AVX512-NEXT: orb %cl, %dl +; AVX512-NEXT: setne %bl +; AVX512-NEXT: orb %cl, %bl ; AVX512-NEXT: setne %cl ; AVX512-NEXT: kmovd %ecx, %k2 ; AVX512-NEXT: kshiftlw $3, %k2, %k2 @@ -2006,21 +2034,34 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z} ; AVX512-NEXT: kmovd 
%r8d, %k1 -; AVX512-NEXT: kmovd %esi, %k2 -; AVX512-NEXT: kxorw %k0, %k2, %k0 -; AVX512-NEXT: kshiftrw $2, %k0, %k2 -; AVX512-NEXT: kxorw %k1, %k2, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 -; AVX512-NEXT: kshiftrw $13, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $15, %k1, %k1 +; AVX512-NEXT: kmovd %edx, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $14, %k2, %k2 +; AVX512-NEXT: korw %k2, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kshiftrw $3, %k0, %k1 -; AVX512-NEXT: kmovd %eax, %k2 -; AVX512-NEXT: kxorw %k2, %k1, %k1 +; AVX512-NEXT: kshiftlw $3, %k1, %k1 +; AVX512-NEXT: kshiftlw $14, %k0, %k0 +; AVX512-NEXT: kshiftrw $14, %k0, %k0 +; AVX512-NEXT: kmovd %esi, %k2 +; AVX512-NEXT: kshiftlw $15, %k2, %k2 +; AVX512-NEXT: kshiftrw $13, %k2, %k2 +; AVX512-NEXT: korw %k2, %k1, %k1 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kshiftrw $4, %k0, %k1 +; AVX512-NEXT: kshiftlw $4, %k1, %k1 +; AVX512-NEXT: kshiftlw $13, %k0, %k0 +; AVX512-NEXT: kshiftrw $13, %k0, %k0 +; AVX512-NEXT: korw %k1, %k0, %k0 +; AVX512-NEXT: kmovd %eax, %k1 ; AVX512-NEXT: kshiftlw $15, %k1, %k1 ; AVX512-NEXT: kshiftrw $12, %k1, %k1 -; AVX512-NEXT: kxorw %k1, %k0, %k0 +; AVX512-NEXT: korw %k0, %k1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: movb %al, (%rdi) +; AVX512-NEXT: popq %rbx ; AVX512-NEXT: retq %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1) %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0 From 091f50348a699dbaac6bad4ed5aa9962bb910ff6 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Wed, 2 Oct 2019 18:20:24 +0000 Subject: [PATCH 06/82] [ARM] Make helpers static. NFC. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373503 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMISelLowering.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index b3138312150c..45bf67633822 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -13122,7 +13122,8 @@ static SDValue PerformLOADCombine(SDNode *N, // Optimize trunc store (of multiple scalars) to shuffle and store. First, // pack all of the elements in one place. Next, store to memory in fewer // chunks. -SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG) { +static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, + SelectionDAG &DAG) { SDValue StVal = St->getValue(); EVT VT = StVal.getValueType(); if (!St->isTruncatingStore() || !VT.isVector()) @@ -13206,7 +13207,8 @@ SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG) { // Try taking a single vector store from an truncate (which would otherwise turn // into an expensive buildvector) and splitting it into a series of narrowing // stores. -SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG) { +static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, + SelectionDAG &DAG) { if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed()) return SDValue(); SDValue Trunc = St->getValue(); @@ -13696,7 +13698,7 @@ static SDValue PerformShiftCombine(SDNode *N, // Look for a sign/zero extend of a larger than legal load. This can be split // into two extending loads, which are simpler to deal with than an arbitrary // sign extend. 
-SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
+static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
   if (N0.getOpcode() != ISD::LOAD)
     return SDValue();

From c1679d6c229dca9bf7b6a7e98256ee39967339df Mon Sep 17 00:00:00 2001
From: Alina Sbirlea
Date: Wed, 2 Oct 2019 18:42:33 +0000
Subject: [PATCH 07/82] [MemorySSA] Update Phi creation when inserting a Def.

MemoryPhis should be added in the IDF of the blocks newly gaining Defs.
This includes the blocks that gained a Phi and the block gaining a Def,
if the block did not have one before.
Resolves PR43427.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373505 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Analysis/MemorySSAUpdater.cpp  | 77 ++++++++++++++++--------------
 test/Analysis/MemorySSA/pr43427.ll | 42 ++++++++++++++++
 2 files changed, 82 insertions(+), 37 deletions(-)
 create mode 100644 test/Analysis/MemorySSA/pr43427.ll

diff --git a/lib/Analysis/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp
index 6018968c199d..d103c3a8b831 100644
--- a/lib/Analysis/MemorySSAUpdater.cpp
+++ b/lib/Analysis/MemorySSAUpdater.cpp
@@ -347,51 +347,54 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {

   // If this is the first def in the block and this insert is in an arbitrary
   // place, compute IDF and place phis.
+  SmallPtrSet<BasicBlock *, 2> DefiningBlocks;
+
+  // If this is the last Def in the block, also compute IDF based on MD, since
+  // this may be a new Def added, and we may need additional Phis.
   auto Iter = MD->getDefsIterator();
   ++Iter;
   auto IterEnd = MSSA->getBlockDefs(MD->getBlock())->end();
-  if (Iter == IterEnd) {
-    SmallPtrSet<BasicBlock *, 2> DefiningBlocks;
+  if (Iter == IterEnd)
     DefiningBlocks.insert(MD->getBlock());
-    for (const auto &VH : InsertedPHIs)
-      if (const auto *RealPHI = cast_or_null<MemoryPhi>(VH))
-        DefiningBlocks.insert(RealPHI->getBlock());
-    ForwardIDFCalculator IDFs(*MSSA->DT);
-    SmallVector<BasicBlock *, 32> IDFBlocks;
-    IDFs.setDefiningBlocks(DefiningBlocks);
-    IDFs.calculate(IDFBlocks);
-    SmallVector<AssertingVH<MemoryPhi>, 4> NewInsertedPHIs;
-    for (auto *BBIDF : IDFBlocks) {
-      auto *MPhi = MSSA->getMemoryAccess(BBIDF);
-      if (!MPhi) {
-        MPhi = MSSA->createMemoryPhi(BBIDF);
-        NewInsertedPHIs.push_back(MPhi);
-      }
-      // Add the phis created into the IDF blocks to NonOptPhis, so they are
-      // not optimized out as trivial by the call to getPreviousDefFromEnd
-      // below. Once they are complete, all these Phis are added to the
-      // FixupList, and removed from NonOptPhis inside fixupDefs(). Existing
-      // Phis in IDF may need fixing as well, and potentially be trivial
-      // before this insertion, hence add all IDF Phis. See PR43044.
-      NonOptPhis.insert(MPhi);
+
+  for (const auto &VH : InsertedPHIs)
+    if (const auto *RealPHI = cast_or_null<MemoryPhi>(VH))
+      DefiningBlocks.insert(RealPHI->getBlock());
+  ForwardIDFCalculator IDFs(*MSSA->DT);
+  SmallVector<BasicBlock *, 32> IDFBlocks;
+  IDFs.setDefiningBlocks(DefiningBlocks);
+  IDFs.calculate(IDFBlocks);
+  SmallVector<AssertingVH<MemoryPhi>, 4> NewInsertedPHIs;
+  for (auto *BBIDF : IDFBlocks) {
+    auto *MPhi = MSSA->getMemoryAccess(BBIDF);
+    if (!MPhi) {
+      MPhi = MSSA->createMemoryPhi(BBIDF);
+      NewInsertedPHIs.push_back(MPhi);
    }
-    for (auto &MPhi : NewInsertedPHIs) {
-      auto *BBIDF = MPhi->getBlock();
-      for (auto *Pred : predecessors(BBIDF)) {
-        DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> CachedPreviousDef;
-        MPhi->addIncoming(getPreviousDefFromEnd(Pred, CachedPreviousDef),
-                          Pred);
-      }
+    // Add the phis created into the IDF blocks to NonOptPhis, so they are not
+    // optimized out as trivial by the call to getPreviousDefFromEnd below.
+    // Once they are complete, all these Phis are added to the FixupList, and
+    // removed from NonOptPhis inside fixupDefs(). Existing Phis in IDF may
+    // need fixing as well, and potentially be trivial before this insertion,
+    // hence add all IDF Phis. See PR43044.
+    NonOptPhis.insert(MPhi);
+  }
+  for (auto &MPhi : NewInsertedPHIs) {
+    auto *BBIDF = MPhi->getBlock();
+    for (auto *Pred : predecessors(BBIDF)) {
+      DenseMap<BasicBlock *, TrackingVH<MemoryAccess>> CachedPreviousDef;
+      MPhi->addIncoming(getPreviousDefFromEnd(Pred, CachedPreviousDef), Pred);
     }
+  }

-    // Re-take the index where we're adding the new phis, because the above
-    // call to getPreviousDefFromEnd, may have inserted into InsertedPHIs.
-    NewPhiIndex = InsertedPHIs.size();
-    for (auto &MPhi : NewInsertedPHIs) {
-      InsertedPHIs.push_back(&*MPhi);
-      FixupList.push_back(&*MPhi);
-    }
+  // Re-take the index where we're adding the new phis, because the above call
+  // to getPreviousDefFromEnd, may have inserted into InsertedPHIs.
+  NewPhiIndex = InsertedPHIs.size();
+  for (auto &MPhi : NewInsertedPHIs) {
+    InsertedPHIs.push_back(&*MPhi);
+    FixupList.push_back(&*MPhi);
   }
+
   FixupList.push_back(MD);
 }

diff --git a/test/Analysis/MemorySSA/pr43427.ll b/test/Analysis/MemorySSA/pr43427.ll
new file mode 100644
index 000000000000..f70887822171
--- /dev/null
+++ b/test/Analysis/MemorySSA/pr43427.ll
@@ -0,0 +1,42 @@
+; RUN: opt -disable-output -licm -print-memoryssa -enable-mssa-loop-dependency=true < %s 2>&1 | FileCheck %s
+
+; CHECK-LABEL: @f()
+; CHECK: 8 = MemoryPhi(
+; CHECK: 7 = MemoryPhi(
+; CHECK: 9 = MemoryPhi(
+define void @f() {
+entry:
+  %e = alloca i16, align 1
+  br label %lbl1
+
+lbl1:                                             ; preds = %if.else, %cleanup, %entry
+  store i16 undef, i16* %e, align 1
+  call void @g()
+  br i1 undef, label %for.end, label %if.else
+
+for.end:                                          ; preds = %lbl1
+  br i1 undef, label %lbl3, label %lbl2
+
+lbl2:                                             ; preds = %lbl3, %for.end
+  br label %lbl3
+
+lbl3:                                             ; preds = %lbl2, %for.end
+  br i1 undef, label %lbl2, label %cleanup
+
+cleanup:                                          ; preds = %lbl3
+  %cleanup.dest = load i32, i32* undef, align 1
+  %switch = icmp ult i32 %cleanup.dest, 1
+  br i1 %switch, label %cleanup.cont, label %lbl1
+
+cleanup.cont:                                     ; preds = %cleanup
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* null)
+  ret void
+
+if.else:                                          ; preds = %lbl1
+  br label %lbl1
+}
+
+declare void @g()
+
+; Function Attrs: argmemonly nounwind willreturn
+declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture)

From 2351e6ea75c36004c8c0822881bfc683333b1a63 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 2 Oct 2019 19:38:24 +0000
Subject: [PATCH 08/82] [Local] Handle terminators with users in
 removeUnreachableBlocks.

Terminators like invoke can have users outside the current basic block.
We have to replace those users with undef before replacing the
terminator. This fixes a crash exposed by rL373430.
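
For illustration, the problematic shape looks like this condensed IR
sketch (a shortened form of the invoke_terminator regression test added
below):

  invoke.block:
    %cond = invoke zeroext i1 @invokable()
            to label %continue.block unwind label %lpad.block

  continue.block:
    ; %cond is used outside the block that defines it, so erasing the
    ; invoke without rewriting this use would leave a dangling reference.
    br i1 %cond, label %if.then, label %if.end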
Reviewers: brzycki, asbirlea, davide, spatel Reviewed By: asbirlea Differential Revision: https://reviews.llvm.org/D68327 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373513 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Utils/Local.cpp | 10 +++++--- unittests/Transforms/Utils/LocalTest.cpp | 32 ++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 3a6b7a6a6555..94339c2ba00f 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -2246,9 +2246,13 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU, } BB->dropAllReferences(); if (DTU) { - // Remove the terminator of BB to clear the successor list of BB. - if (BB->getTerminator()) - BB->getInstList().pop_back(); + Instruction *TI = BB->getTerminator(); + assert(TI && "Basic block should have a terminator"); + // Terminators like invoke can have users. We have to replace their users, + // before removing them. + if (!TI->use_empty()) + TI->replaceAllUsesWith(UndefValue::get(TI->getType())); + TI->eraseFromParent(); new UnreachableInst(BB->getContext(), BB); assert(succ_empty(BB) && "The successor list of BB isn't empty before " "applying corresponding DTU updates."); diff --git a/unittests/Transforms/Utils/LocalTest.cpp b/unittests/Transforms/Utils/LocalTest.cpp index ff2eda1cec03..1f67a1ec84c7 100644 --- a/unittests/Transforms/Utils/LocalTest.cpp +++ b/unittests/Transforms/Utils/LocalTest.cpp @@ -867,6 +867,36 @@ TEST(Local, RemoveUnreachableBlocks) { bb2: br label %bb1 } + + declare i32 @__gxx_personality_v0(...) + + define void @invoke_terminator() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { + entry: + br i1 undef, label %invoke.block, label %exit + + invoke.block: + %cond = invoke zeroext i1 @invokable() + to label %continue.block unwind label %lpad.block + + continue.block: + br i1 %cond, label %if.then, label %if.end + + if.then: + unreachable + + if.end: + unreachable + + lpad.block: + %lp = landingpad { i8*, i32 } + catch i8* null + br label %exit + + exit: + ret void + } + + declare i1 @invokable() )"); auto runEager = [&](Function &F, DominatorTree *DT) { @@ -890,12 +920,14 @@ TEST(Local, RemoveUnreachableBlocks) { runWithDomTree(*M, "br_self_loop", runEager); runWithDomTree(*M, "br_constant", runEager); runWithDomTree(*M, "br_loop", runEager); + runWithDomTree(*M, "invoke_terminator", runEager); // Test removeUnreachableBlocks under Lazy UpdateStrategy. 
   runWithDomTree(*M, "br_simple", runLazy);
   runWithDomTree(*M, "br_self_loop", runLazy);
   runWithDomTree(*M, "br_constant", runLazy);
   runWithDomTree(*M, "br_loop", runLazy);
+  runWithDomTree(*M, "invoke_terminator", runLazy);

   M = parseIR(C, R"(

From 45ff6456d26c47fb8e0099dfee223783722bad46 Mon Sep 17 00:00:00 2001
From: Evandro Menezes
Date: Wed, 2 Oct 2019 19:44:53 +0000
Subject: [PATCH 09/82] [TableGen] Improve error reporting of overlapping
 definitions (NFC)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373514 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/TableGen/CodeGenSchedule.cpp | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index cb05f78fba41..f12d7d484a8e 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -1083,9 +1083,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) {
       if (RWD->getValueAsDef("SchedModel") == RWModelDef &&
           RWModelDef->getValueAsBit("FullInstRWOverlapCheck")) {
         for (Record *Inst : InstDefs) {
-          PrintFatalError(InstRWDef->getLoc(), "Overlapping InstRW def " +
-                          Inst->getName() + " also matches " +
-                          RWD->getValue("Instrs")->getValue()->getAsString());
+          PrintFatalError
+            (InstRWDef->getLoc(),
+             "Overlapping InstRW definition for \"" +
+             Inst->getName() +
+             "\" also matches previous \"" +
+             RWD->getValue("Instrs")->getValue()->getAsString() +
+             "\".");
         }
       }
     }
@@ -1115,9 +1119,13 @@ void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) {
   for (Record *OldRWDef : SchedClasses[OldSCIdx].InstRWs) {
     if (OldRWDef->getValueAsDef("SchedModel") == RWModelDef) {
       for (Record *InstDef : InstDefs) {
-        PrintFatalError(OldRWDef->getLoc(), "Overlapping InstRW def " +
-                        InstDef->getName() + " also matches " +
-                        OldRWDef->getValue("Instrs")->getValue()->getAsString());
+        PrintFatalError
+          (InstRWDef->getLoc(),
+           "Overlapping InstRW definition for \"" +
+           InstDef->getName() +
+           "\" also matches previous \"" +
+           OldRWDef->getValue("Instrs")->getValue()->getAsString() +
+           "\".");
       }
     }
     assert(OldRWDef != InstRWDef &&

From f35391129a1684ddbb8916a7a75f7ff546fbd6bc Mon Sep 17 00:00:00 2001
From: Evgeniy Stepanov
Date: Wed, 2 Oct 2019 19:53:19 +0000
Subject: [PATCH 10/82] Handle llvm.launder.invariant.group in msan.

Summary:
[MSan] handle llvm.launder.invariant.group

MSan used to give false positives in code like:

  class Foo {
   public:
    virtual ~Foo() {};
  };

  // Return true iff *x is set.
  bool f1(void **x, bool flag);

  Foo* f() {
    void *p;
    bool found;
    found = f1(&p, flag);
    if (found) {
      // p is always set here.
      return static_cast<Foo*>(p); // False positive here.
    }
    return nullptr;
  }

Patch by Ilya Tokar.
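
At the IR level this corresponds to laundering the loaded pointer. A
simplified sketch (value names shortened from the new test added below):

  %p = load i8*, i8** %slot, align 8
  %laundered = call i8* @llvm.launder.invariant.group.p0i8(i8* %p)
  %foo = bitcast i8* %laundered to %class.Foo*

With this change MSan copies the shadow and origin of the intrinsic's
operand to its result, so the laundered pointer is considered
initialized whenever the original pointer was.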
Reviewers: #sanitizers, eugenis

Reviewed By: #sanitizers, eugenis

Subscribers: eugenis, Prazek, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68236

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373515 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../Instrumentation/MemorySanitizer.cpp       |  9 +++++
 .../msan_llvm_launder_invariant.ll            | 38 +++++++++++++++++++
 .../msan_llvm_strip_invariant.ll              | 21 ++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll
 create mode 100644 test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll

diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 81bd2f3c18ac..f9354069da32 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2562,6 +2562,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     return false;
   }

+  void handleInvariantGroup(IntrinsicInst &I) {
+    setShadow(&I, getShadow(&I, 0));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
   void handleLifetimeStart(IntrinsicInst &I) {
     if (!PoisonStack)
       return;
@@ -2993,6 +2998,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     case Intrinsic::lifetime_start:
       handleLifetimeStart(I);
       break;
+    case Intrinsic::launder_invariant_group:
+    case Intrinsic::strip_invariant_group:
+      handleInvariantGroup(I);
+      break;
     case Intrinsic::bswap:
       handleBswap(I);
       break;
diff --git a/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll b/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll
new file mode 100644
index 000000000000..63de8663e077
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/msan_llvm_launder_invariant.ll
@@ -0,0 +1,38 @@
+; Make sure MSan handles llvm.launder.invariant.group correctly.
+ +; RUN: opt < %s -msan -msan-kernel=1 -O1 -S | FileCheck -check-prefixes=CHECK %s +; RUN: opt < %s -msan -O1 -S | FileCheck -check-prefixes=CHECK %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%class.Foo = type { i32 (...)** } +@flag = dso_local local_unnamed_addr global i8 0, align 1 + +define dso_local %class.Foo* @_Z1fv() local_unnamed_addr #0 { +entry: + %p = alloca i8*, align 8 + %0 = bitcast i8** %p to i8* + call void @llvm.lifetime.start.p0i8(i64 8, i8* nonnull %0) + %1 = load i8, i8* @flag, align 1 + %tobool = icmp ne i8 %1, 0 + %call = call zeroext i1 @_Z2f1PPvb(i8** nonnull %p, i1 zeroext %tobool) + %2 = load i8*, i8** %p, align 8 + %3 = call i8* @llvm.launder.invariant.group.p0i8(i8* %2) + %4 = bitcast i8* %3 to %class.Foo* + %retval.0 = select i1 %call, %class.Foo* %4, %class.Foo* null + call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull %0) + ret %class.Foo* %retval.0 +} + +; CHECK-NOT: call void @__msan_warning_noreturn + +declare dso_local zeroext i1 @_Z2f1PPvb(i8**, i1 zeroext) local_unnamed_addr + +declare i8* @llvm.launder.invariant.group.p0i8(i8*) + +declare void @llvm.lifetime.start.p0i8(i64 immarg, i8* nocapture) + +declare void @llvm.lifetime.end.p0i8(i64 immarg, i8* nocapture) + +attributes #0 = { sanitize_memory uwtable } diff --git a/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll b/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll new file mode 100644 index 000000000000..f3b5c0d722c8 --- /dev/null +++ b/test/Instrumentation/MemorySanitizer/msan_llvm_strip_invariant.ll @@ -0,0 +1,21 @@ +; Make sure MSan handles llvm.launder.invariant.group correctly. + +; RUN: opt < %s -msan -msan-kernel=1 -O1 -S | FileCheck -check-prefixes=CHECK %s +; RUN: opt < %s -msan -O1 -S | FileCheck -check-prefixes=CHECK %s + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@flag = dso_local local_unnamed_addr global i8 0, align 1 + +define dso_local i8* @f(i8* %x) local_unnamed_addr #0 { +entry: + %0 = call i8* @llvm.strip.invariant.group.p0i8(i8* %x) + ret i8* %0 +} + +; CHECK-NOT: call void @__msan_warning_noreturn + +declare i8* @llvm.strip.invariant.group.p0i8(i8*) + +attributes #0 = { sanitize_memory uwtable } From c9f1b7f4c3cbfd7628a2069f57a2e9a9888e79ec Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Wed, 2 Oct 2019 19:56:04 +0000 Subject: [PATCH 11/82] Fix inconsistent indentation in TableGen.cpp The anonymous namespace starts out (incorrectly) indented but isn't indented from the TimeRegionsOpt declaration onwards. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373516 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/TableGen/TableGen.cpp | 132 +++++++++++++++++-------------------
 1 file changed, 64 insertions(+), 68 deletions(-)

diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index 29ef46fd7fcc..d8dc27e7f725 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -62,75 +62,71 @@ bool TimeRegions = false;
 } // end namespace llvm

 namespace {
-  cl::opt<ActionType>
-  Action(cl::desc("Action to perform:"),
-         cl::values(clEnumValN(PrintRecords, "print-records",
-                               "Print all records to stdout (default)"),
-                    clEnumValN(DumpJSON, "dump-json",
-                               "Dump all records as machine-readable JSON"),
-                    clEnumValN(GenEmitter, "gen-emitter",
-                               "Generate machine code emitter"),
-                    clEnumValN(GenRegisterInfo, "gen-register-info",
-                               "Generate registers and register classes info"),
-                    clEnumValN(GenInstrInfo, "gen-instr-info",
-                               "Generate instruction descriptions"),
-                    clEnumValN(GenInstrDocs, "gen-instr-docs",
-                               "Generate instruction documentation"),
-                    clEnumValN(GenCallingConv, "gen-callingconv",
-                               "Generate calling convention descriptions"),
-                    clEnumValN(GenAsmWriter, "gen-asm-writer",
-                               "Generate assembly writer"),
-                    clEnumValN(GenDisassembler, "gen-disassembler",
-                               "Generate disassembler"),
-                    clEnumValN(GenPseudoLowering, "gen-pseudo-lowering",
-                               "Generate pseudo instruction lowering"),
-                    clEnumValN(GenCompressInst, "gen-compress-inst-emitter",
-                               "Generate RISCV compressed instructions."),
-                    clEnumValN(GenAsmMatcher, "gen-asm-matcher",
-                               "Generate assembly instruction matcher"),
-                    clEnumValN(GenDAGISel, "gen-dag-isel",
-                               "Generate a DAG instruction selector"),
-                    clEnumValN(GenDFAPacketizer, "gen-dfa-packetizer",
-                               "Generate DFA Packetizer for VLIW targets"),
-                    clEnumValN(GenFastISel, "gen-fast-isel",
-                               "Generate a \"fast\" instruction selector"),
-                    clEnumValN(GenSubtarget, "gen-subtarget",
-                               "Generate subtarget enumerations"),
-                    clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums",
-                               "Generate intrinsic enums"),
-                    clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl",
-                               "Generate intrinsic information"),
-                    clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums",
-                               "Generate target intrinsic enums"),
-                    clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl",
-                               "Generate target intrinsic information"),
-                    clEnumValN(PrintEnums, "print-enums",
-                               "Print enum values for a class"),
-                    clEnumValN(PrintSets, "print-sets",
-                               "Print expanded sets for testing DAG exprs"),
-                    clEnumValN(GenOptParserDefs, "gen-opt-parser-defs",
-                               "Generate option definitions"),
-                    clEnumValN(GenCTags, "gen-ctags",
-                               "Generate ctags-compatible index"),
-                    clEnumValN(GenAttributes, "gen-attrs",
-                               "Generate attributes"),
-                    clEnumValN(GenSearchableTables, "gen-searchable-tables",
-                               "Generate generic binary-searchable table"),
-                    clEnumValN(GenGlobalISel, "gen-global-isel",
-                               "Generate GlobalISel selector"),
-                    clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables",
-                               "Generate X86 EVEX to VEX compress tables"),
-                    clEnumValN(GenX86FoldTables, "gen-x86-fold-tables",
-                               "Generate X86 fold tables"),
-                    clEnumValN(GenRegisterBank, "gen-register-bank",
-                               "Generate registers bank descriptions"),
-                    clEnumValN(GenExegesis, "gen-exegesis",
-                               "Generate llvm-exegesis tables")));
+cl::opt<ActionType> Action(
+    cl::desc("Action to perform:"),
+    cl::values(
+        clEnumValN(PrintRecords, "print-records",
+                   "Print all records to stdout (default)"),
+        clEnumValN(DumpJSON, "dump-json",
+                   "Dump all records as machine-readable JSON"),
+        clEnumValN(GenEmitter, "gen-emitter", "Generate machine code emitter"),
+        clEnumValN(GenRegisterInfo, "gen-register-info",
+                   "Generate registers and register classes info"),
+        clEnumValN(GenInstrInfo, "gen-instr-info",
+                   "Generate instruction descriptions"),
+        clEnumValN(GenInstrDocs, "gen-instr-docs",
+                   "Generate instruction documentation"),
+        clEnumValN(GenCallingConv, "gen-callingconv",
+                   "Generate calling convention descriptions"),
+        clEnumValN(GenAsmWriter, "gen-asm-writer", "Generate assembly writer"),
+        clEnumValN(GenDisassembler, "gen-disassembler",
+                   "Generate disassembler"),
+        clEnumValN(GenPseudoLowering, "gen-pseudo-lowering",
+                   "Generate pseudo instruction lowering"),
+        clEnumValN(GenCompressInst, "gen-compress-inst-emitter",
+                   "Generate RISCV compressed instructions."),
+        clEnumValN(GenAsmMatcher, "gen-asm-matcher",
+                   "Generate assembly instruction matcher"),
+        clEnumValN(GenDAGISel, "gen-dag-isel",
+                   "Generate a DAG instruction selector"),
+        clEnumValN(GenDFAPacketizer, "gen-dfa-packetizer",
+                   "Generate DFA Packetizer for VLIW targets"),
+        clEnumValN(GenFastISel, "gen-fast-isel",
+                   "Generate a \"fast\" instruction selector"),
+        clEnumValN(GenSubtarget, "gen-subtarget",
+                   "Generate subtarget enumerations"),
+        clEnumValN(GenIntrinsicEnums, "gen-intrinsic-enums",
+                   "Generate intrinsic enums"),
+        clEnumValN(GenIntrinsicImpl, "gen-intrinsic-impl",
+                   "Generate intrinsic information"),
+        clEnumValN(GenTgtIntrinsicEnums, "gen-tgt-intrinsic-enums",
+                   "Generate target intrinsic enums"),
+        clEnumValN(GenTgtIntrinsicImpl, "gen-tgt-intrinsic-impl",
+                   "Generate target intrinsic information"),
+        clEnumValN(PrintEnums, "print-enums", "Print enum values for a class"),
+        clEnumValN(PrintSets, "print-sets",
+                   "Print expanded sets for testing DAG exprs"),
+        clEnumValN(GenOptParserDefs, "gen-opt-parser-defs",
+                   "Generate option definitions"),
+        clEnumValN(GenCTags, "gen-ctags", "Generate ctags-compatible index"),
+        clEnumValN(GenAttributes, "gen-attrs", "Generate attributes"),
+        clEnumValN(GenSearchableTables, "gen-searchable-tables",
+                   "Generate generic binary-searchable table"),
+        clEnumValN(GenGlobalISel, "gen-global-isel",
+                   "Generate GlobalISel selector"),
+        clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables",
+                   "Generate X86 EVEX to VEX compress tables"),
+        clEnumValN(GenX86FoldTables, "gen-x86-fold-tables",
+                   "Generate X86 fold tables"),
+        clEnumValN(GenRegisterBank, "gen-register-bank",
+                   "Generate registers bank descriptions"),
+        clEnumValN(GenExegesis, "gen-exegesis",
+                   "Generate llvm-exegesis tables")));

-  cl::OptionCategory PrintEnumsCat("Options for -print-enums");
-  cl::opt<std::string>
-  Class("class", cl::desc("Print Enum list for this class"),
-        cl::value_desc("class name"), cl::cat(PrintEnumsCat));
+cl::OptionCategory PrintEnumsCat("Options for -print-enums");
+cl::opt<std::string> Class("class", cl::desc("Print Enum list for this class"),
+                           cl::value_desc("class name"),
+                           cl::cat(PrintEnumsCat));

 cl::opt<bool> TimeRegionsOpt("time-regions",

From 92aeda089b49b1e15955b4eca715b67b28dd43bd Mon Sep 17 00:00:00 2001
From: Yi-Hong Lyu
Date: Wed, 2 Oct 2019 20:25:16 +0000
Subject: [PATCH 12/82] [PowerPC] Fix SH field overflow issue

Store rlwinm Rx, Ry, 32, 0, 31 as rlwinm Rx, Ry, 0, 0, 31 and store
rldicl Rx, Ry, 64, 0 as rldicl Rx, Ry, 0, 0. Otherwise the SH field
overflows and an assertion fails during assembly printing.
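
To spell out the boundary case: a 32-bit right shift by N is normally
encoded as a rotate by 32 - N, so N == 0 would produce SH = 32, which
does not fit the 5-bit SH field of rlwinm (likewise SH = 64 for the
6-bit field of rldicl). The guard added by this patch (shown in full in
the diff below) special-cases N == 0:

  // Right shifts use (32 - N, N, 31) if 0 < N < 32; use (0, 0, 31) if N == 0.
  uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt;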
Differential Revision: https://reviews.llvm.org/D66991 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373519 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/PowerPC/PPCInstrInfo.cpp | 12 ++++-- test/CodeGen/PowerPC/sh-overflow.mir | 58 ++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+), 4 deletions(-) create mode 100644 test/CodeGen/PowerPC/sh-overflow.mir diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 2b413d0b97ab..dc013c8ff9a0 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -3571,16 +3571,20 @@ bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI, } else { // The 32 bit and 64 bit instructions are quite different. if (SpecialShift32) { - // Left shifts use (N, 0, 31-N), right shifts use (32-N, N, 31). - uint64_t SH = RightShift ? 32 - ShAmt : ShAmt; + // Left shifts use (N, 0, 31-N). + // Right shifts use (32-N, N, 31) if 0 < N < 32. + // use (0, 0, 31) if N == 0. + uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt; uint64_t MB = RightShift ? ShAmt : 0; uint64_t ME = RightShift ? 31 : 31 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(MB) .addImm(ME); } else { - // Left shifts use (N, 63-N), right shifts use (64-N, N). - uint64_t SH = RightShift ? 64 - ShAmt : ShAmt; + // Left shifts use (N, 63-N). + // Right shifts use (64-N, N) if 0 < N < 64. + // use (0, 0) if N == 0. + uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 64 - ShAmt : ShAmt; uint64_t ME = RightShift ? ShAmt : 63 - ShAmt; replaceInstrOperandWithImm(MI, III.OpNoForForwarding, SH); MachineInstrBuilder(*MI.getParent()->getParent(), MI).addImm(ME); diff --git a/test/CodeGen/PowerPC/sh-overflow.mir b/test/CodeGen/PowerPC/sh-overflow.mir new file mode 100644 index 000000000000..31cd710c39ea --- /dev/null +++ b/test/CodeGen/PowerPC/sh-overflow.mir @@ -0,0 +1,58 @@ +# RUN: llc -O3 -mtriple=powerpc64le-unknown-linux-gnu -start-after ppc-mi-peepholes -ppc-late-peephole -ppc-asm-full-reg-names -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: special_right_shift32_0 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: gprc } + - { id: 1, class: gprc } + - { id: 2, class: gprc } +liveins: + - { reg: '$r3', virtual-reg: '%0' } +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $r3 + + ; Ensure we do not attempt to transform this into srwi $r3, $r3, 0 in the + ; form specified by ISA 3.0b (rlwinm $r3, $r3, 32 - 0, 0, 31) + + ; CHECK-LABEL: special_right_shift32_0: + ; CHECK: slwi r[[#]], r[[#]], 0 + + %0:gprc = COPY killed $r3 + %1:gprc = LI 0 + %2:gprc = SRW killed %0, killed %1 + $r3 = COPY killed %2 + BLR implicit $lr, implicit $rm, implicit killed $r3 + +... +--- +name: special_right_shift64_0 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: g8rc } + - { id: 1, class: gprc } + - { id: 2, class: g8rc } +liveins: + - { reg: '$x3', virtual-reg: '%0' } +machineFunctionInfo: {} +body: | + bb.0.entry: + liveins: $x3 + + ; Ensure we do not attempt to transform this into srdi $r3, $r3, 0 in the + ; form specified by ISA 3.0b (rldicl $r3, $r3, 64 - 0, 0) + + ; CHECK-LABEL: special_right_shift64_0: + ; CHECK: rotldi r[[#]], r[[#]], 0 + + %0:g8rc = COPY killed $x3 + %1:gprc = LI 0 + %2:g8rc = SRD killed %0, killed %1 + $x3 = COPY killed %2 + BLR8 implicit $lr8, implicit $rm, implicit killed $x3 + +... 
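As an aside, the guarded SH/MB/ME computation from the patch above can be
restated as a standalone sketch. This is a minimal illustration of the
32-bit (rlwinm) case only; the function and struct names are hypothetical,
not LLVM API:

#include <cassert>
#include <cstdint>

// Hypothetical container for the three rlwinm operands.
struct RotateMask32 { uint64_t SH, MB, ME; };

RotateMask32 encodeShift32(uint64_t ShAmt, bool RightShift) {
  assert(ShAmt < 32 && "shift amount must already be in range");
  // Left shifts use (N, 0, 31-N); right shifts use (32-N, N, 31) for
  // 0 < N < 32; N == 0 must map to (0, 0, 31), i.e. a rotate by zero,
  // because the 5-bit SH field can only encode 0..31.
  uint64_t SH = ShAmt == 0 ? 0 : RightShift ? 32 - ShAmt : ShAmt;
  uint64_t MB = RightShift ? ShAmt : 0;
  uint64_t ME = RightShift ? 31 : 31 - ShAmt;
  return {SH, MB, ME};
}

With the pre-fix formula, a right shift by zero produced SH = 32 - 0 = 32,
which does not fit the 5-bit SH field; the two MIR tests above pin down
exactly that case for SRW and SRD.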
From d6cf1c6f817bbafd27befcc821b0df58f4cedf08 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 2 Oct 2019 20:40:13 +0000 Subject: [PATCH 13/82] Recommit "[GlobalOpt] Pass DTU to removeUnreachableBlocks instead of recomputing." The cause for the revert should be fixed by r373513 / a80b6c15425f82521c624ff24c5c0a34cd534d54 This reverts commit 47dbcbd8ec6bf6c0b9cbe5811e81a37cc55e73ef. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373522 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/IPO/GlobalOpt.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 9c7fd5e1a813..feac1b608848 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -2285,14 +2285,10 @@ OptimizeFunctions(Module &M, // So, remove unreachable blocks from the function, because a) there's // no point in analyzing them and b) GlobalOpt should otherwise grow // some more complicated logic to break these cycles. - // Removing unreachable blocks might invalidate the dominator so we - // recalculate it. if (!F->isDeclaration()) { - if (removeUnreachableBlocks(*F)) { - auto &DT = LookupDomTree(*F); - DT.recalculate(*F); - Changed = true; - } + auto &DT = LookupDomTree(*F); + DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); + Changed |= removeUnreachableBlocks(*F, &DTU); } Changed |= processGlobal(*F, GetTLI, LookupDomTree); From 03bb65ceece39a9c44d33577ca5cc66b91c973d9 Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Wed, 2 Oct 2019 21:13:07 +0000 Subject: [PATCH 14/82] [gicombiner] Add the boring boilerplate for the declarative combiner Summary: This is the first of a series of patches extracted from a much bigger WIP patch. It merely establishes the tblgen pass and the way empty combiner helpers are declared and integrated into a combiner info. The tablegen pass takes a -combiners option to select the combiner helper that will be generated. This can be given multiple values to generate multiple combiner helpers at once. Doing so helps to minimize parsing overhead. The reason for creating a GlobalISel subdirectory in utils/TableGen is that there will be quite a lot of non-pass files (~15) by the time the patch series is done. 
Reviewers: volkan

Subscribers: mgorny, hiraditya, simoncook, Petar.Avramovic, s.egerton, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68286

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373527 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/GlobalISel/Combine.td     | 17 ++++
 lib/Target/AArch64/AArch64.td                 |  1 +
 lib/Target/AArch64/AArch64Combine.td          | 15 +++
 .../AArch64/AArch64PreLegalizerCombiner.cpp   | 17 ++++
 lib/Target/AArch64/CMakeLists.txt             |  2 +
 utils/TableGen/CMakeLists.txt                 |  1 +
 utils/TableGen/GICombinerEmitter.cpp          | 94 +++++++++++++++++++
 utils/TableGen/TableGen.cpp                   |  6 ++
 utils/TableGen/TableGenBackends.h             |  1 +
 9 files changed, 154 insertions(+)
 create mode 100644 include/llvm/Target/GlobalISel/Combine.td
 create mode 100644 lib/Target/AArch64/AArch64Combine.td
 create mode 100644 utils/TableGen/GICombinerEmitter.cpp

diff --git a/include/llvm/Target/GlobalISel/Combine.td b/include/llvm/Target/GlobalISel/Combine.td
new file mode 100644
index 000000000000..065e28eca8a6
--- /dev/null
+++ b/include/llvm/Target/GlobalISel/Combine.td
@@ -0,0 +1,17 @@
+//===- Combine.td - Combine rule definitions ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Declare GlobalISel combine rules and provide mechanisms to opt-out.
+//
+//===----------------------------------------------------------------------===//
+
+// Declares a combiner helper class
+class GICombinerHelper<string classname> {
+  // The class name to use in the generated output.
+  string Classname = classname;
+}
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index 6689ee48200e..51bf35d4a161 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -406,6 +406,7 @@ include "AArch64Schedule.td"
 include "AArch64InstrInfo.td"
 include "AArch64SchedPredicates.td"
 include "AArch64SchedPredExynos.td"
+include "AArch64Combine.td"
 
 def AArch64InstrInfo : InstrInfo;
 
diff --git a/lib/Target/AArch64/AArch64Combine.td b/lib/Target/AArch64/AArch64Combine.td
new file mode 100644
index 000000000000..c4658f73b8dd
--- /dev/null
+++ b/lib/Target/AArch64/AArch64Combine.td
@@ -0,0 +1,15 @@
+//=- AArch64.td - Define AArch64 Combine Rules ---------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +include "llvm/Target/GlobalISel/Combine.td" + +def AArch64PreLegalizerCombinerHelper: GICombinerHelper< + "AArch64GenPreLegalizerCombinerHelper">; diff --git a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index 6df3f944f8c3..bea75b83517b 100644 --- a/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -27,12 +27,22 @@ using namespace llvm; using namespace MIPatternMatch; +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS + namespace { +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H + class AArch64PreLegalizerCombinerInfo : public CombinerInfo { GISelKnownBits *KB; MachineDominatorTree *MDT; public: + AArch64GenPreLegalizerCombinerHelper Generated; + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize, GISelKnownBits *KB, MachineDominatorTree *MDT) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, @@ -81,9 +91,16 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, } } + if (Generated.tryCombineAll(Observer, MI, B)) + return true; + return false; } +#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP +#include "AArch64GenGICombiner.inc" +#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP + // Pass boilerplate // ================ diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 0da057ea9973..8473ddfca4cb 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -8,6 +8,8 @@ tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel) tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel) tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel) +tablegen(LLVM AArch64GenGICombiner.inc -gen-global-isel-combiner + -combiners='AArch64PreLegalizerCombinerHelper') tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info) tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter) tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering) diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt index c88365a2b8ce..d97f9359f54d 100644 --- a/utils/TableGen/CMakeLists.txt +++ b/utils/TableGen/CMakeLists.txt @@ -24,6 +24,7 @@ add_tablegen(llvm-tblgen LLVM ExegesisEmitter.cpp FastISelEmitter.cpp FixedLenDecoderEmitter.cpp + GICombinerEmitter.cpp GlobalISelEmitter.cpp InfoByHwMode.cpp InstrInfoEmitter.cpp diff --git a/utils/TableGen/GICombinerEmitter.cpp b/utils/TableGen/GICombinerEmitter.cpp new file mode 100644 index 000000000000..7a9c87b6b936 --- /dev/null +++ b/utils/TableGen/GICombinerEmitter.cpp @@ -0,0 +1,94 @@ +//===- GlobalCombinerEmitter.cpp - Generate a combiner --------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Generate a combiner implementation for GlobalISel from a declarative
+/// syntax
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/TableGenBackend.h"
+#include "CodeGenTarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gicombiner-emitter"
+
+cl::OptionCategory
+    GICombinerEmitterCat("Options for -gen-global-isel-combiner");
+static cl::list<std::string>
+    SelectedCombiners("combiners", cl::desc("Emit the specified combiners"),
+                      cl::cat(GICombinerEmitterCat), cl::CommaSeparated);
+namespace {
+class GICombinerEmitter {
+  StringRef Name;
+  Record *Combiner;
+public:
+  explicit GICombinerEmitter(RecordKeeper &RK, StringRef Name);
+  ~GICombinerEmitter() {}
+
+  StringRef getClassName() const {
+    return Combiner->getValueAsString("Classname");
+  }
+  void run(raw_ostream &OS);
+
+};
+
+GICombinerEmitter::GICombinerEmitter(RecordKeeper &RK, StringRef Name)
+    : Name(Name), Combiner(RK.getDef(Name)) {}
+
+void GICombinerEmitter::run(raw_ostream &OS) {
+  NamedRegionTimer T("Emit", "Time spent emitting the combiner",
+                     "Code Generation", "Time spent generating code",
+                     TimeRegions);
+  OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_DEPS\n"
+     << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_DEPS\n\n";
+
+  OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_H\n"
+     << "class " << getClassName() << " {\n"
+     << "public:\n"
+     << "  bool tryCombineAll(\n"
+     << "    GISelChangeObserver &Observer,\n"
+     << "    MachineInstr &MI,\n"
+     << "    MachineIRBuilder &B) const;\n"
+     << "};\n";
+  OS << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_H\n\n";
+
+  OS << "#ifdef " << Name.upper() << "_GENCOMBINERHELPER_CPP\n"
+     << "\n"
+     << "bool " << getClassName() << "::tryCombineAll(\n"
+     << "    GISelChangeObserver &Observer,\n"
+     << "    MachineInstr &MI,\n"
+     << "    MachineIRBuilder &B) const {\n"
+     << "  MachineBasicBlock *MBB = MI.getParent();\n"
+     << "  MachineFunction *MF = MBB->getParent();\n"
+     << "  MachineRegisterInfo &MRI = MF->getRegInfo();\n"
+     << "  (void)MBB; (void)MF; (void)MRI;\n\n";
+  OS << "\n  return false;\n"
+     << "}\n"
+     << "#endif // ifdef " << Name.upper() << "_GENCOMBINERHELPER_CPP\n";
+}
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS) {
+  CodeGenTarget Target(RK);
+  emitSourceFileHeader("Global Combiner", OS);
+
+  if (SelectedCombiners.empty())
+    PrintFatalError("No combiners selected with -combiners");
+  for (const auto &Combiner : SelectedCombiners)
+    GICombinerEmitter(RK, Combiner).run(OS);
+}
+
+} // namespace llvm
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index d8dc27e7f725..817663b06e29 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -49,6 +49,7 @@ enum ActionType {
   GenAttributes,
   GenSearchableTables,
   GenGlobalISel,
+  GenGICombiner,
   GenX86EVEX2VEXTables,
   GenX86FoldTables,
   GenRegisterBank,
@@ -114,6 +115,8 @@ cl::opt<ActionType> Action(
                    "Generate generic binary-searchable table"),
         clEnumValN(GenGlobalISel, "gen-global-isel",
                    "Generate GlobalISel selector"),
+        clEnumValN(GenGICombiner, "gen-global-isel-combiner",
+                   "Generate GlobalISel combiner"),
clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables", "Generate X86 EVEX to VEX compress tables"), clEnumValN(GenX86FoldTables, "gen-x86-fold-tables", @@ -231,6 +234,9 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenGlobalISel: EmitGlobalISel(Records, OS); break; + case GenGICombiner: + EmitGICombiner(Records, OS); + break; case GenRegisterBank: EmitRegisterBank(Records, OS); break; diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h index 135ec65c0f95..9c21ef354407 100644 --- a/utils/TableGen/TableGenBackends.h +++ b/utils/TableGen/TableGenBackends.h @@ -85,6 +85,7 @@ void EmitCTags(RecordKeeper &RK, raw_ostream &OS); void EmitAttributes(RecordKeeper &RK, raw_ostream &OS); void EmitSearchableTables(RecordKeeper &RK, raw_ostream &OS); void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS); +void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS); void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS); void EmitX86FoldTables(RecordKeeper &RK, raw_ostream &OS); void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS); From 376a50ec1cfa1d2275405a29c3a4d6a9d9f23c28 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Wed, 2 Oct 2019 22:27:24 +0000 Subject: [PATCH 15/82] DebugInfo: Simplify RangeSpan to be a plain struct This is an effort to make RangeSpan and DebugLocStream::Entry more similar to share code for their emission (to reuse the more complicated code for using (& choosing when to use) base address selection entries, etc). It didn't seem like this struct was worth the complexity of encapsulation - when the members could be initialized by the ctor to any value (no validation) and the type is assignable (so there's no mutability or other constraint being implemented by its interface). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373533 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 12 ++++++------ lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 12 ++++++------ lib/CodeGen/AsmPrinter/DwarfFile.h | 12 +++--------- 3 files changed, 15 insertions(+), 21 deletions(-) diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index c1c5c4f010c7..69c4d3fb5b44 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -326,13 +326,13 @@ void DwarfCompileUnit::addRange(RangeSpan Range) { // emitted into and the subprogram was contained within. If these are the // same then extend our current range, otherwise add this as a new range. 
if (CURanges.empty() || !SameAsPrevCU ||
-      (&CURanges.back().getEnd()->getSection() !=
-       &Range.getEnd()->getSection())) {
+      (&CURanges.back().End->getSection() !=
+       &Range.End->getSection())) {
     CURanges.push_back(Range);
     return;
   }
 
-  CURanges.back().setEnd(Range.getEnd());
+  CURanges.back().End = Range.End;
 }
 
 void DwarfCompileUnit::initStmtList() {
@@ -506,7 +506,7 @@ void DwarfCompileUnit::attachRangesOrLowHighPC(
   if (Ranges.size() == 1 || !DD->useRangesSection()) {
     const RangeSpan &Front = Ranges.front();
     const RangeSpan &Back = Ranges.back();
-    attachLowHighPC(Die, Front.getStart(), Back.getEnd());
+    attachLowHighPC(Die, Front.Begin, Back.End);
   } else
     addScopeRangeList(Die, std::move(Ranges));
 }
@@ -516,8 +516,8 @@ void DwarfCompileUnit::attachRangesOrLowHighPC(
   SmallVector<RangeSpan, 2> List;
   List.reserve(Ranges.size());
   for (const InsnRange &R : Ranges)
-    List.push_back(RangeSpan(DD->getLabelBeforeInsn(R.first),
-                             DD->getLabelAfterInsn(R.second)));
+    List.push_back(
+        {DD->getLabelBeforeInsn(R.first), DD->getLabelAfterInsn(R.second)});
 
   attachRangesOrLowHighPC(Die, std::move(List));
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 8237404bf8e9..4501f46dceb7 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -1098,7 +1098,7 @@ void DwarfDebug::finalizeModuleInfo() {
       // 2.17.3).
       U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
     else
-      U.setBaseAddress(TheCU.getRanges().front().getStart());
+      U.setBaseAddress(TheCU.getRanges().front().Begin);
     U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
   }
 
@@ -1807,7 +1807,7 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
   collectEntityInfo(TheCU, SP, Processed);
 
   // Add the range of this function to the list of ranges for the CU.
-  TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd()));
+  TheCU.addRange({Asm->getFunctionBegin(), Asm->getFunctionEnd()});
 
   // Under -gmlt, skip building the subprogram if there are no inlined
   // subroutines inside it.
But with -fdebug-info-for-profiling, the subprogram @@ -2570,7 +2570,7 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, auto Size = Asm->MAI->getCodePointerSize(); for (const RangeSpan &Range : List.getRanges()) - SectionRanges[&Range.getStart()->getSection()].push_back(&Range); + SectionRanges[&Range.Begin->getSection()].push_back(&Range); const DwarfCompileUnit &CU = List.getCU(); const MCSymbol *CUBase = CU.getBaseAddress(); @@ -2586,7 +2586,7 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, if (!Base && (P.second.size() > 1 || DwarfVersion < 5) && (CU.getCUNode()->getRangesBaseAddress() || DwarfVersion >= 5)) { BaseIsSet = true; - Base = DD.getSectionLabel(&P.second.front()->getStart()->getSection()); + Base = DD.getSectionLabel(&P.second.front()->Begin->getSection()); if (DwarfVersion >= 5) { Asm->OutStreamer->AddComment("DW_RLE_base_addressx"); Asm->OutStreamer->EmitIntValue(dwarf::DW_RLE_base_addressx, 1); @@ -2605,8 +2605,8 @@ static void emitRangeList(DwarfDebug &DD, AsmPrinter *Asm, } for (const auto *RS : P.second) { - const MCSymbol *Begin = RS->getStart(); - const MCSymbol *End = RS->getEnd(); + const MCSymbol *Begin = RS->Begin; + const MCSymbol *End = RS->End; assert(Begin && "Range without a begin symbol?"); assert(End && "Range without an end symbol?"); if (Base) { diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h index 244678ce9dc1..25ed8da970a4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -32,15 +32,9 @@ class LexicalScope; class MCSection; // Data structure to hold a range for range lists. -class RangeSpan { -public: - RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {} - const MCSymbol *getStart() const { return Start; } - const MCSymbol *getEnd() const { return End; } - void setEnd(const MCSymbol *E) { End = E; } - -private: - const MCSymbol *Start, *End; +struct RangeSpan { + const MCSymbol *Begin; + const MCSymbol *End; }; class RangeSpanList { From 70897f70e0cf9837c5fe2a200a2f184639a72471 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 2 Oct 2019 22:33:07 +0000 Subject: [PATCH 16/82] gn build: (manually) merge r373527 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373534 91177308-0d34-0410-b5e6-96231b3b80d8 --- utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn | 10 ++++++++++ utils/gn/secondary/llvm/utils/TableGen/BUILD.gn | 1 + 2 files changed, 11 insertions(+) diff --git a/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn index b05e0891d124..0c27c11e2e0b 100644 --- a/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn +++ b/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn @@ -30,6 +30,15 @@ tablegen("AArch64GenGlobalISel") { td_file = "AArch64.td" } +tablegen("AArch64GenGICombiner") { + visibility = [ ":LLVMAArch64CodeGen" ] + args = [ + "-gen-global-isel-combiner", + "-combiners=AArch64PreLegalizerCombinerHelper", + ] + td_file = "AArch64.td" +} + tablegen("AArch64GenMCPseudoLowering") { visibility = [ ":LLVMAArch64CodeGen" ] args = [ "-gen-pseudo-lowering" ] @@ -48,6 +57,7 @@ static_library("LLVMAArch64CodeGen") { ":AArch64GenCallingConv", ":AArch64GenDAGISel", ":AArch64GenFastISel", + ":AArch64GenGICombiner", ":AArch64GenGlobalISel", ":AArch64GenMCPseudoLowering", ":AArch64GenRegisterBank", diff --git a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index 01219543d2db..9f5043faeed8 100644 --- 
a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
+++ b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
@@ -30,6 +30,7 @@ executable("llvm-tblgen") {
     "ExegesisEmitter.cpp",
    "FastISelEmitter.cpp",
     "FixedLenDecoderEmitter.cpp",
+    "GICombinerEmitter.cpp",
     "GlobalISelEmitter.cpp",
     "InfoByHwMode.cpp",
     "InstrDocsEmitter.cpp",

From c73599fde257ea8edd289a45919b290b0a396614 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 2 Oct 2019 22:34:49 +0000
Subject: [PATCH 17/82] [LegalizeTypes] Check for already split condition
 before calling SplitVecRes_SETCC in SplitRes_SELECT.

No point in manually splitting the SETCC if it was already done.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373535 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index eadc388fc9d5..560b5729e3de 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -514,15 +514,15 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
   if (Cond.getValueType().isVector()) {
     if (SDValue Res = WidenVSELECTAndMask(N))
       std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl);
-    // It seems to improve code to generate two narrow SETCCs as opposed to
-    // splitting a wide result vector.
-    else if (Cond.getOpcode() == ISD::SETCC)
-      SplitVecRes_SETCC(Cond.getNode(), CL, CH);
     // Check if there are already splitted versions of the vector available and
     // use those instead of splitting the mask operand again.
     else if (getTypeAction(Cond.getValueType()) ==
              TargetLowering::TypeSplitVector)
       GetSplitVector(Cond, CL, CH);
+    // It seems to improve code to generate two narrow SETCCs as opposed to
+    // splitting a wide result vector.
+    else if (Cond.getOpcode() == ISD::SETCC)
+      SplitVecRes_SETCC(Cond.getNode(), CL, CH);
     else
       std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
   }

From a4a6122eaeb9ce339649ce984a5579d55f2c8b03 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Wed, 2 Oct 2019 22:35:03 +0000
Subject: [PATCH 18/82] [X86] Add test cases for suboptimal vselect+setcc
 splitting.

If the vselect result type needs to be split, it will also try to split the
condition if it happens to be a setcc.

With avx512, where k-registers are legal, it's probably better to just use a
kshift to split the mask register.
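To make the ordering change in the LegalizeTypes patch above concrete, here
is a toy restatement of its decision chain; the struct, enum, and function
names are illustrative stand-ins, not the actual SelectionDAG interfaces:

// Illustrative model only: the real code manipulates SDValues, not flags.
struct CondInfo {
  bool CanWidenVselectAndMask;  // WidenVSELECTAndMask succeeds
  bool AlreadySplitByLegalizer; // halves of the condition already exist
  bool IsSetCC;                 // condition is an ISD::SETCC node
};

enum class SplitPath { WidenMask, ReuseExistingSplit, SplitSetCC, Generic };

// After r373535, an already-split condition is reused before the SETCC
// special case runs, so SplitVecRes_SETCC no longer redoes a split that
// the type legalizer has already performed.
SplitPath chooseConditionSplit(const CondInfo &C) {
  if (C.CanWidenVselectAndMask)
    return SplitPath::WidenMask;
  if (C.AlreadySplitByLegalizer)
    return SplitPath::ReuseExistingSplit;
  if (C.IsSetCC)
    return SplitPath::SplitSetCC;
  return SplitPath::Generic;
}

The X86 tests below record the current, still suboptimal output on that
SETCC path, so that a later change preferring a single wide compare plus a
kshift of the k-register can be verified against them.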
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373536 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/X86/avx512-vselect.ll | 171 +++++++++++++++++++++ test/CodeGen/X86/min-legal-vector-width.ll | 92 +++++++++++ 2 files changed, 263 insertions(+) diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll index d61e4e13df9c..7ee4e6674e0e 100644 --- a/test/CodeGen/X86/avx512-vselect.ll +++ b/test/CodeGen/X86/avx512-vselect.ll @@ -47,3 +47,174 @@ entry: %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b ret <16 x double> %ret } + +define <16 x i64> @test3(<16 x i8> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-SKX-LABEL: test3: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,3,0,1] +; CHECK-SKX-NEXT: vptestnmb %xmm5, %xmm5, %k1 +; CHECK-SKX-NEXT: vptestnmb %xmm0, %xmm0, %k2 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test3: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqb %xmm5, %xmm0, %xmm0 +; CHECK-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <16 x i8> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <16 x i64> @test4(<16 x i16> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-SKX-LABEL: test4: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm5 +; CHECK-SKX-NEXT: vptestnmw %xmm5, %xmm5, %k1 +; CHECK-SKX-NEXT: vptestnmw %xmm0, %xmm0, %k2 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test4: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqw %ymm5, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k1} +; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k1 +; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <16 x i16> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <16 x i64> @test5(<16 x i32> %x, <16 x i64> %a, <16 x i64> %b) { +; CHECK-SKX-LABEL: test5: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-SKX-NEXT: vptestnmd %ymm5, %ymm5, %k1 +; CHECK-SKX-NEXT: vptestnmd %ymm0, %ymm0, %k2 +; CHECK-SKX-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test5: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-KNL-NEXT: vptestnmd %zmm5, %zmm5, %k1 +; CHECK-KNL-NEXT: vptestnmd %zmm0, %zmm0, %k2 +; CHECK-KNL-NEXT: vpblendmq %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-KNL-NEXT: vpblendmq %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <16 x i32> %x, zeroinitializer + %ret = select <16 x i1> %c, <16 x i64> %a, <16 x i64> %b + ret <16 x i64> %ret +} + +define <32 x i32> @test6(<32 x i8> %x, <32 x i32> %a, <32 x i32> %b) { +; CHECK-SKX-LABEL: test6: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti128 $1, %ymm0, %xmm5 +; CHECK-SKX-NEXT: vptestnmb %xmm5, 
%xmm5, %k1 +; CHECK-SKX-NEXT: vptestnmb %xmm0, %xmm0, %k2 +; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test6: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; CHECK-KNL-NEXT: vpcmpeqb %ymm5, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm0, %xmm5 +; CHECK-KNL-NEXT: vpmovsxbd %xmm5, %zmm5 +; CHECK-KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 +; CHECK-KNL-NEXT: vpmovsxbd %xmm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 +; CHECK-KNL-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-KNL-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <32 x i8> %x, zeroinitializer + %ret = select <32 x i1> %c, <32 x i32> %a, <32 x i32> %b + ret <32 x i32> %ret +} + +define <32 x i32> @test7(<32 x i16> %x, <32 x i32> %a, <32 x i32> %b) { +; CHECK-SKX-LABEL: test7: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-SKX-NEXT: vptestnmw %ymm5, %ymm5, %k1 +; CHECK-SKX-NEXT: vptestnmw %ymm0, %ymm0, %k2 +; CHECK-SKX-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test7: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-KNL-NEXT: vpxor %xmm6, %xmm6, %xmm6 +; CHECK-KNL-NEXT: vpcmpeqw %ymm6, %ymm5, %ymm5 +; CHECK-KNL-NEXT: vpmovsxwd %ymm5, %zmm5 +; CHECK-KNL-NEXT: vptestmd %zmm5, %zmm5, %k1 +; CHECK-KNL-NEXT: vpcmpeqw %ymm6, %ymm0, %ymm0 +; CHECK-KNL-NEXT: vpmovsxwd %ymm0, %zmm0 +; CHECK-KNL-NEXT: vptestmd %zmm0, %zmm0, %k2 +; CHECK-KNL-NEXT: vpblendmd %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-KNL-NEXT: vpblendmd %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-KNL-NEXT: retq + %c = icmp eq <32 x i16> %x, zeroinitializer + %ret = select <32 x i1> %c, <32 x i32> %a, <32 x i32> %b + ret <32 x i32> %ret +} + +define <64 x i16> @test8(<64 x i8> %x, <64 x i16> %a, <64 x i16> %b) { +; CHECK-SKX-LABEL: test8: +; CHECK-SKX: # %bb.0: +; CHECK-SKX-NEXT: vextracti64x4 $1, %zmm0, %ymm5 +; CHECK-SKX-NEXT: vptestnmb %ymm5, %ymm5, %k1 +; CHECK-SKX-NEXT: vptestnmb %ymm0, %ymm0, %k2 +; CHECK-SKX-NEXT: vpblendmw %zmm1, %zmm3, %zmm0 {%k2} +; CHECK-SKX-NEXT: vpblendmw %zmm2, %zmm4, %zmm1 {%k1} +; CHECK-SKX-NEXT: retq +; +; CHECK-KNL-LABEL: test8: +; CHECK-KNL: # %bb.0: +; CHECK-KNL-NEXT: pushq %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa_offset 16 +; CHECK-KNL-NEXT: .cfi_offset %rbp, -16 +; CHECK-KNL-NEXT: movq %rsp, %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa_register %rbp +; CHECK-KNL-NEXT: andq $-32, %rsp +; CHECK-KNL-NEXT: subq $32, %rsp +; CHECK-KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm8 +; CHECK-KNL-NEXT: vmovdqa 16(%rbp), %ymm9 +; CHECK-KNL-NEXT: vpxor %xmm10, %xmm10, %xmm10 +; CHECK-KNL-NEXT: vpcmpeqb %ymm10, %ymm0, %ymm11 +; CHECK-KNL-NEXT: vpmovsxbw %xmm11, %ymm0 +; CHECK-KNL-NEXT: vpblendvb %ymm0, %ymm1, %ymm5, %ymm0 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm11, %xmm1 +; CHECK-KNL-NEXT: vpmovsxbw %xmm1, %ymm1 +; CHECK-KNL-NEXT: vpblendvb %ymm1, %ymm2, %ymm6, %ymm1 +; CHECK-KNL-NEXT: vpcmpeqb %ymm10, %ymm8, %ymm5 +; CHECK-KNL-NEXT: vpmovsxbw %xmm5, %ymm2 +; CHECK-KNL-NEXT: vpblendvb %ymm2, %ymm3, %ymm7, %ymm2 +; CHECK-KNL-NEXT: vextracti128 $1, %ymm5, %xmm3 +; CHECK-KNL-NEXT: vpmovsxbw %xmm3, %ymm3 +; CHECK-KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm9, %ymm3 +; CHECK-KNL-NEXT: movq %rbp, %rsp +; CHECK-KNL-NEXT: popq %rbp +; CHECK-KNL-NEXT: .cfi_def_cfa %rsp, 8 +; CHECK-KNL-NEXT: retq + %c = icmp eq <64 x i8> %x, zeroinitializer + %ret = select 
<64 x i1> %c, <64 x i16> %a, <64 x i16> %b + ret <64 x i16> %ret +} diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll index b69525deb41e..deb261151402 100644 --- a/test/CodeGen/X86/min-legal-vector-width.ll +++ b/test/CodeGen/X86/min-legal-vector-width.ll @@ -982,3 +982,95 @@ define void @sext_v16i8_v16i64(<16 x i8> %x, <16 x i64>* %y) nounwind "min-legal store <16 x i64> %a, <16 x i64>* %y ret void } + +define void @vselect_split_v8i16_setcc(<8 x i16> %s, <8 x i16> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v8i16_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <8 x i64>, <8 x i64>* %p + %y = load <8 x i64>, <8 x i64>* %q + %a = icmp eq <8 x i16> %s, %t + %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y + store <8 x i64> %b, <8 x i64>* %r + ret void +} + +define void @vselect_split_v8i32_setcc(<8 x i32> %s, <8 x i32> %t, <8 x i64>* %p, <8 x i64>* %q, <8 x i64>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v8i32_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k1 +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpcmpeqd %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovdqa64 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa64 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <8 x i64>, <8 x i64>* %p + %y = load <8 x i64>, <8 x i64>* %q + %a = icmp eq <8 x i32> %s, %t + %b = select <8 x i1> %a, <8 x i64> %x, <8 x i64> %y + store <8 x i64> %b, <8 x i64>* %r + ret void +} + +define void @vselect_split_v16i8_setcc(<16 x i8> %s, <16 x i8> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v16i8_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k1 +; CHECK-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: vpcmpeqb %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <16 x i32>, <16 x i32>* %p + %y = load <16 x i32>, <16 x i32>* %q + %a = icmp eq <16 x i8> %s, %t + %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y + store <16 x i32> %b, <16 x i32>* %r + ret void +} + +define void @vselect_split_v16i16_setcc(<16 x i16> %s, <16 x i16> %t, <16 x i32>* %p, <16 x i32>* %q, <16 x i32>* %r) "min-legal-vector-width"="256" { +; CHECK-LABEL: vselect_split_v16i16_setcc: +; CHECK: # %bb.0: +; CHECK-NEXT: vmovdqa (%rsi), %ymm2 +; CHECK-NEXT: vmovdqa 32(%rsi), %ymm3 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k1 +; CHECK-NEXT: vextracti128 $1, %ymm1, %xmm1 +; CHECK-NEXT: 
vextracti128 $1, %ymm0, %xmm0 +; CHECK-NEXT: vpcmpeqw %xmm1, %xmm0, %k2 +; CHECK-NEXT: vmovdqa32 32(%rdi), %ymm3 {%k2} +; CHECK-NEXT: vmovdqa32 (%rdi), %ymm2 {%k1} +; CHECK-NEXT: vmovdqa %ymm2, (%rdx) +; CHECK-NEXT: vmovdqa %ymm3, 32(%rdx) +; CHECK-NEXT: vzeroupper +; CHECK-NEXT: retq + %x = load <16 x i32>, <16 x i32>* %p + %y = load <16 x i32>, <16 x i32>* %q + %a = icmp eq <16 x i16> %s, %t + %b = select <16 x i1> %a, <16 x i32> %x, <16 x i32> %y + store <16 x i32> %b, <16 x i32>* %r + ret void +} From 863bc82a9a116edd09658a0d3c4a964609f053b6 Mon Sep 17 00:00:00 2001 From: David Bolvansky Date: Wed, 2 Oct 2019 22:49:20 +0000 Subject: [PATCH 19/82] [InstCombine] Transform bcopy to memmove bcopy is still widely used mainly for network apps. Sadly, LLVM has no optimizations for bcopy, but there are some for memmove. Since bcopy == memmove, it is profitable to transform bcopy to memmove and use current optimizations for memmove for free here. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373537 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm/Transforms/Utils/SimplifyLibCalls.h | 1 + lib/Transforms/Utils/SimplifyLibCalls.cpp | 8 ++++++ test/Transforms/InstCombine/bcopy.ll | 25 +++++++++++++++++++ 3 files changed, 34 insertions(+) create mode 100644 test/Transforms/InstCombine/bcopy.ll diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h index b722c47c1cab..88c2ef787ad8 100644 --- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h +++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h @@ -181,6 +181,7 @@ class LibCallSimplifier { Value *optimizeMemSet(CallInst *CI, IRBuilder<> &B); Value *optimizeRealloc(CallInst *CI, IRBuilder<> &B); Value *optimizeWcslen(CallInst *CI, IRBuilder<> &B); + Value *optimizeBCopy(CallInst *CI, IRBuilder<> &B); // Wrapper for all String/Memory Library Call Optimizations Value *optimizeStringMemoryLibCall(CallInst *CI, IRBuilder<> &B); diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 3af754a3eb03..1fb4f28f3364 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -2792,6 +2792,12 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) { return nullptr; } +Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) { + // bcopy(src, dst, n) -> llvm.memmove(dst, src, n) + return B.CreateMemMove(CI->getArgOperand(1), 1, CI->getArgOperand(0), 1, + CI->getArgOperand(2)); +} + bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) { LibFunc Func; SmallString<20> FloatFuncName = FuncName; @@ -2870,6 +2876,8 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI, return optimizeRealloc(CI, Builder); case LibFunc_wcslen: return optimizeWcslen(CI, Builder); + case LibFunc_bcopy: + return optimizeBCopy(CI, Builder); default: break; } diff --git a/test/Transforms/InstCombine/bcopy.ll b/test/Transforms/InstCombine/bcopy.ll new file mode 100644 index 000000000000..6a53bad7eeb0 --- /dev/null +++ b/test/Transforms/InstCombine/bcopy.ll @@ -0,0 +1,25 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -instcombine -S | FileCheck %s + +declare void @bcopy(i8* nocapture readonly, i8* nocapture, i32) + +define void @bcopy_memmove(i8* nocapture readonly %a, i8* nocapture %b) { +; CHECK-LABEL: @bcopy_memmove( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[A:%.*]] to i64* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[B:%.*]] to 
i64* +; CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[TMP1]], align 1 +; CHECK-NEXT: store i64 [[TMP3]], i64* [[TMP2]], align 1 +; CHECK-NEXT: ret void +; + tail call void @bcopy(i8* %a, i8* %b, i32 8) + ret void +} + +define void @bcopy_memmove2(i8* nocapture readonly %a, i8* nocapture %b, i32 %len) { +; CHECK-LABEL: @bcopy_memmove2( +; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i32(i8* align 1 [[B:%.*]], i8* align 1 [[A:%.*]], i32 [[LEN:%.*]], i1 false) +; CHECK-NEXT: ret void +; + tail call void @bcopy(i8* %a, i8* %b, i32 %len) + ret void +} From 84c9bb2b3e4b6eca45abbab19c357272ad97f955 Mon Sep 17 00:00:00 2001 From: David Blaikie Date: Wed, 2 Oct 2019 22:58:02 +0000 Subject: [PATCH 20/82] DebugInfo: Rename DebugLocStream::Entry::Begin/EndSym to just Begin/End Brings this struct in line with the RangeSpan class so they might eventually be used by common template code for generating range/loc lists with less duplicate code. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373540 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/AsmPrinter/DebugLocStream.h | 10 +++------- lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 20 ++++++++++---------- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/lib/CodeGen/AsmPrinter/DebugLocStream.h b/lib/CodeGen/AsmPrinter/DebugLocStream.h index 789291771b5a..a062baf7698a 100644 --- a/lib/CodeGen/AsmPrinter/DebugLocStream.h +++ b/lib/CodeGen/AsmPrinter/DebugLocStream.h @@ -38,14 +38,10 @@ class DebugLocStream { : CU(CU), EntryOffset(EntryOffset) {} }; struct Entry { - const MCSymbol *BeginSym; - const MCSymbol *EndSym; + const MCSymbol *Begin; + const MCSymbol *End; size_t ByteOffset; size_t CommentOffset; - Entry(const MCSymbol *BeginSym, const MCSymbol *EndSym, size_t ByteOffset, - size_t CommentOffset) - : BeginSym(BeginSym), EndSym(EndSym), ByteOffset(ByteOffset), - CommentOffset(CommentOffset) {} }; private: @@ -93,7 +89,7 @@ class DebugLocStream { /// Until the next call, bytes added to the stream will be added to this /// entry. void startEntry(const MCSymbol *BeginSym, const MCSymbol *EndSym) { - Entries.emplace_back(BeginSym, EndSym, DWARFBytes.size(), Comments.size()); + Entries.push_back({BeginSym, EndSym, DWARFBytes.size(), Comments.size()}); } /// Finalize a .debug_loc entry, deleting if it's empty. 
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 4501f46dceb7..aeca172cfdb4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -2325,12 +2325,12 @@ void DwarfDebug::emitDebugLoc() { Asm->OutStreamer->AddComment("DW_LLE_offset_pair"); Asm->OutStreamer->EmitIntValue(dwarf::DW_LLE_offset_pair, 1); Asm->OutStreamer->AddComment(" starting offset"); - Asm->EmitLabelDifferenceAsULEB128(Entry.BeginSym, Base); + Asm->EmitLabelDifferenceAsULEB128(Entry.Begin, Base); Asm->OutStreamer->AddComment(" ending offset"); - Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Base); + Asm->EmitLabelDifferenceAsULEB128(Entry.End, Base); } else { - Asm->EmitLabelDifference(Entry.BeginSym, Base, Size); - Asm->EmitLabelDifference(Entry.EndSym, Base, Size); + Asm->EmitLabelDifference(Entry.Begin, Base, Size); + Asm->EmitLabelDifference(Entry.End, Base, Size); } emitDebugLocEntryLocation(Entry, CU); @@ -2346,12 +2346,12 @@ void DwarfDebug::emitDebugLoc() { Asm->OutStreamer->AddComment("DW_LLE_startx_length"); Asm->emitInt8(dwarf::DW_LLE_startx_length); Asm->OutStreamer->AddComment(" start idx"); - Asm->EmitULEB128(AddrPool.getIndex(Entry.BeginSym)); + Asm->EmitULEB128(AddrPool.getIndex(Entry.Begin)); Asm->OutStreamer->AddComment(" length"); - Asm->EmitLabelDifferenceAsULEB128(Entry.EndSym, Entry.BeginSym); + Asm->EmitLabelDifferenceAsULEB128(Entry.End, Entry.Begin); } else { - Asm->OutStreamer->EmitSymbolValue(Entry.BeginSym, Size); - Asm->OutStreamer->EmitSymbolValue(Entry.EndSym, Size); + Asm->OutStreamer->EmitSymbolValue(Entry.Begin, Size); + Asm->OutStreamer->EmitSymbolValue(Entry.End, Size); } emitDebugLocEntryLocation(Entry, CU); @@ -2386,9 +2386,9 @@ void DwarfDebug::emitDebugLocDWO() { // Ideally/in v5, this could use SectionLabels to reuse existing addresses // in the address pool to minimize object size/relocations. 
Asm->emitInt8(dwarf::DW_LLE_startx_length); - unsigned idx = AddrPool.getIndex(Entry.BeginSym); + unsigned idx = AddrPool.getIndex(Entry.Begin); Asm->EmitULEB128(idx); - Asm->EmitLabelDifference(Entry.EndSym, Entry.BeginSym, 4); + Asm->EmitLabelDifference(Entry.End, Entry.Begin, 4); emitDebugLocEntryLocation(Entry, List.CU); } From 3607fde43ebda11daa1da0e913762552d28c3e5e Mon Sep 17 00:00:00 2001 From: Roman Lebedev Date: Wed, 2 Oct 2019 23:01:58 +0000 Subject: [PATCH 21/82] [NFC][InstCombine] Add tests for 'variable sext of variable high bit extract' pattern (PR43523) https://bugs.llvm.org/show_bug.cgi?id=43523 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373541 91177308-0d34-0410-b5e6-96231b3b80d8 --- ...signext-of-variable-high-bit-extraction.ll | 584 ++++++++++++++++++ 1 file changed, 584 insertions(+) create mode 100644 test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll diff --git a/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll b/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll new file mode 100644 index 000000000000..61343c7feb8a --- /dev/null +++ b/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll @@ -0,0 +1,584 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt %s -instcombine -S | FileCheck %s + +declare void @use16(i16) +declare void @use32(i32) +declare void @use64(i64) + +define i32 @t0(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t0_zext_of_nbits(i64 %data, i8 %nbits_narrow) { +; CHECK-LABEL: @t0_zext_of_nbits( +; CHECK-NEXT: [[NBITS:%.*]] = zext i8 [[NBITS_NARROW:%.*]] to i16 +; CHECK-NEXT: call void @use16(i16 [[NBITS]]) +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub nsw i16 64, [[NBITS]] +; CHECK-NEXT: call void @use16(i16 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i16 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = 
lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW:%.*]] = sub nsw i16 32, [[NBITS]] +; CHECK-NEXT: call void @use16(i16 [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = zext i16 [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW]] to i32 +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %nbits = zext i8 %nbits_narrow to i16 + call void @use16(i16 %nbits) + %skip_high = sub i16 64, %nbits + call void @use16(i16 %skip_high) + %skip_high_wide = zext i16 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow_narrow = sub i16 32, %nbits + call void @use16(i16 %num_high_bits_to_smear_narrow_narrow) + %num_high_bits_to_smear_narrow = zext i16 %num_high_bits_to_smear_narrow_narrow to i32 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t0_exact(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t0_exact( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr exact i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr exact i64 %data, %skip_high_wide ; We can preserve `exact`-ness of the original shift. 
+ call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @t1_redundant_sext(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t1_redundant_sext( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED_WITH_SIGNEXTENSION:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED_WITH_SIGNEXTENSION]]) +; CHECK-NEXT: [[EXTRACTED_WITH_SIGNEXTENSION_NARROW:%.*]] = trunc i64 [[EXTRACTED_WITH_SIGNEXTENSION]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted_with_signextension = ashr i64 %data, %skip_high_wide + call void @use64(i64 %extracted_with_signextension) + %extracted_with_signextension_narrow = trunc i64 %extracted_with_signextension to i32 ; this is already the answer. 
+ call void @use32(i32 %extracted_with_signextension_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_with_signextension_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i64 @t2_notrunc(i64 %data, i64 %nbits) { +; CHECK-LABEL: @t2_notrunc( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i64 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR:%.*]] = sub i64 64, [[NBITS]] +; CHECK-NEXT: call void @use64(i64 [[NUM_HIGH_BITS_TO_SMEAR]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i64 [[EXTRACTED]], [[NUM_HIGH_BITS_TO_SMEAR]] +; CHECK-NEXT: call void @use64(i64 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i64 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR]] +; CHECK-NEXT: ret i64 [[SIGNEXTENDED]] +; + %skip_high = sub i64 64, %nbits + call void @use64(i64 %skip_high) + %extracted = lshr i64 %data, %skip_high + call void @use64(i64 %extracted) + %num_high_bits_to_smear = sub i64 64, %nbits + call void @use64(i64 %num_high_bits_to_smear) + %signbit_positioned = shl i64 %extracted, %num_high_bits_to_smear ; + call void @use64(i64 %signbit_positioned) + %signextended = ashr i64 %signbit_positioned, %num_high_bits_to_smear ; can just shift %data itself. + ret i64 %signextended +} + +define i64 @t3_notrunc_redundant_sext(i64 %data, i64 %nbits) { +; CHECK-LABEL: @t3_notrunc_redundant_sext( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i64 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR:%.*]] = sub i64 64, [[NBITS]] +; CHECK-NEXT: call void @use64(i64 [[NUM_HIGH_BITS_TO_SMEAR]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i64 [[EXTRACTED]], [[NUM_HIGH_BITS_TO_SMEAR]] +; CHECK-NEXT: call void @use64(i64 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i64 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR]] +; CHECK-NEXT: ret i64 [[SIGNEXTENDED]] +; + %skip_high = sub i64 64, %nbits + call void @use64(i64 %skip_high) + %extracted = ashr i64 %data, %skip_high ; this is already the answer. 
+  call void @use64(i64 %extracted)
+  %num_high_bits_to_smear = sub i64 64, %nbits
+  call void @use64(i64 %num_high_bits_to_smear)
+  %signbit_positioned = shl i64 %extracted, %num_high_bits_to_smear
+  call void @use64(i64 %signbit_positioned)
+  %signextended = ashr i64 %signbit_positioned, %num_high_bits_to_smear
+  ret i64 %signextended
+}
+
+define <2 x i32> @t4_vec(<2 x i64> %data, <2 x i32> %nbits) {
+; CHECK-LABEL: @t4_vec(
+; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <2 x i32> <i32 64, i32 64>, [[NBITS:%.*]]
+; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext <2 x i32> [[SKIP_HIGH]] to <2 x i64>
+; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr <2 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc <2 x i64> [[EXTRACTED]] to <2 x i32>
+; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub <2 x i32> <i32 32, i32 32>, [[NBITS]]
+; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl <2 x i32> [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
+; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr <2 x i32> [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
+; CHECK-NEXT: ret <2 x i32> [[SIGNEXTENDED]]
+;
+  %skip_high = sub <2 x i32> <i32 64, i32 64>, %nbits
+  %skip_high_wide = zext <2 x i32> %skip_high to <2 x i64>
+  %extracted = lshr <2 x i64> %data, %skip_high_wide
+  %extracted_narrow = trunc <2 x i64> %extracted to <2 x i32>
+  %num_high_bits_to_smear_narrow = sub <2 x i32> <i32 32, i32 32>, %nbits
+  %signbit_positioned = shl <2 x i32> %extracted_narrow, %num_high_bits_to_smear_narrow
+  %signextended = ashr <2 x i32> %signbit_positioned, %num_high_bits_to_smear_narrow
+  ret <2 x i32> %signextended
+}
+
+define <3 x i32> @t5_vec_undef(<3 x i64> %data, <3 x i32> %nbits) {
+; CHECK-LABEL: @t5_vec_undef(
+; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <3 x i32> <i32 64, i32 undef, i32 64>, [[NBITS:%.*]]
+; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext <3 x i32> [[SKIP_HIGH]] to <3 x i64>
+; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr <3 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc <3 x i64> [[EXTRACTED]] to <3 x i32>
+; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW0:%.*]] = sub <3 x i32> <i32 32, i32 undef, i32 32>, [[NBITS]]
+; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW1:%.*]] = sub <3 x i32> <i32 32, i32 undef, i32 32>, [[NBITS]]
+; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl <3 x i32> [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]]
+; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr <3 x i32> [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]]
+; CHECK-NEXT: ret <3 x i32> [[SIGNEXTENDED]]
+;
+  %skip_high = sub <3 x i32> <i32 64, i32 undef, i32 64>, %nbits
+  %skip_high_wide = zext <3 x i32> %skip_high to <3 x i64>
+  %extracted = lshr <3 x i64> %data, %skip_high_wide
+  %extracted_narrow = trunc <3 x i64> %extracted to <3 x i32>
+  %num_high_bits_to_smear_narrow0 = sub <3 x i32> <i32 32, i32 undef, i32 32>, %nbits
+  %num_high_bits_to_smear_narrow1 = sub <3 x i32> <i32 32, i32 undef, i32 32>, %nbits
+  %signbit_positioned = shl <3 x i32> %extracted_narrow, %num_high_bits_to_smear_narrow0
+  %signextended = ashr <3 x i32> %signbit_positioned, %num_high_bits_to_smear_narrow1
+  ret <3 x i32> %signextended
+}
+
+; Extra-uses
+define i32 @t6_extrause_good0(i64 %data, i32 %nbits) {
+; CHECK-LABEL: @t6_extrause_good0(
+; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]]
+; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]])
+; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64
+; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]])
+; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]])
+; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32
+; CHECK-NEXT: call void @use32(i32
[[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow ; will go away + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} +define i32 @t7_extrause_good1(i64 %data, i32 %nbits) { +; CHECK-LABEL: @t7_extrause_good1( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW0:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW1:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow0 = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow0) + %num_high_bits_to_smear_narrow1 = sub i32 32, %nbits ; will go away. 
+ %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow0 + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow1 + ret i32 %signextended +} +define i32 @n8_extrause_bad(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n8_extrause_bad( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]]) +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %signbit_positioned) + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow ; neither of operands will go away. 
+ ret i32 %signextended +} + +; Negative tests +define i32 @n9(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n9( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 63, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 63, %nbits ; not 64 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n10(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n10( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 31, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 31, %nbits ; not 32 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n11(i64 %data, i32 %nbits1, i32 %nbits2) { +; CHECK-LABEL: @n11( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS1:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 
[[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS2:%.*]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits1 ; not %nbits2 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits2 ; not %nbits1 + call void @use32(i32 %num_high_bits_to_smear_narrow) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow + ret i32 %signextended +} + +define i32 @n12(i64 %data, i32 %nbits1, i32 %nbits2) { +; CHECK-LABEL: @n12( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS1:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW1:%.*]] = sub i32 32, [[NBITS1]] +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW2:%.*]] = sub i32 32, [[NBITS2:%.*]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]]) +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW2]]) +; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW2]] +; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] +; + %skip_high = sub i32 64, %nbits1 + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow1 = sub i32 32, %nbits1 ; not %nbits2 + %num_high_bits_to_smear_narrow2 = sub i32 32, %nbits2 ; not %nbits1 + call void @use32(i32 %num_high_bits_to_smear_narrow1) + call void @use32(i32 %num_high_bits_to_smear_narrow2) + %signbit_positioned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow1 + %signextended = ashr i32 %signbit_positioned, %num_high_bits_to_smear_narrow2 + ret i32 %signextended +} + +define i32 @n13(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n13( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 
[[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 -1, [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[RES:%.*]] = and i32 [[TMP1]], [[EXTRACTED_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} +define i32 @n13_extrause(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n13_extrause( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) +; CHECK-NEXT: [[HIGHBITS_CLEANED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: call void @use32(i32 [[HIGHBITS_CLEANED]]) +; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[HIGHBITS_CLEANED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: ret i32 [[RES]] +; + %skip_high = sub i32 64, %nbits + call void @use32(i32 %skip_high) + %skip_high_wide = zext i32 %skip_high to i64 + call void @use64(i64 %skip_high_wide) + %extracted = lshr i64 %data, %skip_high_wide + call void @use64(i64 %extracted) + %extracted_narrow = trunc i64 %extracted to i32 + call void @use32(i32 %extracted_narrow) + %num_high_bits_to_smear_narrow = sub i32 32, %nbits + call void @use32(i32 %num_high_bits_to_smear_narrow) + %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow + call void @use32(i32 %highbits_cleaned) + %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr + ret i32 %res +} +define i32 @n14(i64 %data, i32 %nbits) { +; CHECK-LABEL: @n14( +; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]] +; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]]) +; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64 +; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]]) +; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]]) +; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32 +; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) +; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] +; CHECK-NEXT: 
call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]])
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i32 -1, [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
+; CHECK-NEXT: [[RES:%.*]] = and i32 [[TMP1]], [[EXTRACTED_NARROW]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %skip_high = sub i32 64, %nbits
+ call void @use32(i32 %skip_high)
+ %skip_high_wide = zext i32 %skip_high to i64
+ call void @use64(i64 %skip_high_wide)
+ %extracted = ashr i64 %data, %skip_high_wide
+ call void @use64(i64 %extracted)
+ %extracted_narrow = trunc i64 %extracted to i32
+ call void @use32(i32 %extracted_narrow)
+ %num_high_bits_to_smear_narrow = sub i32 32, %nbits
+ call void @use32(i32 %num_high_bits_to_smear_narrow)
+ %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow
+ %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr
+ ret i32 %res
+}
+define i32 @n14_extrause(i64 %data, i32 %nbits) {
+; CHECK-LABEL: @n14_extrause(
+; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub i32 64, [[NBITS:%.*]]
+; CHECK-NEXT: call void @use32(i32 [[SKIP_HIGH]])
+; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext i32 [[SKIP_HIGH]] to i64
+; CHECK-NEXT: call void @use64(i64 [[SKIP_HIGH_WIDE]])
+; CHECK-NEXT: [[EXTRACTED:%.*]] = ashr i64 [[DATA:%.*]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: call void @use64(i64 [[EXTRACTED]])
+; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc i64 [[EXTRACTED]] to i32
+; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]])
+; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]]
+; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]])
+; CHECK-NEXT: [[HIGHBITS_CLEANED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
+; CHECK-NEXT: call void @use32(i32 [[HIGHBITS_CLEANED]])
+; CHECK-NEXT: [[RES:%.*]] = lshr i32 [[HIGHBITS_CLEANED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+ %skip_high = sub i32 64, %nbits
+ call void @use32(i32 %skip_high)
+ %skip_high_wide = zext i32 %skip_high to i64
+ call void @use64(i64 %skip_high_wide)
+ %extracted = ashr i64 %data, %skip_high_wide
+ call void @use64(i64 %extracted)
+ %extracted_narrow = trunc i64 %extracted to i32
+ call void @use32(i32 %extracted_narrow)
+ %num_high_bits_to_smear_narrow = sub i32 32, %nbits
+ call void @use32(i32 %num_high_bits_to_smear_narrow)
+ %highbits_cleaned = shl i32 %extracted_narrow, %num_high_bits_to_smear_narrow
+ call void @use32(i32 %highbits_cleaned)
+ %res = lshr i32 %highbits_cleaned, %num_high_bits_to_smear_narrow ; not ashr
+ ret i32 %res
+}

From a213b6762a51e964cb97d4749291a859096805bd Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Wed, 2 Oct 2019 23:02:12 +0000
Subject: [PATCH 22/82] [InstCombine] Bypass high bit extract before variable
 sign-extension (PR43523)

https://rise4fun.com/Alive/8BY - valid for lshr+trunc+variable sext
https://rise4fun.com/Alive/7jk - the variable sext can be redundant
https://rise4fun.com/Alive/Qslu - 'exact'-ness of first shift can be preserved
https://rise4fun.com/Alive/IF63 - without trunc we could view this as more
 general "drop redundant mask before right-shift", but let's handle it here
 for now
https://rise4fun.com/Alive/iip - likewise, without trunc, variable sext can be
 redundant.

There are more patterns for sure - e.g. we can have 'lshr' as the final shift,
but that might be best handled by some more generic transform, e.g.
"drop redundant masking before right-shift" (PR42456) I'm singling-out this sext patch because you can only extract high bits with `*shr` (unlike abstract bit masking), and i *know* this fold is wanted by existing code. I don't believe there is much to review here, so i'm gonna opt into post-review mode here. https://bugs.llvm.org/show_bug.cgi?id=43523 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373542 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../InstCombine/InstCombineInternal.h | 2 + .../InstCombine/InstCombineShifts.cpp | 72 +++++++++++++++++++ ...signext-of-variable-high-bit-extraction.ll | 43 +++++------ 3 files changed, 91 insertions(+), 26 deletions(-) diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 673099436b79..dcdbee15fe56 100644 --- a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -351,6 +351,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombiner Instruction *visitOr(BinaryOperator &I); Instruction *visitXor(BinaryOperator &I); Instruction *visitShl(BinaryOperator &I); + Instruction *foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr); Instruction *visitAShr(BinaryOperator &I); Instruction *visitLShr(BinaryOperator &I); Instruction *commonShiftTransforms(BinaryOperator &I); diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp index bc4affbecdfa..9d96ddc4040d 100644 --- a/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -1039,6 +1039,75 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) { return nullptr; } +Instruction * +InstCombiner::foldVariableSignZeroExtensionOfVariableHighBitExtract( + BinaryOperator &OldAShr) { + assert(OldAShr.getOpcode() == Instruction::AShr && + "Must be called with arithmetic right-shift instruction only."); + + // Check that constant C is a splat of the element-wise bitwidth of V. + auto BitWidthSplat = [](Constant *C, Value *V) { + return match( + C, m_SpecificInt_ICMP(ICmpInst::Predicate::ICMP_EQ, + APInt(C->getType()->getScalarSizeInBits(), + V->getType()->getScalarSizeInBits()))); + }; + + // It should look like variable-length sign-extension on the outside: + // (Val << (bitwidth(Val)-Nbits)) a>> (bitwidth(Val)-Nbits) + Value *NBits; + Instruction *MaybeTrunc; + Constant *C1, *C2; + if (!match(&OldAShr, + m_AShr(m_Shl(m_Instruction(MaybeTrunc), + m_ZExtOrSelf(m_Sub(m_Constant(C1), + m_ZExtOrSelf(m_Value(NBits))))), + m_ZExtOrSelf(m_Sub(m_Constant(C2), + m_ZExtOrSelf(m_Deferred(NBits)))))) || + !BitWidthSplat(C1, &OldAShr) || !BitWidthSplat(C2, &OldAShr)) + return nullptr; + + // There may or may not be a truncation after outer two shifts. + Instruction *HighBitExtract; + match(MaybeTrunc, m_TruncOrSelf(m_Instruction(HighBitExtract))); + bool HadTrunc = MaybeTrunc != HighBitExtract; + + // And finally, the innermost part of the pattern must be a right-shift. + Value *X, *NumLowBitsToSkip; + if (!match(HighBitExtract, m_Shr(m_Value(X), m_Value(NumLowBitsToSkip)))) + return nullptr; + + // Said right-shift must extract high NBits bits - C0 must be it's bitwidth. 
+ Constant *C0; + if (!match(NumLowBitsToSkip, + m_ZExtOrSelf( + m_Sub(m_Constant(C0), m_ZExtOrSelf(m_Specific(NBits))))) || + !BitWidthSplat(C0, HighBitExtract)) + return nullptr; + + // Since the NBits is identical for all shifts, if the outermost and + // innermost shifts are identical, then outermost shifts are redundant. + // If we had truncation, do keep it though. + if (HighBitExtract->getOpcode() == OldAShr.getOpcode()) + return replaceInstUsesWith(OldAShr, MaybeTrunc); + + // Else, if there was a truncation, then we need to ensure that one + // instruction will go away. + if (HadTrunc && !match(&OldAShr, m_c_BinOp(m_OneUse(m_Value()), m_Value()))) + return nullptr; + + // Finally, bypass two innermost shifts, and perform the outermost shift on + // the operands of the innermost shift. + Instruction *NewAShr = + BinaryOperator::Create(OldAShr.getOpcode(), X, NumLowBitsToSkip); + NewAShr->copyIRFlags(HighBitExtract); // We can preserve 'exact'-ness. + if (!HadTrunc) + return NewAShr; + + Builder.Insert(NewAShr); + return TruncInst::CreateTruncOrBitCast(NewAShr, OldAShr.getType()); +} + Instruction *InstCombiner::visitAShr(BinaryOperator &I) { if (Value *V = SimplifyAShrInst(I.getOperand(0), I.getOperand(1), I.isExact(), SQ.getWithInstruction(&I))) @@ -1113,6 +1182,9 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) { } } + if (Instruction *R = foldVariableSignZeroExtensionOfVariableHighBitExtract(I)) + return R; + // See if we can turn a signed shr into an unsigned shr. if (MaskedValueIsZero(Op0, APInt::getSignMask(BitWidth), 0, &I)) return BinaryOperator::CreateLShr(Op0, Op1); diff --git a/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll b/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll index 61343c7feb8a..a5f38735a373 100644 --- a/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll +++ b/test/Transforms/InstCombine/variable-signext-of-variable-high-bit-extraction.ll @@ -17,8 +17,8 @@ define i32 @t0(i64 %data, i32 %nbits) { ; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]]) ; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]] ; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) -; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] -; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] ; %skip_high = sub i32 64, %nbits @@ -51,8 +51,8 @@ define i32 @t0_zext_of_nbits(i64 %data, i8 %nbits_narrow) { ; CHECK-NEXT: call void @use16(i16 [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW]]) ; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = zext i16 [[NUM_HIGH_BITS_TO_SMEAR_NARROW_NARROW]] to i32 ; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]) -; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] -; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]] +; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]] +; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: ret i32 [[SIGNEXTENDED]] ; %nbits = zext i8 %nbits_narrow to i16 @@ -85,8 +85,8 @@ define i32 @t0_exact(i64 %data, i32 %nbits) { ; CHECK-NEXT: call void @use32(i32 
[[EXTRACTED_NARROW]])
 ; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]]
 ; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]])
-; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
-; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
+; CHECK-NEXT: [[TMP1:%.*]] = ashr exact i64 [[DATA]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32
 ; CHECK-NEXT: ret i32 [[SIGNEXTENDED]]
 ;
 %skip_high = sub i32 64, %nbits
@@ -118,8 +118,7 @@ define i32 @t1_redundant_sext(i64 %data, i32 %nbits) {
 ; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]])
 ; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
 ; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]])
-; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
-; CHECK-NEXT: ret i32 [[SIGNEXTENDED]]
+; CHECK-NEXT: ret i32 [[EXTRACTED_WITH_SIGNEXTENSION_NARROW]]
 ;
 %skip_high = sub i32 64, %nbits
 call void @use32(i32 %skip_high)
@@ -147,7 +146,7 @@ define i64 @t2_notrunc(i64 %data, i64 %nbits) {
 ; CHECK-NEXT: call void @use64(i64 [[NUM_HIGH_BITS_TO_SMEAR]])
 ; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i64 [[EXTRACTED]], [[NUM_HIGH_BITS_TO_SMEAR]]
 ; CHECK-NEXT: call void @use64(i64 [[SIGNBIT_POSITIONED]])
-; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i64 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR]]
+; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH]]
 ; CHECK-NEXT: ret i64 [[SIGNEXTENDED]]
 ;
 %skip_high = sub i64 64, %nbits
@@ -172,8 +171,7 @@ define i64 @t3_notrunc_redundant_sext(i64 %data, i64 %nbits) {
 ; CHECK-NEXT: call void @use64(i64 [[NUM_HIGH_BITS_TO_SMEAR]])
 ; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i64 [[EXTRACTED]], [[NUM_HIGH_BITS_TO_SMEAR]]
 ; CHECK-NEXT: call void @use64(i64 [[SIGNBIT_POSITIONED]])
-; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i64 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR]]
-; CHECK-NEXT: ret i64 [[SIGNEXTENDED]]
+; CHECK-NEXT: ret i64 [[EXTRACTED]]
 ;
 %skip_high = sub i64 64, %nbits
 call void @use64(i64 %skip_high)
@@ -191,11 +189,8 @@ define <2 x i32> @t4_vec(<2 x i64> %data, <2 x i32> %nbits) {
 ; CHECK-LABEL: @t4_vec(
 ; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <2 x i32> <i32 64, i32 64>, [[NBITS:%.*]]
 ; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext <2 x i32> [[SKIP_HIGH]] to <2 x i64>
-; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr <2 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]]
-; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc <2 x i64> [[EXTRACTED]] to <2 x i32>
-; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub <2 x i32> <i32 32, i32 32>, [[NBITS]]
-; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl <2 x i32> [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
-; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr <2 x i32> [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <2 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc <2 x i64> [[TMP1]] to <2 x i32>
 ; CHECK-NEXT: ret <2 x i32> [[SIGNEXTENDED]]
 ;
 %skip_high = sub <2 x i32> <i32 64, i32 64>, %nbits
@@ -212,12 +207,8 @@ define <3 x i32> @t5_vec_undef(<3 x i64> %data, <3 x i32> %nbits) {
 ; CHECK-LABEL: @t5_vec_undef(
 ; CHECK-NEXT: [[SKIP_HIGH:%.*]] = sub <3 x i32> <i32 64, i32 undef, i32 64>, [[NBITS:%.*]]
 ; CHECK-NEXT: [[SKIP_HIGH_WIDE:%.*]] = zext <3 x i32> [[SKIP_HIGH]] to <3 x i64>
-; CHECK-NEXT: [[EXTRACTED:%.*]] = lshr <3 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]]
-; CHECK-NEXT: [[EXTRACTED_NARROW:%.*]] = trunc <3 x i64> [[EXTRACTED]] to <3 x i32>
-; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW0:%.*]] = sub <3 x i32> <i32 32, i32 undef, i32 32>, [[NBITS]]
-; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW1:%.*]] = sub <3 x i32> <i32 32, i32 undef, i32 32>, [[NBITS]]
-; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl <3 x i32> [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]]
-; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr <3 x i32> [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]]
+; CHECK-NEXT: [[TMP1:%.*]] = ashr <3 x i64> [[DATA:%.*]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc <3 x i64> [[TMP1]] to <3 x i32>
 ; CHECK-NEXT: ret <3 x i32> [[SIGNEXTENDED]]
 ;
 %skip_high = sub <3 x i32> <i32 64, i32 undef, i32 64>, %nbits
@@ -244,8 +235,8 @@ define i32 @t6_extrause_good0(i64 %data, i32 %nbits) {
 ; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]])
 ; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW:%.*]] = sub i32 32, [[NBITS]]
 ; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW]])
-; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
-; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW]]
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32
 ; CHECK-NEXT: ret i32 [[SIGNEXTENDED]]
 ;
 %skip_high = sub i32 64, %nbits
@@ -274,10 +265,10 @@ define i32 @t7_extrause_good1(i64 %data, i32 %nbits) {
 ; CHECK-NEXT: call void @use32(i32 [[EXTRACTED_NARROW]])
 ; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW0:%.*]] = sub i32 32, [[NBITS]]
 ; CHECK-NEXT: call void @use32(i32 [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]])
-; CHECK-NEXT: [[NUM_HIGH_BITS_TO_SMEAR_NARROW1:%.*]] = sub i32 32, [[NBITS]]
 ; CHECK-NEXT: [[SIGNBIT_POSITIONED:%.*]] = shl i32 [[EXTRACTED_NARROW]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW0]]
 ; CHECK-NEXT: call void @use32(i32 [[SIGNBIT_POSITIONED]])
-; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = ashr i32 [[SIGNBIT_POSITIONED]], [[NUM_HIGH_BITS_TO_SMEAR_NARROW1]]
+; CHECK-NEXT: [[TMP1:%.*]] = ashr i64 [[DATA]], [[SKIP_HIGH_WIDE]]
+; CHECK-NEXT: [[SIGNEXTENDED:%.*]] = trunc i64 [[TMP1]] to i32
 ; CHECK-NEXT: ret i32 [[SIGNEXTENDED]]
 ;
 %skip_high = sub i32 64, %nbits

From e49e59bb45e26f9983847f18dba32d2cfd986da7 Mon Sep 17 00:00:00 2001
From: Daniel Sanders
Date: Wed, 2 Oct 2019 23:03:21 +0000
Subject: [PATCH 23/82] [gicombiner] Fix a nullptr dereference when -combiners
 is given a name that isn't defined

This is unlikely to be the root cause for the windows bot failures but it
would explain the stack trace seen.
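For reference, the crash mechanism: RecordKeeper::getDef() returns nullptr
when no definition with the requested name exists, and the old constructor
stored that result unchecked. A minimal sketch of the guarded-lookup pattern
this change adopts (an illustrative fragment with an assumed helper name,
lookupCombiner; the actual change is in the diff below):

    #include "llvm/TableGen/Error.h"
    #include "llvm/TableGen/Record.h"

    // Look up a combiner definition by name, failing with a proper
    // diagnostic instead of passing a null Record* on to be dereferenced.
    static llvm::Record *lookupCombiner(llvm::RecordKeeper &RK,
                                        llvm::StringRef Name) {
      llvm::Record *CombinerDef = RK.getDef(Name); // nullptr if undefined
      if (!CombinerDef)
        llvm::PrintFatalError("Could not find " + Name.str());
      return CombinerDef;
    }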
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373543 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/TableGen/GICombinerEmitter.cpp | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/utils/TableGen/GICombinerEmitter.cpp b/utils/TableGen/GICombinerEmitter.cpp
index 7a9c87b6b936..a85462b5aa89 100644
--- a/utils/TableGen/GICombinerEmitter.cpp
+++ b/utils/TableGen/GICombinerEmitter.cpp
@@ -31,7 +31,8 @@ class GICombinerEmitter {
 StringRef Name;
 Record *Combiner;
 public:
- explicit GICombinerEmitter(RecordKeeper &RK, StringRef Name);
+ explicit GICombinerEmitter(RecordKeeper &RK, StringRef Name,
+ Record *Combiner);
 ~GICombinerEmitter() {}

 StringRef getClassName() const {
@@ -41,8 +42,9 @@ class GICombinerEmitter {

 };

-GICombinerEmitter::GICombinerEmitter(RecordKeeper &RK, StringRef Name)
- : Name(Name), Combiner(RK.getDef(Name)) {}
+GICombinerEmitter::GICombinerEmitter(RecordKeeper &RK, StringRef Name,
+ Record *Combiner)
+ : Name(Name), Combiner(Combiner) {}

 void GICombinerEmitter::run(raw_ostream &OS) {
 NamedRegionTimer T("Emit", "Time spent emitting the combiner",
@@ -87,8 +89,12 @@ void EmitGICombiner(RecordKeeper &RK, raw_ostream &OS) {
 if (SelectedCombiners.empty())
 PrintFatalError("No combiners selected with -combiners");
- for (const auto &Combiner : SelectedCombiners)
- GICombinerEmitter(RK, Combiner).run(OS);
+ for (const auto &Combiner : SelectedCombiners) {
+ Record *CombinerDef = RK.getDef(Combiner);
+ if (!CombinerDef)
+ PrintFatalError("Could not find " + Combiner);
+ GICombinerEmitter(RK, Combiner, CombinerDef).run(OS);
+ }
 }
 } // namespace llvm

From 60eeded1a40b0d2132dcc7a2dcfead7e94d4ed52 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin
Date: Wed, 2 Oct 2019 23:23:46 +0000
Subject: [PATCH 24/82] [AMDGPU] Fix illegal agpr use by VALU

When SIFixSGPRCopies attempts to fix an illegal copy from vector to
scalar register it calls moveToVALU(). A copy from an agpr to sgpr
becomes a copy from agpr to agpr, which may result in an illegal
register class at a use of this copy. The solution is to always copy
it into a vgpr. This may result in a subsequent copy into an agpr if
that is what is really needed; however, this should not happen too
often and will likely be folded later.

The opposite situation may not happen because an sgpr is always illegal
where agpr is legal, so such user instructions may not exist.
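Concretely, the destination-class choice in getDestEquivalentVGPRClass()
becomes the following (an annotated sketch of the change shown in full in
the diff below):

    switch (Inst.getOpcode()) {
    case AMDGPU::PHI:
    case AMDGPU::REG_SEQUENCE:
    case AMDGPU::INSERT_SUBREG:
      // These can legally produce an AGPR-class result directly.
      NewDstRC = RI.getEquivalentAGPRClass(NewDstRC);
      break;
    default:
      // Everything else gets a VGPR destination so that no illegal
      // agpr-to-agpr copy is materialized at the use; a later
      // vgpr-to-agpr copy, if one is needed, is legal and foldable.
      NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
    }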
Differential Revision: https://reviews.llvm.org/D68358 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373544 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SIInstrInfo.cpp | 11 ++++++++++- test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir | 15 +++++++++++++++ test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll | 9 ++++++--- 3 files changed, 31 insertions(+), 4 deletions(-) create mode 100644 test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index d5f2902f18a8..7a6bb0e20b79 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5665,7 +5665,16 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass( if (RI.hasAGPRs(NewDstRC)) return nullptr; - NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + switch (Inst.getOpcode()) { + case AMDGPU::PHI: + case AMDGPU::REG_SEQUENCE: + case AMDGPU::INSERT_SUBREG: + NewDstRC = RI.getEquivalentAGPRClass(NewDstRC); + break; + default: + NewDstRC = RI.getEquivalentVGPRClass(NewDstRC); + } + if (!NewDstRC) return nullptr; } else { diff --git a/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir b/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir new file mode 100644 index 000000000000..11af6e19ecb2 --- /dev/null +++ b/test/CodeGen/AMDGPU/fold_acc_copy_into_valu.mir @@ -0,0 +1,15 @@ +# RUN: llc -march=amdgcn -mcpu=gfx908 -o - -run-pass si-fix-sgpr-copies -verify-machineinstrs %s | FileCheck -check-prefix=GCN %s + +# GCN-LABEL: fold_acc_copy_into_valu +# GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY %0.sub0 +# GCN: %2:vgpr_32 = V_AND_B32_e32 [[COPY]], undef %3:vgpr_32, implicit $exec +--- +name: fold_acc_copy_into_valu +body: | + bb.0.entry: + + %0:areg_1024 = IMPLICIT_DEF + %1:sreg_32_xm0 = COPY %0.sub0 + %3:vgpr_32 = V_AND_B32_e32 %1, undef %2:vgpr_32, implicit $exec + +... 
diff --git a/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
index abcf3342fcf4..e7ad0bd0122e 100644
--- a/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
@@ -1,5 +1,5 @@
 ; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERR %s
-; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=GCN %s

 ; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy
 ; GCN: ; illegal copy v1 to s9
@@ -43,7 +43,8 @@ define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 {
 }

 ; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy
-; GCN: ; illegal copy a1 to s9
+; GCN: v_accvgpr_read_b32 [[COPY1:v[0-9]+]], a1
+; GCN: ; illegal copy [[COPY1]] to s9
 define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
 %agpr = call i32 asm sideeffect "; def $0", "=${a1}"()
 call void asm sideeffect "; use $0", "${s9}"(i32 %agpr)
@@ -51,7 +52,9 @@ define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_i32() #1 {
 }

 ; ERR: error: <unknown>:0:0: in function illegal_agpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy
-; GCN: ; illegal copy a[0:1] to s[10:11]
+; GCN-DAG: v_accvgpr_read_b32 v[[COPY1L:[0-9]+]], a0
+; GCN-DAG: v_accvgpr_read_b32 v[[COPY1H:[0-9]+]], a1
+; GCN: ; illegal copy v{{\[}}[[COPY1L]]:[[COPY1H]]] to s[10:11]
 define amdgpu_kernel void @illegal_agpr_to_sgpr_copy_v2i32() #1 {
 %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${a[0:1]}"()
 call void asm sideeffect "; use $0", "${s[10:11]}"(<2 x i32> %vgpr)

From 672abbceff6f4724d0367fb5f5be076620d40513 Mon Sep 17 00:00:00 2001
From: Daniel Sanders
Date: Wed, 2 Oct 2019 23:38:06 +0000
Subject: [PATCH 25/82] [gicombiner] Fix windows issue where single quotes in
 the command are passed through to tablegen

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373545 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index 8473ddfca4cb..103925d45d51 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -9,7 +9,7 @@ tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
 tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
 tablegen(LLVM AArch64GenGlobalISel.inc -gen-global-isel)
 tablegen(LLVM AArch64GenGICombiner.inc -gen-global-isel-combiner
--combiners='AArch64PreLegalizerCombinerHelper')
+-combiners="AArch64PreLegalizerCombinerHelper")
 tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter)
 tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering)

From 4af15128c956e7888621afee20a26bf915a70381 Mon Sep 17 00:00:00 2001
From: GN Sync Bot
Date: Thu, 3 Oct 2019 00:47:13 +0000
Subject: [PATCH 26/82] gn build: Merge r373538

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373550 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/gn/secondary/clang/lib/Driver/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/utils/gn/secondary/clang/lib/Driver/BUILD.gn
index 66d209717f68..a70c69fc1b8c 100644
--- a/utils/gn/secondary/clang/lib/Driver/BUILD.gn
+++ b/utils/gn/secondary/clang/lib/Driver/BUILD.gn
@@ -67,6 +67,7 @@ static_library("Driver") {
 "ToolChains/Haiku.cpp",
"ToolChains/Hexagon.cpp", "ToolChains/Hurd.cpp", + "ToolChains/InterfaceStubs.cpp", "ToolChains/Linux.cpp", "ToolChains/MSP430.cpp", "ToolChains/MSVC.cpp", From e85df0ba54bf7772656b55e8eb29bd3e09ed6952 Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Thu, 3 Oct 2019 01:04:42 +0000 Subject: [PATCH 27/82] [gicombiner] Add a CodeExpander to handle C++ fragments with variable expansion Summary: This will handle expansion of C++ fragments in the declarative combiner including custom predicates, and escapes into C++ to aid the migration effort. Reviewers: bogner, volkan Subscribers: mgorny, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D68288 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373551 91177308-0d34-0410-b5e6-96231b3b80d8 --- unittests/CMakeLists.txt | 3 +- unittests/TableGen/CMakeLists.txt | 11 ++ unittests/TableGen/CodeExpanderTest.cpp | 203 +++++++++++++++++++++ utils/TableGen/CMakeLists.txt | 3 + utils/TableGen/GICombinerEmitter.cpp | 5 + utils/TableGen/GlobalISel/CMakeLists.txt | 7 + utils/TableGen/GlobalISel/CodeExpander.cpp | 93 ++++++++++ utils/TableGen/GlobalISel/CodeExpander.h | 55 ++++++ utils/TableGen/GlobalISel/CodeExpansions.h | 43 +++++ 9 files changed, 422 insertions(+), 1 deletion(-) create mode 100644 unittests/TableGen/CMakeLists.txt create mode 100644 unittests/TableGen/CodeExpanderTest.cpp create mode 100644 utils/TableGen/GlobalISel/CMakeLists.txt create mode 100644 utils/TableGen/GlobalISel/CodeExpander.cpp create mode 100644 utils/TableGen/GlobalISel/CodeExpander.h create mode 100644 utils/TableGen/GlobalISel/CodeExpansions.h diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index 6bb2fb8eb923..9384bdad0434 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -31,8 +31,9 @@ add_subdirectory(Remarks) add_subdirectory(Passes) add_subdirectory(ProfileData) add_subdirectory(Support) -add_subdirectory(TextAPI) +add_subdirectory(TableGen) add_subdirectory(Target) +add_subdirectory(TextAPI) add_subdirectory(Transforms) add_subdirectory(XRay) add_subdirectory(tools) diff --git a/unittests/TableGen/CMakeLists.txt b/unittests/TableGen/CMakeLists.txt new file mode 100644 index 000000000000..d90955786f86 --- /dev/null +++ b/unittests/TableGen/CMakeLists.txt @@ -0,0 +1,11 @@ +set(LLVM_LINK_COMPONENTS + TableGen + Support + ) + +add_llvm_unittest(TableGenTests + CodeExpanderTest.cpp + $ + ) + +include_directories(${CMAKE_SOURCE_DIR}/utils/TableGen) diff --git a/unittests/TableGen/CodeExpanderTest.cpp b/unittests/TableGen/CodeExpanderTest.cpp new file mode 100644 index 000000000000..75b9b7373707 --- /dev/null +++ b/unittests/TableGen/CodeExpanderTest.cpp @@ -0,0 +1,203 @@ +//===- llvm/unittest/TableGen/CodeExpanderTest.cpp - Tests ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "GlobalISel/CodeExpander.h"
+#include "GlobalISel/CodeExpansions.h"
+
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TableGen/Error.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+static StringRef bufferize(StringRef Str) {
+ std::unique_ptr<MemoryBuffer> Buffer =
+ MemoryBuffer::getMemBufferCopy(Str, "TestBuffer");
+ StringRef StrBufferRef = Buffer->getBuffer();
+ SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+ return StrBufferRef;
+}
+
+class RAIIDiagnosticChecker {
+ std::string EmittedDiags;
+ raw_string_ostream OS;
+ std::vector<SMDiagnostic> Expected;
+ std::vector<SMDiagnostic> Received;
+
+public:
+ RAIIDiagnosticChecker() : OS(EmittedDiags) {
+ SrcMgr.setDiagHandler(handler, this);
+ }
+ ~RAIIDiagnosticChecker() {
+ SrcMgr.setDiagHandler(nullptr);
+ EXPECT_EQ(Received.size(), Expected.size());
+ for (unsigned i = 0; i < Received.size() && i < Expected.size(); ++i) {
+ EXPECT_EQ(Received[i].getLoc(), Expected[i].getLoc());
+ EXPECT_EQ(Received[i].getFilename(), Expected[i].getFilename());
+ EXPECT_EQ(Received[i].getKind(), Expected[i].getKind());
+ EXPECT_EQ(Received[i].getLineNo(), Expected[i].getLineNo());
+ EXPECT_EQ(Received[i].getColumnNo(), Expected[i].getColumnNo());
+ EXPECT_EQ(Received[i].getMessage(), Expected[i].getMessage());
+ EXPECT_EQ(Received[i].getLineContents(), Expected[i].getLineContents());
+ EXPECT_EQ(Received[i].getRanges(), Expected[i].getRanges());
+ }
+
+ if (testing::Test::HasFailure())
+ errs() << "Emitted diagnostic:\n" << OS.str();
+ }
+
+ void expect(SMDiagnostic D) { Expected.push_back(D); }
+
+ void diag(const SMDiagnostic &D) {
+ Received.push_back(D);
+ }
+
+ static void handler(const SMDiagnostic &D, void *Context) {
+ RAIIDiagnosticChecker *Self = static_cast<RAIIDiagnosticChecker *>(Context);
+ Self->diag(D);
+ SrcMgr.setDiagHandler(nullptr);
+ SrcMgr.PrintMessage(Self->OS, D);
+ SrcMgr.setDiagHandler(handler, Context);
+ };
+};
+
+TEST(CodeExpander, NoExpansions) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+ CodeExpansions Expansions;
+
+ RAIIDiagnosticChecker DiagChecker;
+ CodeExpander("No expansions", Expansions, SMLoc(), false).emit(OS);
+ EXPECT_EQ(OS.str(), "No expansions");
+}
+
+// Indentation is applied to all lines except the first
+TEST(CodeExpander, Indentation) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+ CodeExpansions Expansions;
+
+ RAIIDiagnosticChecker DiagChecker;
+ CodeExpander("No expansions\nsecond line\nthird line", Expansions, SMLoc(),
+ false, " ")
+ .emit(OS);
+ EXPECT_EQ(OS.str(), "No expansions\n second line\n third line");
+}
+
+// \ is an escape character that removes special meanings from the next
+// character.
+TEST(CodeExpander, Escape) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+ CodeExpansions Expansions;
+
+ RAIIDiagnosticChecker DiagChecker;
+ CodeExpander("\\\\\\a\\$", Expansions, SMLoc(), false).emit(OS);
+ EXPECT_EQ(OS.str(), "\\a$");
+}
+
+// $foo is not an expansion. It should warn though.
+TEST(CodeExpander, NotAnExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + StringRef In = bufferize(" $foo"); + CodeExpander(" $foo", Expansions, SMLoc::getFromPointer(In.data()), false) + .emit(OS); + EXPECT_EQ(OS.str(), " $foo"); + DiagChecker.expect(SMDiagnostic( + SrcMgr, SMLoc::getFromPointer(In.data() + 1), "TestBuffer", 1, 1, + SourceMgr::DK_Warning, "Assuming missing escape character", " $foo", {})); +} + +// \$foo is not an expansion but shouldn't warn as it's using the escape. +TEST(CodeExpander, EscapedNotAnExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("\\$foo", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "$foo"); +} + +// \${foo is not an expansion but shouldn't warn as it's using the escape. +TEST(CodeExpander, EscapedUnterminatedExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("\\${foo", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "${foo"); +} + +// \${foo is not an expansion but shouldn't warn as it's using the escape. +TEST(CodeExpander, EscapedExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("\\${foo}", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "${foo}"); +} + +// ${foo} is an undefined expansion and should error. +TEST(CodeExpander, UndefinedExpansion) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + Expansions.declare("bar", "expansion"); + + RAIIDiagnosticChecker DiagChecker; + CodeExpander("${foo}${bar}", Expansions, SMLoc(), false).emit(OS); + EXPECT_EQ(OS.str(), "expansion"); + DiagChecker.expect( + SMDiagnostic(SrcMgr, SMLoc(), "", 0, -1, SourceMgr::DK_Error, + "Attempting to expand an undeclared variable foo", "", {})); +} + +// ${foo} is an undefined expansion and should error. When given a valid +// location for the start of the buffer it should correctly point at the +// expansion being performed. +TEST(CodeExpander, UndefinedExpansionWithLoc) { + std::string Result; + raw_string_ostream OS(Result); + CodeExpansions Expansions; + Expansions.declare("bar", "expansion"); + + RAIIDiagnosticChecker DiagChecker; + StringRef In = bufferize("Padding ${foo}${bar}"); + CodeExpander(In, Expansions, SMLoc::getFromPointer(In.data()), false) + .emit(OS); + EXPECT_EQ(OS.str(), "Padding expansion"); + DiagChecker.expect(SMDiagnostic( + SrcMgr, SMLoc::getFromPointer(In.data() + 8), "TestBuffer", 1, 8, + SourceMgr::DK_Error, "Attempting to expand an undeclared variable foo", + "Padding ${foo}${bar}", {})); +} + +// ${bar is an unterminated expansion. Warn and implicitly terminate it. 
+TEST(CodeExpander, UnterminatedExpansion) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+ CodeExpansions Expansions;
+ Expansions.declare("bar", "expansion");
+
+ RAIIDiagnosticChecker DiagChecker;
+ StringRef In = bufferize(" ${bar");
+ CodeExpander(In, Expansions, SMLoc::getFromPointer(In.data()), false)
+ .emit(OS);
+ EXPECT_EQ(OS.str(), " expansion");
+ DiagChecker.expect(SMDiagnostic(SrcMgr, SMLoc::getFromPointer(In.data() + 1),
+ "TestBuffer", 1, 1, SourceMgr::DK_Warning,
+ "Unterminated expansion", " ${bar", {}));
+}
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index d97f9359f54d..8a79d5757b2a 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(GlobalISel)
+
 set(LLVM_LINK_COMPONENTS Support)

 add_tablegen(llvm-tblgen LLVM
@@ -49,5 +51,6 @@ add_tablegen(llvm-tblgen LLVM
 X86RecognizableInstr.cpp
 WebAssemblyDisassemblerEmitter.cpp
 CTagsEmitter.cpp
+ $<TARGET_OBJECTS:obj.LLVMTableGenGlobalISel>
 )

 set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
diff --git a/utils/TableGen/GICombinerEmitter.cpp b/utils/TableGen/GICombinerEmitter.cpp
index a85462b5aa89..c2b64bcfb7c5 100644
--- a/utils/TableGen/GICombinerEmitter.cpp
+++ b/utils/TableGen/GICombinerEmitter.cpp
@@ -26,6 +26,11 @@ cl::OptionCategory
 static cl::list<std::string>
     SelectedCombiners("combiners", cl::desc("Emit the specified combiners"),
                       cl::cat(GICombinerEmitterCat), cl::CommaSeparated);
+static cl::opt<bool> ShowExpansions(
+    "gicombiner-show-expansions",
+    cl::desc("Use C++ comments to indicate occurrence of code expansion"),
+    cl::cat(GICombinerEmitterCat));
+
 namespace {
 class GICombinerEmitter {
 StringRef Name;
diff --git a/utils/TableGen/GlobalISel/CMakeLists.txt b/utils/TableGen/GlobalISel/CMakeLists.txt
new file mode 100644
index 000000000000..cd6453482403
--- /dev/null
+++ b/utils/TableGen/GlobalISel/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(LLVM_LINK_COMPONENTS
+ Support
+ )
+
+llvm_add_library(LLVMTableGenGlobalISel OBJECT
+ CodeExpander.cpp
+ )
diff --git a/utils/TableGen/GlobalISel/CodeExpander.cpp b/utils/TableGen/GlobalISel/CodeExpander.cpp
new file mode 100644
index 000000000000..d59a9b8e3b65
--- /dev/null
+++ b/utils/TableGen/GlobalISel/CodeExpander.cpp
@@ -0,0 +1,93 @@
+//===- CodeExpander.cpp - Expand variables in a string --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Expand the variables in a string.
+// +//===----------------------------------------------------------------------===// + +#include "CodeExpander.h" +#include "CodeExpansions.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/TableGen/Error.h" + +using namespace llvm; + +void CodeExpander::emit(raw_ostream &OS) const { + StringRef Current = Code; + + while (!Current.empty()) { + size_t Pos = Current.find_first_of("$\n\\"); + if (Pos == StringRef::npos) { + OS << Current; + Current = ""; + continue; + } + + OS << Current.substr(0, Pos); + Current = Current.substr(Pos); + + if (Current.startswith("\n")) { + OS << "\n" << Indent; + Current = Current.drop_front(1); + continue; + } + + if (Current.startswith("\\$") || Current.startswith("\\\\")) { + OS << Current[1]; + Current = Current.drop_front(2); + continue; + } + + if (Current.startswith("\\")) { + Current = Current.drop_front(1); + continue; + } + + if (Current.startswith("${")) { + StringRef StartVar = Current; + Current = Current.drop_front(2); + StringRef Var; + std::tie(Var, Current) = Current.split("}"); + + // Warn if we split because no terminator was found. + StringRef EndVar = StartVar.drop_front(2 /* ${ */ + Var.size()); + if (EndVar.empty()) { + size_t LocOffset = StartVar.data() - Code.data(); + PrintWarning( + Loc.size() > 0 && Loc[0].isValid() + ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Unterminated expansion"); + } + + auto ValueI = Expansions.find(Var); + if (ValueI == Expansions.end()) { + size_t LocOffset = StartVar.data() - Code.data(); + PrintError(Loc.size() > 0 && Loc[0].isValid() + ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Attempting to expand an undeclared variable " + Var); + } + if (ShowExpansions) + OS << "/*$" << Var << "{*/"; + OS << Expansions.lookup(Var); + if (ShowExpansions) + OS << "/*}*/"; + continue; + } + + size_t LocOffset = Current.data() - Code.data(); + PrintWarning(Loc.size() > 0 && Loc[0].isValid() + ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) + : SMLoc(), + "Assuming missing escape character"); + OS << "$"; + Current = Current.drop_front(1); + } +} diff --git a/utils/TableGen/GlobalISel/CodeExpander.h b/utils/TableGen/GlobalISel/CodeExpander.h new file mode 100644 index 000000000000..bd6946de5925 --- /dev/null +++ b/utils/TableGen/GlobalISel/CodeExpander.h @@ -0,0 +1,55 @@ +//===- CodeExpander.h - Expand variables in a string ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file Expand the variables in a string. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_UTILS_TABLEGEN_CODEEXPANDER_H +#define LLVM_UTILS_TABLEGEN_CODEEXPANDER_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/SMLoc.h" + +namespace llvm { +class CodeExpansions; +class raw_ostream; + +/// Emit the given code with all '${foo}' placeholders expanded to their +/// replacements. 
+///
+/// It's an error to use an undefined expansion and expansion-like output that
+/// needs to be emitted verbatim can be escaped as '\${foo}'
+///
+/// The emitted code can be given a custom indent to enable both indentation by
+/// an arbitrary amount of whitespace and emission of the code as a comment.
+class CodeExpander {
+ StringRef Code;
+ const CodeExpansions &Expansions;
+ const ArrayRef<SMLoc> &Loc;
+ bool ShowExpansions;
+ StringRef Indent;
+
+public:
+ CodeExpander(StringRef Code, const CodeExpansions &Expansions,
+ const ArrayRef<SMLoc> &Loc, bool ShowExpansions,
+ StringRef Indent = " ")
+ : Code(Code), Expansions(Expansions), Loc(Loc),
+ ShowExpansions(ShowExpansions), Indent(Indent) {}
+
+ void emit(raw_ostream &OS) const;
+};
+
+inline raw_ostream &operator<<(raw_ostream &OS, const CodeExpander &Expander) {
+ Expander.emit(OS);
+ return OS;
+}
+} // end namespace llvm
+
+#endif // ifndef LLVM_UTILS_TABLEGEN_CODEEXPANDER_H
diff --git a/utils/TableGen/GlobalISel/CodeExpansions.h b/utils/TableGen/GlobalISel/CodeExpansions.h
new file mode 100644
index 000000000000..bb890ec8f57e
--- /dev/null
+++ b/utils/TableGen/GlobalISel/CodeExpansions.h
@@ -0,0 +1,43 @@
+//===- CodeExpansions.h - Record expansions for CodeExpander --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Record the expansions to use in a CodeExpander.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringMap.h"
+
+#ifndef LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H
+#define LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H
+namespace llvm {
+class CodeExpansions {
+public:
+ using const_iterator = StringMap<std::string>::const_iterator;
+
+protected:
+ StringMap<std::string> Expansions;
+
+public:
+ void declare(StringRef Name, StringRef Expansion) {
+ bool Inserted = Expansions.try_emplace(Name, Expansion).second;
+ assert(Inserted && "Declared variable twice");
+ (void)Inserted;
+ }
+
+ std::string lookup(StringRef Variable) const {
+ return Expansions.lookup(Variable);
+ }
+
+ const_iterator begin() const { return Expansions.begin(); }
+ const_iterator end() const { return Expansions.end(); }
+ const_iterator find(StringRef Variable) const {
+ return Expansions.find(Variable);
+ }
+};
+} // end namespace llvm
+#endif // ifndef LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H

From 3fd76aeb1f732150bbcd39b96131a6c5d56a9c2f Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Thu, 3 Oct 2019 01:32:51 +0000
Subject: [PATCH 28/82] gn build: (manually) merge r373551

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373554 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/gn/secondary/llvm/unittests/BUILD.gn | 1 +
 utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn | 13 +++++++++++++
 utils/gn/secondary/llvm/utils/TableGen/BUILD.gn | 1 +
 .../llvm/utils/TableGen/GlobalISel/BUILD.gn | 8 ++++++++
 4 files changed, 23 insertions(+)
 create mode 100644 utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn
 create mode 100644 utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn

diff --git a/utils/gn/secondary/llvm/unittests/BUILD.gn b/utils/gn/secondary/llvm/unittests/BUILD.gn
index f03456dca4a8..3d607dd77471 100644
--- a/utils/gn/secondary/llvm/unittests/BUILD.gn
+++ b/utils/gn/secondary/llvm/unittests/BUILD.gn
@@ -34,6 +34,7 @@ group("unittests") {
"Remarks:RemarksTests", "Support:SupportTests", "Support/DynamicLibrary:DynamicLibraryTests", + "TableGen:TableGenTests", "TextAPI:TextAPITests", "Transforms/IPO:IPOTests", "Transforms/Scalar:ScalarTests", diff --git a/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn b/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn new file mode 100644 index 000000000000..3f128f8ce2aa --- /dev/null +++ b/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn @@ -0,0 +1,13 @@ +import("//llvm/utils/unittest/unittest.gni") + +unittest("TableGenTests") { + deps = [ + "//llvm/lib/Support", + "//llvm/lib/TableGen", + "//llvm/utils/TableGen/GlobalISel", + ] + include_dirs = [ "//llvm/utils/TableGen" ] + sources = [ + "CodeExpanderTest.cpp", + ] +} diff --git a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn index 9f5043faeed8..952b2f916062 100644 --- a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn +++ b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn @@ -4,6 +4,7 @@ executable("llvm-tblgen") { "//llvm/lib/MC", "//llvm/lib/Support", "//llvm/lib/TableGen", + "//llvm/utils/TableGen/GlobalISel", ] sources = [ "AsmMatcherEmitter.cpp", diff --git a/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn b/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn new file mode 100644 index 000000000000..fe703cf9cad8 --- /dev/null +++ b/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn @@ -0,0 +1,8 @@ +source_set("GlobalISel") { + deps = [ + "//llvm/lib/Support", + ] + sources = [ + "CodeExpander.cpp", + ] +} From 65e15736cd3b2b20d3426a87ea9bff2d6edab743 Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Thu, 3 Oct 2019 01:49:04 +0000 Subject: [PATCH 29/82] [gicombiner] Make rL373551 compatible with older cmakes Newer cmakes appear to be more flexible w.r.t object libraries. 
Convert to a static library so that it works with older cmakes too

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373555 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/TableGen/CMakeLists.txt        | 3 +--
 utils/TableGen/CMakeLists.txt            | 2 +-
 utils/TableGen/GlobalISel/CMakeLists.txt | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/unittests/TableGen/CMakeLists.txt b/unittests/TableGen/CMakeLists.txt
index d90955786f86..47bde04cad0d 100644
--- a/unittests/TableGen/CMakeLists.txt
+++ b/unittests/TableGen/CMakeLists.txt
@@ -5,7 +5,6 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(TableGenTests
   CodeExpanderTest.cpp
-  $<TARGET_OBJECTS:obj.LLVMTableGenGlobalISel>
   )
-
 include_directories(${CMAKE_SOURCE_DIR}/utils/TableGen)
+target_link_libraries(TableGenTests PRIVATE LLVMTableGenGlobalISel)
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 8a79d5757b2a..77ef764d2f22 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -51,6 +51,6 @@ add_tablegen(llvm-tblgen LLVM
   X86RecognizableInstr.cpp
   WebAssemblyDisassemblerEmitter.cpp
   CTagsEmitter.cpp
-  $<TARGET_OBJECTS:obj.LLVMTableGenGlobalISel>
   )
+target_link_libraries(llvm-tblgen PRIVATE LLVMTableGenGlobalISel)
 set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
diff --git a/utils/TableGen/GlobalISel/CMakeLists.txt b/utils/TableGen/GlobalISel/CMakeLists.txt
index cd6453482403..ea4713d8b29a 100644
--- a/utils/TableGen/GlobalISel/CMakeLists.txt
+++ b/utils/TableGen/GlobalISel/CMakeLists.txt
@@ -2,6 +2,6 @@ set(LLVM_LINK_COMPONENTS
   Support
   )
-llvm_add_library(LLVMTableGenGlobalISel OBJECT
+llvm_add_library(LLVMTableGenGlobalISel STATIC
   CodeExpander.cpp
   )

From 7ae88cfbed859199863dc1148466545b5bbb92ea Mon Sep 17 00:00:00 2001
From: GN Sync Bot
Date: Thu, 3 Oct 2019 02:43:27 +0000
Subject: [PATCH 30/82] gn build: Merge r373556

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373558 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/gn/secondary/clang/lib/Driver/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/utils/gn/secondary/clang/lib/Driver/BUILD.gn b/utils/gn/secondary/clang/lib/Driver/BUILD.gn
index a70c69fc1b8c..66d209717f68 100644
--- a/utils/gn/secondary/clang/lib/Driver/BUILD.gn
+++ b/utils/gn/secondary/clang/lib/Driver/BUILD.gn
@@ -67,7 +67,6 @@ static_library("Driver") {
     "ToolChains/Haiku.cpp",
     "ToolChains/Hexagon.cpp",
     "ToolChains/Hurd.cpp",
-    "ToolChains/InterfaceStubs.cpp",
     "ToolChains/Linux.cpp",
     "ToolChains/MSP430.cpp",
     "ToolChains/MSVC.cpp",

From 66e58f01396306b3af0954ebb8fefcd0e540a370 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 3 Oct 2019 03:16:21 +0000
Subject: [PATCH 31/82] [X86] Remove a couple redundant isel patterns that
 look to have been copy/pasted from right above them. NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373559 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 2d3b8a556816..7030e3bf76ef 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5069,23 +5069,6 @@ let Predicates = [HasDQI, NoVLX] in {
               sub_xmm)>;
 }
 
-// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasDQI, NoVLX] in {
-  def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
-            (EXTRACT_SUBREG
-                (VPMULLQZrr
-                  (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
-                  (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
-                sub_ymm)>;
-
-  def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
-            (EXTRACT_SUBREG
-                (VPMULLQZrr
-                  (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
-                  (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
-                sub_xmm)>;
-}
-
 multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> {
   def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)),
             (EXTRACT_SUBREG

From 1653d09ad34f76596c46ca5d5caa069498f831d1 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 3 Oct 2019 03:16:27 +0000
Subject: [PATCH 32/82] [X86] Add broadcast load folding patterns to NoVLX
 VPMULLQ/VPMAXSQ/VPMAXUQ/VPMINSQ/VPMINUQ patterns.

More fixes for PR36191.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373560 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86InstrAVX512.td            | 38 +++++++--
 test/CodeGen/X86/avx512-arith.ll            | 94 +++++++++++++++++++++
 test/CodeGen/X86/masked_store_trunc_ssat.ll | 36 +++-----
 test/CodeGen/X86/masked_store_trunc_usat.ll | 18 ++--
 test/CodeGen/X86/vector-trunc-packus.ll     |  6 +-
 test/CodeGen/X86/vector-trunc-ssat.ll       | 12 +--
 6 files changed, 149 insertions(+), 55 deletions(-)

diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 7030e3bf76ef..65a9cb621742 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5060,6 +5060,12 @@ let Predicates = [HasDQI, NoVLX] in {
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
             sub_ymm)>;
+  def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))),
+            (EXTRACT_SUBREG
+                (VPMULLQZrmb
+                  (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+                  addr:$src2),
+                sub_ymm)>;
 
   def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
             (EXTRACT_SUBREG
@@ -5067,29 +5073,47 @@ let Predicates = [HasDQI, NoVLX] in {
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
               (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
             sub_xmm)>;
+  def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))),
+            (EXTRACT_SUBREG
+                (VPMULLQZrmb
+                  (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+                  addr:$src2),
+                sub_xmm)>;
 }
 
-multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> {
+multiclass avx512_min_max_lowering<string Instr, SDNode OpNode> {
   def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)),
             (EXTRACT_SUBREG
-              (Instr
+              (!cast<Instruction>(Instr#"rr")
                 (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
                 (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
               sub_ymm)>;
+  def : Pat<(v4i64 (OpNode (v4i64 VR256X:$src1), (v4i64 (X86VBroadcastld64 addr:$src2)))),
+            (EXTRACT_SUBREG
+              (!cast<Instruction>(Instr#"rmb")
+                (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+                addr:$src2),
+              sub_ymm)>;
 
   def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)),
             (EXTRACT_SUBREG
-              (Instr
+              (!cast<Instruction>(Instr#"rr")
                 (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
                 (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
               sub_xmm)>;
+  def : Pat<(v2i64 (OpNode (v2i64 VR128X:$src1), (v2i64 (X86VBroadcastld64 addr:$src2)))),
+            (EXTRACT_SUBREG
+              (!cast<Instruction>(Instr#"rmb")
+                (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+                addr:$src2),
+              sub_xmm)>;
 }
 
 let Predicates = [HasAVX512, NoVLX] in {
-  defm : avx512_min_max_lowering<VPMAXUQZrr, umax>;
-  defm : avx512_min_max_lowering<VPMINUQZrr, umin>;
-  defm : avx512_min_max_lowering<VPMAXSQZrr, smax>;
-  defm : avx512_min_max_lowering<VPMINSQZrr, smin>;
+  defm : avx512_min_max_lowering<"VPMAXUQZ", umax>;
+  defm : avx512_min_max_lowering<"VPMINUQZ", umin>;
+  defm : avx512_min_max_lowering<"VPMAXSQZ", smax>;
+  defm : avx512_min_max_lowering<"VPMINSQZ", smin>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 29793a7e0bc6..be88e3530a3c 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -190,6 +190,52 @@ define <4 x i64> @imulq256(<4 x i64> %y, <4 x i64> %x) {
   ret <4 x i64>%z
 }
 
+define <4 x i64> @imulq256_bcast(<4 x i64> %x) {
+; AVX512F-LABEL: imulq256_bcast:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337]
+; AVX512F-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX512F-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: imulq256_bcast:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337]
+; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: imulq256_bcast:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [1337,1337,1337,1337]
+; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
+; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: imulq256_bcast:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
+; AVX512DQ-NEXT:    vpmullq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
+; AVX512DQ-NEXT:    retq
+;
+; SKX-LABEL: imulq256_bcast:
+; SKX:       # %bb.0:
+; SKX-NEXT:    vpmullq {{.*}}(%rip){1to4}, %ymm0, %ymm0
+; SKX-NEXT:    retq
+  %z = mul <4 x i64> %x, <i64 1337, i64 1337, i64 1337, i64 1337>
+  ret <4 x i64>%z
+}
+
 define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
 ; AVX512F-LABEL: imulq128:
 ; AVX512F:       # %bb.0:
@@ -244,6 +290,54 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
   ret <2 x i64>%z
 }
 
+define <2 x i64> @imulq128_bcast(<2 x i64> %x) {
+; AVX512F-LABEL: imulq128_bcast:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512F-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512F-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: imulq128_bcast:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512VL-NEXT:    retq
+;
+; AVX512BW-LABEL: imulq128_bcast:
+; AVX512BW:       # %bb.0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm2
+; AVX512BW-NEXT:    vpsrlq $32, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsllq $32, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpaddq %xmm0, %xmm2, %xmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: imulq128_bcast:
+; AVX512DQ:       # %bb.0:
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [8086,8086]
+; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
+; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; SKX-LABEL: imulq128_bcast:
+; SKX:       # %bb.0:
+; SKX-NEXT:    vpmullq {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT:    retq
+  %z = mul <2 x i64> %x, <i64 8086, i64 8086>
+  ret <2 x i64>%z
+}
+
 define <8 x double> @mulpd512(<8 x double> %y, <8 x double> %x) {
 ; CHECK-LABEL: mulpd512:
 ; CHECK:       # %bb.0: # %entry
diff --git a/test/CodeGen/X86/masked_store_trunc_ssat.ll b/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 777d4d14e4e4..af413e85be36 100644
--- a/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -1719,10 +1719,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask
 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX512F-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX512F-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512F-NEXT:    vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
 ; AVX512F-NEXT:    vzeroupper
@@ -1744,10 +1742,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask
 ; AVX512BW-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; AVX512BW-NEXT:    kshiftlw $12, %k0, %k0
 ; AVX512BW-NEXT:    kshiftrw $12, %k0, %k1
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
-; AVX512BW-NEXT:    vpminsq %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX512BW-NEXT:    vpmaxsq %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqu32 %zmm0, (%rdi) {%k1}
 ; AVX512BW-NEXT:    vzeroupper
@@ -2027,10 +2023,8 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask
 ; AVX512F-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
 ; AVX512F-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
 ; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k0
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm1 =
[32767,32767,32767,32767] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -2361,10 +2353,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -2405,10 +2395,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper diff --git a/test/CodeGen/X86/masked_store_trunc_usat.ll b/test/CodeGen/X86/masked_store_trunc_usat.ll index 254f0cda48fc..1f47ed5d156d 100644 --- a/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -1465,8 +1465,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512F-NEXT: vzeroupper @@ -1487,8 +1486,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, <4 x i32>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlw $12, %k0, %k0 ; AVX512BW-NEXT: kshiftrw $12, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -1734,8 +1732,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, 
%zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -1776,8 +1773,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, <4 x i16>* %p, <4 x i32> %mask ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftld $28, %k0, %k0 ; AVX512BW-NEXT: kshiftrd $28, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [65535,65535,65535,65535] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqw %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper @@ -2028,8 +2024,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512F-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vptestmd %zmm1, %zmm1, %k0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] -; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512F-NEXT: kmovw %k0, %eax ; AVX512F-NEXT: testb $1, %al @@ -2070,8 +2065,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, <4 x i8>* %p, <4 x i32> %mask) ; AVX512BW-NEXT: vptestmd %zmm1, %zmm1, %k0 ; AVX512BW-NEXT: kshiftlq $60, %k0, %k0 ; AVX512BW-NEXT: kshiftrq $60, %k0, %k1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [255,255,255,255] -; AVX512BW-NEXT: vpminuq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminuq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqb %zmm0, %xmm0 ; AVX512BW-NEXT: vmovdqu8 %zmm0, (%rdi) {%k1} ; AVX512BW-NEXT: vzeroupper diff --git a/test/CodeGen/X86/vector-trunc-packus.ll b/test/CodeGen/X86/vector-trunc-packus.ll index a0306dc1cd4c..a5fa41c521f8 100644 --- a/test/CodeGen/X86/vector-trunc-packus.ll +++ b/test/CodeGen/X86/vector-trunc-packus.ll @@ -223,8 +223,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_packus_v4i64_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 @@ -243,8 +242,7 @@ define <4 x i32> @trunc_packus_v4i64_v4i32(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_packus_v4i64_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 diff --git a/test/CodeGen/X86/vector-trunc-ssat.ll b/test/CodeGen/X86/vector-trunc-ssat.ll index bb734bb8e329..a298382cfb01 100644 --- a/test/CodeGen/X86/vector-trunc-ssat.ll +++ b/test/CodeGen/X86/vector-trunc-ssat.ll @@ -233,10 +233,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512F-LABEL: trunc_ssat_v4i64_v4i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq 
{{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512F-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512F-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512F-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512F-NEXT: vzeroupper @@ -251,10 +249,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) { ; AVX512BW-LABEL: trunc_ssat_v4i64_v4i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647] -; AVX512BW-NEXT: vpminsq %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968] -; AVX512BW-NEXT: vpmaxsq %zmm1, %zmm0, %zmm0 +; AVX512BW-NEXT: vpminsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmaxsq {{.*}}(%rip){1to8}, %zmm0, %zmm0 ; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 ; AVX512BW-NEXT: vzeroupper From a0905148d0702b455787864f08c6aa81cafcacc0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 3 Oct 2019 05:30:02 +0000 Subject: [PATCH 33/82] [X86] Add DAG combine to turn (bitcast (vbroadcast_load)) into just a vbroadcast_load if the scalar size is the same. This improves broadcast load folding of i64 elements on 32-bit targets where i64 isn't legal. Previously we had to represent these as vXf64 vbroadcast_loads and a bitcast to vXi64. But we didn't have any isel patterns looking for that. This also allows us to remove or simplify some isel patterns that were looking for bitcasted vbroadcast_loads. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373566 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/X86/X86ISelLowering.cpp | 15 +++ lib/Target/X86/X86InstrAVX512.td | 105 +----------------- test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 34 +++--- .../X86/avx512dqvl-intrinsics-upgrade.ll | 27 ++--- .../X86/avx512ifma-intrinsics-upgrade.ll | 34 +++--- test/CodeGen/X86/avx512ifma-intrinsics.ll | 34 +++--- .../X86/avx512vl-intrinsics-upgrade.ll | 28 ++--- .../X86/avx512vlvp2intersect-intrinsics.ll | 7 +- .../X86/avx512vp2intersect-intrinsics.ll | 3 +- 9 files changed, 83 insertions(+), 204 deletions(-) diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 466a33cb6c1e..088af6c5e3e5 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -35415,6 +35415,21 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, DAG.getBitcast(MVT::i16, N0.getOperand(0))); + // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT + // determines // the number of bits loaded. Remaining bits are zero. 
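+  // For example, on a 32-bit target where i64 is not a legal scalar type, a
+  // splatted i64 load is built as a vXf64 VBROADCAST_LOAD plus a bitcast to
+  // vXi64; folding the bitcast into an integer-typed VBROADCAST_LOAD lets
+  // the existing integer broadcast-load isel patterns match it directly.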
+  if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
+      VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
+    auto *BCast = cast<MemIntrinsicSDNode>(N0);
+    SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+    SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+    SDValue ResNode =
+        DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+                                VT.getVectorElementType(),
+                                BCast->getMemOperand());
+    DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+    return ResNode;
+  }
+
   // Since MMX types are special and don't usually play with other vector types,
   // it's better to handle them early to be sure we emit efficient code by
   // avoiding store-load conversions.
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 65a9cb621742..490d0c3048ff 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -5289,22 +5289,17 @@ multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode,
   // Register-broadcast logical operations.
-  def : Pat<(IntInfo.VT (OpNode _.RC:$src1,
-                         (bitconvert (_.VT (_.BroadcastLdFrag addr:$src2))))),
-            (!cast<Instruction>(InstrStr#rmb) _.RC:$src1, addr:$src2)>;
   def : Pat<(_.VT (vselect _.KRCWM:$mask,
                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1,
-                                           (bitconvert (_.VT
-                                            (_.BroadcastLdFrag addr:$src2)))))),
+                               (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
                   _.RC:$src0)),
             (!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
             _.RC:$src1, addr:$src2)>;
   def : Pat<(_.VT (vselect _.KRCWM:$mask,
                   (bitconvert (IntInfo.VT (OpNode _.RC:$src1,
-                                           (bitconvert (_.VT
-                                            (_.BroadcastLdFrag addr:$src2)))))),
+                               (IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
                   _.ImmAllZerosV)),
            (!cast<Instruction>(InstrStr#rmbkz) _.KRCWM:$mask,
            _.RC:$src1, addr:$src2)>;
@@ -11495,102 +11490,6 @@ defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SchedWriteVecALU,
 defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SchedWriteVecALU,
                                         avx512vl_i64_info>, VEX_W;
 
-// Patterns to fold bitcasted FP broadcasts.
-// FIXME: Need better DAG canonicalization.
-let Predicates = [HasVLX] in { - def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR128X:$src1, - (bc_v4i32 (v4f32 (X86VBroadcastld32 addr:$src3))), - VR128X:$src2, (i8 timm:$src4)), - (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR128X:$src1, VR128X:$src2, - (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - VR128X:$src2, VR128X:$src1, (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR128X:$src1, - (bc_v2i64 (v2f64 (X86VBroadcastld64 addr:$src3))), - VR128X:$src2, (i8 timm:$src4)), - (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR256X:$src1, - (bc_v8i32 (v8f32 (X86VBroadcastld32 addr:$src3))), - VR256X:$src2, (i8 timm:$src4)), - (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR256X:$src1, VR256X:$src2, - (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - VR256X:$src2, VR256X:$src1, (i8 timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR256X:$src1, - (bc_v4i64 (v4f64 (X86VBroadcastld64 addr:$src3))), - VR256X:$src2, (i8 timm:$src4)), - (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; -} - -let Predicates = [HasAVX512] in { - def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - VR512:$src2, VR512:$src1, (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR512:$src1, - (bc_v16i32 (v16f32 (X86VBroadcastld32 addr:$src3))), - VR512:$src2, (i8 timm:$src4)), - (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; - - def : Pat<(X86vpternlog VR512:$src1, VR512:$src2, - (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - timm:$src4)>; - def : Pat<(X86vpternlog 
(bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - VR512:$src2, VR512:$src1, (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG321_imm8 timm:$src4))>; - def : Pat<(X86vpternlog VR512:$src1, - (bc_v8i64 (v8f64 (X86VBroadcastld64 addr:$src3))), - VR512:$src2, (i8 timm:$src4)), - (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3, - (VPTERNLOG132_imm8 timm:$src4))>; -} - // Patterns to use VPTERNLOG for vXi16/vXi8 vectors. let Predicates = [HasVLX] in { def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3, diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index a526518c3fe6..e7f132bcdc67 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -2250,8 +2250,7 @@ define <8 x i64> @test_mask_add_epi64_rmb(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_add_epi64_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0xd4,0xc1] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xd4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_add_epi64_rmb: @@ -2269,10 +2268,9 @@ define <8 x i64> @test_mask_add_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> ; X86-LABEL: test_mask_add_epi64_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpaddq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xd4,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xd4,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -2293,10 +2291,9 @@ define <8 x i64> @test_mask_add_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) ; X86-LABEL: test_mask_add_epi64_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpaddq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xd4,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpaddq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xd4,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_add_epi64_rmbkz: @@ -2418,8 +2415,7 @@ define <8 x i64> @test_mask_sub_epi64_rmb(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_sub_epi64_rmb: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpsubq %zmm1, %zmm0, %zmm0 ## 
encoding: [0x62,0xf1,0xfd,0x48,0xfb,0xc1] +; X86-NEXT: vpsubq (%eax){1to8}, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x58,0xfb,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_sub_epi64_rmb: @@ -2437,10 +2433,9 @@ define <8 x i64> @test_mask_sub_epi64_rmbk(<8 x i64> %a, i64* %ptr_b, <8 x i64> ; X86-LABEL: test_mask_sub_epi64_rmbk: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpsubq %zmm2, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x49,0xfb,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpsubq (%eax){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x59,0xfb,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: retl ## encoding: [0xc3] ; @@ -2461,10 +2456,9 @@ define <8 x i64> @test_mask_sub_epi64_rmbkz(<8 x i64> %a, i64* %ptr_b, i8 %mask) ; X86-LABEL: test_mask_sub_epi64_rmbkz: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 ## encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpsubq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xc9,0xfb,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ## encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 ## encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpsubq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xd9,0xfb,0x00] ; X86-NEXT: retl ## encoding: [0xc3] ; ; X64-LABEL: test_mask_sub_epi64_rmbkz: diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll index 63402d801993..b645098582ff 100644 --- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll @@ -2011,8 +2011,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] -; X86-NEXT: vpmullq %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x58,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_512: @@ -2030,9 +2029,8 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x ; X86-LABEL: test_mask_mullo_epi64_rmbk_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %zmm2, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x49,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x59,0x40,0x08] ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1] ; X86-NEXT: 
retl # encoding: [0xc3] ; @@ -2053,9 +2051,8 @@ define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_512: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to8}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_512: @@ -2172,8 +2169,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] -; X86-NEXT: vpmullq %ymm1, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x28,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm0 # encoding: [0x62,0xf2,0xfd,0x38,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_256: @@ -2191,9 +2187,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x ; X86-LABEL: test_mask_mullo_epi64_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x29,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x39,0x40,0x08] ; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2214,9 +2209,8 @@ define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_256: @@ -2334,8 +2328,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) { ; X86-LABEL: test_mask_mullo_epi64_rmb_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] -; X86-NEXT: vpmullq %xmm1, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x08,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm0 # encoding: [0x62,0xf2,0xfd,0x18,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmb_128: @@ -2353,9 +2346,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x ; X86-LABEL: test_mask_mullo_epi64_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), 
%eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x09,0x40,0xca] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0x40,0x08] ; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -2376,9 +2368,8 @@ define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_mullo_epi64_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1] +; X86-NEXT: vpmullq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0x40,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_mullo_epi64_rmbkz_128: diff --git a/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll index 5b90bdb8311b..bf85814f3197 100644 --- a/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512ifma-intrinsics-upgrade.ll @@ -199,8 +199,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xb5,0xc2] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: @@ -236,8 +235,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6 ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 # encoding: [0x62,0xf2,0xed,0x48,0xb5,0xc1] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: @@ -276,10 +274,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 
{%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: @@ -319,10 +316,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: @@ -362,10 +358,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: @@ -405,10 +400,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(< ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xed,0xc9,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll index 077269fde95c..6884666a296c 100644 --- a/test/CodeGen/X86/avx512ifma-intrinsics.ll +++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll @@ -219,8 +219,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_bcast(<8 x i64> %x0, ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: 
[0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x48,0xb5,0xc2] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_bcast: @@ -256,8 +255,7 @@ define <8 x i64>@test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast(<8 x i6 ; X86-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 # encoding: [0x62,0xf2,0xed,0x48,0xb5,0xc1] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 # encoding: [0x62,0xf2,0xf5,0x58,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_vpmadd52h_uq_512_load_commute_bcast: @@ -298,10 +296,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x49,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_bcast: @@ -345,10 +342,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast(<8 ; X86-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} # encoding: [0x62,0xf2,0xed,0x49,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} # encoding: [0x62,0xf2,0xf5,0x59,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512_load_commute_bcast: @@ -392,10 +388,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast(<8 x i64> ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xc9,0xb5,0xc2] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: 
[0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_bcast: @@ -439,10 +434,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast(< ; X86-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %zmm2 # encoding: [0x62,0xf2,0xfd,0x48,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpmadd52huq %zmm1, %zmm2, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xed,0xc9,0xb5,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpmadd52huq (%eax){1to8}, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0xf5,0xd9,0xb5,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512_load_commute_bcast: diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 8684d1f568fd..233b9162c926 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -3922,10 +3922,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 ; X86-LABEL: test_mask_andnot_epi64_rmbk_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %xmm2, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x09,0xdf,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to2}, %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x19,0xdf,0x08] ; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -3946,10 +3945,9 @@ define <2 x i64> @test_mask_andnot_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_andnot_epi64_rmbkz_128: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x89,0xdf,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to2}, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0x99,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi64_rmbkz_128: @@ -4089,10 +4087,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 ; X86-LABEL: test_mask_andnot_epi64_rmbk_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: 
[0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm2 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x10] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %ymm2, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x29,0xdf,0xca] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to4}, %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0xfd,0x39,0xdf,0x08] ; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] ; X86-NEXT: retl # encoding: [0xc3] ; @@ -4113,10 +4110,9 @@ define <4 x i64> @test_mask_andnot_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 ; X86-LABEL: test_mask_andnot_epi64_rmbkz_256: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpbroadcastq (%eax), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0x08] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x08] -; X86-NEXT: kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8] -; X86-NEXT: vpandnq %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xa9,0xdf,0xc1] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovw %ecx, %k1 # encoding: [0xc5,0xf8,0x92,0xc9] +; X86-NEXT: vpandnq (%eax){1to4}, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0xfd,0xb9,0xdf,0x00] ; X86-NEXT: retl # encoding: [0xc3] ; ; X64-LABEL: test_mask_andnot_epi64_rmbkz_256: diff --git a/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll b/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll index c4c70fcb2b5c..fe3662d49aa5 100644 --- a/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll +++ b/test/CodeGen/X86/avx512vlvp2intersect-intrinsics.ll @@ -229,8 +229,7 @@ define void @test_mm256_2intersect_epi64_b(i64* nocapture readonly %a, i64* noca ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] ; X86-NEXT: vbroadcastsd (%esi), %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x06] -; X86-NEXT: vbroadcastsd (%edx), %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0x0a] -; X86-NEXT: vp2intersectq %ymm1, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x28,0x68,0xc1] +; X86-NEXT: vp2intersectq (%edx){1to4}, %ymm0, %k0 # encoding: [0x62,0xf2,0xff,0x38,0x68,0x02] ; X86-NEXT: kshiftlw $12, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0c] ; X86-NEXT: kshiftrw $12, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0c] ; X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2] @@ -535,9 +534,7 @@ define void @test_mm_2intersect_epi64_b(i64* nocapture readonly %a, i64* nocaptu ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi # encoding: [0x8b,0x74,0x24,0x08] ; X86-NEXT: vmovddup (%esi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x06] ; X86-NEXT: # xmm0 = mem[0,0] -; X86-NEXT: vmovddup (%edx), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0x0a] -; X86-NEXT: # xmm1 = mem[0,0] -; X86-NEXT: vp2intersectq %xmm1, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x08,0x68,0xc1] +; X86-NEXT: vp2intersectq (%edx){1to2}, %xmm0, %k0 # encoding: [0x62,0xf2,0xff,0x18,0x68,0x02] ; X86-NEXT: kshiftlw $14, %k0, %k2 # encoding: [0xc4,0xe3,0xf9,0x32,0xd0,0x0e] ; X86-NEXT: kshiftrw $14, %k2, %k2 # encoding: [0xc4,0xe3,0xf9,0x30,0xd2,0x0e] ; 
X86-NEXT: kmovw %k2, %edx # encoding: [0xc5,0xf8,0x93,0xd2] diff --git a/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll b/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll index 3e37c7c5b6ac..7e7a46db75ed 100644 --- a/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll +++ b/test/CodeGen/X86/avx512vp2intersect-intrinsics.ll @@ -199,8 +199,7 @@ define void @test_mm512_2intersect_epi64_b(i64* nocapture readonly %a, i64* noca ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx # encoding: [0x8b,0x54,0x24,0x04] ; X86-NEXT: vbroadcastsd (%edx), %zmm0 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x02] -; X86-NEXT: vbroadcastsd (%ecx), %zmm1 # encoding: [0x62,0xf2,0xfd,0x48,0x19,0x09] -; X86-NEXT: vp2intersectq %zmm1, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x48,0x68,0xc1] +; X86-NEXT: vp2intersectq (%ecx){1to8}, %zmm0, %k0 # encoding: [0x62,0xf2,0xff,0x58,0x68,0x01] ; X86-NEXT: kmovw %k1, %ecx # encoding: [0xc5,0xf8,0x93,0xc9] ; X86-NEXT: kmovw %k0, %edx # encoding: [0xc5,0xf8,0x93,0xd0] ; X86-NEXT: movb %dl, (%eax) # encoding: [0x88,0x10] From 118db2620f36a47ac3039ad9472293afc8bd2079 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 3 Oct 2019 05:46:08 +0000 Subject: [PATCH 34/82] AMDGPU/GlobalISel: Expand G_BITCAST legality git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373567 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 5 +- .../AMDGPU/GlobalISel/legalize-bitcast.mir | 102 ++++++++++++++++++ 2 files changed, 103 insertions(+), 4 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index b8b54a2ef1a5..8cf5a54177da 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -279,11 +279,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_, .scalarize(0); // TODO: Implement. getActionDefinitionsBuilder(G_BITCAST) - .legalForCartesianProduct({S32, V2S16}) - .legalForCartesianProduct({S64, V2S32, V4S16}) - .legalForCartesianProduct({V2S64, V4S32}) // Don't worry about the size constraint. - .legalIf(all(isPointer(0), isPointer(1))) + .legalIf(all(isRegisterType(0), isRegisterType(1))) // FIXME: Testing hack .legalForCartesianProduct({S16, LLT::vector(2, 8), }); diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir index 2c3995a49d1c..a7c62f74c216 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-bitcast.mir @@ -121,6 +121,36 @@ body: | $vgpr0_vgpr1 = COPY %1 ... +--- +name: test_bitcast_v2s64_to_v8s16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_bitcast_v2s64_to_v8s16 + ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s16>) = G_BITCAST [[COPY]](<2 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<8 x s16>) + %0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<8 x s16>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 +... 
+ +--- +name: test_bitcast_v8s16_to_v2s64 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + + ; CHECK-LABEL: name: test_bitcast_v8s16_to_v2s64 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<2 x s64>) = G_BITCAST [[COPY]](<8 x s16>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BITCAST]](<2 x s64>) + %0:_(<8 x s16>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(<2 x s64>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %1 +... + --- name: test_bitcast_p0_to_p1 body: | @@ -180,3 +210,75 @@ body: | %1:_(p999) = G_BITCAST %0 $vgpr0_vgpr1 = COPY %1 ... + +--- +name: test_bitcast_v4s64_to_v8s32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: test_bitcast_v4s64_to_v8s32 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s32>) = G_BITCAST [[COPY]](<4 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<8 x s32>) + %0:_(<4 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:_(<8 x s32>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 +... + +--- +name: test_bitcast_v8s32_to_v4s64 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + + ; CHECK-LABEL: name: test_bitcast_v8s32_to_v4s64 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<4 x s64>) = G_BITCAST [[COPY]](<8 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY [[BITCAST]](<4 x s64>) + %0:_(<8 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:_(<4 x s64>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %1 +... + +--- +name: test_bitcast_v8s64_to_v16s32 +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + + ; CHECK-LABEL: name: test_bitcast_v8s64_to_v16s32 + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY:%[0-9]+]]:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<16 x s32>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(<16 x s32>) = G_BITCAST %0 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %1 +... 
+
+---
+name: test_bitcast_v16s32_to_v8s64
+tracksRegLiveness: true
+
+body: |
+  bb.0:
+    liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+
+    ; CHECK-LABEL: name: test_bitcast_v16s32_to_v8s64
+    ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; CHECK: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s64>) = G_BITCAST [[COPY]](<16 x s32>)
+    ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST]](<8 x s64>)
+    %0:_(<16 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15
+    %1:_(<8 x s64>) = G_BITCAST %0
+    $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %1
+...

From 6e45beba86221def2571af2b4e3f00ea8b8f5643 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 3 Oct 2019 05:46:10 +0000
Subject: [PATCH 35/82] AMDGPU/GlobalISel: Don't re-get subtarget

It's already available in the class.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373568 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index badcd77aaef1..8aa296b1132d 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -662,9 +662,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
     MachineInstr &MI, MachineRegisterInfo &MRI,
     ArrayRef<unsigned> OpIndices) const {
-  MachineFunction *MF = MI.getParent()->getParent();
-  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
+  MachineFunction *MF = &B.getMF();
   MachineBasicBlock::iterator I(MI);
   MachineBasicBlock &MBB = *MI.getParent();
@@ -2126,8 +2124,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     return getDefaultMappingVOP(MI);
   case AMDGPU::G_UMULH:
   case AMDGPU::G_SMULH: {
-    if (MF.getSubtarget<GCNSubtarget>().hasScalarMulHiInsts() &&
-        isSALUMapping(MI))
+    if (Subtarget.hasScalarMulHiInsts() && isSALUMapping(MI))
       return getDefaultMappingSOP(MI);
     return getDefaultMappingVOP(MI);
   }
@@ -2301,7 +2298,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
         Op3Bank == AMDGPU::SGPRRegBankID &&
         (Size == 32 || (Size == 64 &&
                         (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
-                        MF.getSubtarget<GCNSubtarget>().hasScalarCompareEq64()));
+                        Subtarget.hasScalarCompareEq64()));
     unsigned Op0Bank = CanUseSCC ? AMDGPU::SCCRegBankID : AMDGPU::VCCRegBankID;

From d698e4446a2d8e683f7fe289b7d31f0780a38407 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 3 Oct 2019 06:18:45 +0000
Subject: [PATCH 36/82] [X86] Add test case for v8i64->v8i8 truncate with
 avx512 and prefer-vector-width/min-legal-vector-width=256. NFC

With vpmovqb, we should be able to do better here until we get
AVX512VBMI on Cannonlake/Icelake.
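For reference, the operation under test narrows each 64-bit lane to its
low 8 bits. A minimal scalar sketch of those semantics (illustrative
only, not part of the patch; a single VPMOVQB performs the same per-lane
truncation on a 512-bit input):

  #include <cstdint>

  // Scalar model of the <8 x i64> -> <8 x i8> truncate being tested.
  void trunc_v8i64_v8i8_ref(const int64_t in[8], int8_t out[8]) {
    for (int i = 0; i < 8; ++i)
      out[i] = static_cast<int8_t>(in[i]); // keep the low byte of each lane
  }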
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373569 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/min-legal-vector-width.ll | 31 ++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll
index deb261151402..d0bc67a4485e 100644
--- a/test/CodeGen/X86/min-legal-vector-width.ll
+++ b/test/CodeGen/X86/min-legal-vector-width.ll
@@ -828,6 +828,37 @@ define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector-
   ret <16 x i8> %b
 }

+define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
+; CHECK-AVX512-LABEL: trunc_v8i64_v8i8:
+; CHECK-AVX512:       # %bb.0:
+; CHECK-AVX512-NEXT:    vmovdqa (%rdi), %xmm0
+; CHECK-AVX512-NEXT:    vmovdqa 16(%rdi), %xmm1
+; CHECK-AVX512-NEXT:    vmovdqa 32(%rdi), %xmm2
+; CHECK-AVX512-NEXT:    vmovdqa 48(%rdi), %xmm3
+; CHECK-AVX512-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-AVX512-NEXT:    vpshufb %xmm4, %xmm3, %xmm3
+; CHECK-AVX512-NEXT:    vpshufb %xmm4, %xmm2, %xmm2
+; CHECK-AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; CHECK-AVX512-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; CHECK-AVX512-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
+; CHECK-AVX512-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
+; CHECK-AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; CHECK-AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
+; CHECK-AVX512-NEXT:    retq
+;
+; CHECK-VBMI-LABEL: trunc_v8i64_v8i8:
+; CHECK-VBMI:       # %bb.0:
+; CHECK-VBMI-NEXT:    vmovdqa (%rdi), %ymm1
+; CHECK-VBMI-NEXT:    vpbroadcastq {{.*#+}} ymm0 = [4048780183313844224,4048780183313844224,4048780183313844224,4048780183313844224]
+; CHECK-VBMI-NEXT:    vpermi2b 32(%rdi), %ymm1, %ymm0
+; CHECK-VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
+; CHECK-VBMI-NEXT:    vzeroupper
+; CHECK-VBMI-NEXT:    retq
+  %a = load <8 x i64>, <8 x i64>* %x
+  %b = trunc <8 x i64> %a to <8 x i8>
+  ret <8 x i8> %b
+}
+
 define <8 x i16> @trunc_v8i64_v8i16(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" {
 ; CHECK-LABEL: trunc_v8i64_v8i16:
 ; CHECK:       # %bb.0:

From fa63bf3e2024d850e4501d8b9a362b9b201e816a Mon Sep 17 00:00:00 2001
From: Clement Courbet
Date: Thu, 3 Oct 2019 07:56:56 +0000
Subject: [PATCH 37/82] [llvm-exegesis][NFC] Rename
 ExegesisTarget::decrementLoopCounterAndLoop()

Summary: To decrementLoopCounterAndJump, and explicitly take the jump target.
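A sketch of the flexibility this gives (illustrative only; Latch, Header
and State are hypothetical names, not part of the patch):

  // The back-edge target no longer has to be the decrementing block itself.
  ET.decrementLoopCounterAndJump(*Latch.MBB, *Header.MBB, State.getInstrInfo());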
Reviewers: gchatelet

Subscribers: tschuett, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68375

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373571 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/llvm-exegesis/lib/SnippetRepetitor.cpp | 3 ++-
 tools/llvm-exegesis/lib/Target.h | 3 ++-
 tools/llvm-exegesis/lib/X86/Target.cpp | 10 ++++++----
 3 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
index 954471659e59..860db92a81da 100644
--- a/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
+++ b/tools/llvm-exegesis/lib/SnippetRepetitor.cpp
@@ -79,7 +79,8 @@ class LoopSnippetRepetitor : public SnippetRepetitor {
     for (const auto &LiveIn : Entry.MBB->liveins())
       Loop.MBB->addLiveIn(LiveIn);
     Loop.addInstructions(Instructions);
-    ET.decrementLoopCounterAndLoop(*Loop.MBB, State.getInstrInfo());
+    ET.decrementLoopCounterAndJump(*Loop.MBB, *Loop.MBB,
+                                   State.getInstrInfo());

     // Set up the exit basic block.
     Loop.MBB->addSuccessor(Exit.MBB, llvm::BranchProbability::getZero());
diff --git a/tools/llvm-exegesis/lib/Target.h b/tools/llvm-exegesis/lib/Target.h
index 4b0c9d17dd7f..70313a7a2f7a 100644
--- a/tools/llvm-exegesis/lib/Target.h
+++ b/tools/llvm-exegesis/lib/Target.h
@@ -95,7 +95,8 @@ class ExegesisTarget {
   }

   // Adds the code to decrement the loop counter and
-  virtual void decrementLoopCounterAndLoop(MachineBasicBlock &MBB,
+  virtual void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
+                                           MachineBasicBlock &TargetMBB,
                                            const llvm::MCInstrInfo &MII) const {
     llvm_unreachable("decrementLoopCounterAndBranch() requires "
                      "getLoopCounterRegister() > 0");
diff --git a/tools/llvm-exegesis/lib/X86/Target.cpp b/tools/llvm-exegesis/lib/X86/Target.cpp
index bf008e8bbc7a..ce66610891d0 100644
--- a/tools/llvm-exegesis/lib/X86/Target.cpp
+++ b/tools/llvm-exegesis/lib/X86/Target.cpp
@@ -448,7 +448,8 @@ class ExegesisX86Target : public ExegesisTarget {
   void fillMemoryOperands(InstructionTemplate &IT, unsigned Reg,
                           unsigned Offset) const override;

-  void decrementLoopCounterAndLoop(MachineBasicBlock &MBB,
+  void decrementLoopCounterAndJump(MachineBasicBlock &MBB,
+                                   MachineBasicBlock &TargetMBB,
                                    const llvm::MCInstrInfo &MII) const override;

   std::vector<MCInst> setRegTo(const llvm::MCSubtargetInfo &STI,
@@ -558,14 +559,15 @@ void ExegesisX86Target::fillMemoryOperands(InstructionTemplate &IT,
   SetOp(MemOpIdx + 4, MCOperand::createReg(0)); // Segment
 }

-void ExegesisX86Target::decrementLoopCounterAndLoop(
-    MachineBasicBlock &MBB, const llvm::MCInstrInfo &MII) const {
+void ExegesisX86Target::decrementLoopCounterAndJump(
+    MachineBasicBlock &MBB, MachineBasicBlock &TargetMBB,
+    const llvm::MCInstrInfo &MII) const {
   BuildMI(&MBB, DebugLoc(), MII.get(X86::ADD64ri8))
       .addDef(kLoopCounterReg)
       .addUse(kLoopCounterReg)
       .addImm(-1);
   BuildMI(&MBB, DebugLoc(), MII.get(X86::JCC_1))
-      .addMBB(&MBB)
+      .addMBB(&TargetMBB)
       .addImm(X86::COND_NE);
 }

From 39010ac5225f7d7a71b231f2af76892e1b69fdd8 Mon Sep 17 00:00:00 2001
From: Sylvestre Ledru
Date: Thu, 3 Oct 2019 09:43:54 +0000
Subject: [PATCH 38/82] Update the FAQ: remove stuff related to the previous
 license + update info about the portability of LLVM.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373576 91177308-0d34-0410-b5e6-96231b3b80d8
---
 docs/FAQ.rst | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/docs/FAQ.rst b/docs/FAQ.rst
index 2c69abfdd0bc..1afba7557bd7 100644
--- a/docs/FAQ.rst
+++ b/docs/FAQ.rst
@@ -9,17 +9,10 @@ Frequently Asked Questions (FAQ)
 License
 =======

-Does the University of Illinois Open Source License really qualify as an "open source" license?
------------------------------------------------------------------------------------------------
-Yes, the license is `certified
-`_ by the Open Source
-Initiative (OSI).
-
-
 Can I modify LLVM source code and redistribute the modified source?
 -------------------------------------------------------------------
 Yes. The modified source distribution must retain the copyright notice and
-follow the three bulleted conditions listed in the `LLVM license
+follow the conditions listed in the `LLVM license
 `_.

@@ -41,10 +34,12 @@ the STL.
 How portable is the LLVM source code?
 -------------------------------------
 The LLVM source code should be portable to most modern Unix-like operating
-systems. Most of the code is written in standard C++ with operating system
+systems. LLVM also has excellent support on Windows systems.
+Most of the code is written in standard C++ with operating system
 services abstracted to a support library. The tools required to build and
 test LLVM have been ported to a plethora of platforms.

+
 What API do I use to store a value to one of the virtual registers in LLVM IR's SSA representation?
 ---------------------------------------------------------------------------------------------------

From ca10309b33be5a81096953d5323e61081228af65 Mon Sep 17 00:00:00 2001
From: Kristina Brooks
Date: Thu, 3 Oct 2019 10:48:37 +0000
Subject: [PATCH 39/82] Revert 373555: libLLVM+modules failure with CMake
 3.10.2

This reverts rL373555. I've sent an email out regarding the issue.
Commit on GitHub:
https://github.com/llvm/llvm-project/commit/45f682f47129c05414d4c5ae7be851772273978f

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373579 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/TableGen/CMakeLists.txt | 3 ++-
 utils/TableGen/CMakeLists.txt | 2 +-
 utils/TableGen/GlobalISel/CMakeLists.txt | 2 +-
 3 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/unittests/TableGen/CMakeLists.txt b/unittests/TableGen/CMakeLists.txt
index 47bde04cad0d..d90955786f86 100644
--- a/unittests/TableGen/CMakeLists.txt
+++ b/unittests/TableGen/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
 add_llvm_unittest(TableGenTests
   CodeExpanderTest.cpp
+  $<TARGET_OBJECTS:obj.LLVMTableGenGlobalISel>
   )
+
 include_directories(${CMAKE_SOURCE_DIR}/utils/TableGen)
-target_link_libraries(TableGenTests PRIVATE LLVMTableGenGlobalISel)
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 77ef764d2f22..8a79d5757b2a 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -51,6 +51,6 @@ add_tablegen(llvm-tblgen LLVM
   X86RecognizableInstr.cpp
   WebAssemblyDisassemblerEmitter.cpp
   CTagsEmitter.cpp
+  $<TARGET_OBJECTS:obj.LLVMTableGenGlobalISel>
   )
-target_link_libraries(llvm-tblgen PRIVATE LLVMTableGenGlobalISel)
 set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
diff --git a/utils/TableGen/GlobalISel/CMakeLists.txt b/utils/TableGen/GlobalISel/CMakeLists.txt
index ea4713d8b29a..cd6453482403 100644
--- a/utils/TableGen/GlobalISel/CMakeLists.txt
+++ b/utils/TableGen/GlobalISel/CMakeLists.txt
@@ -2,6 +2,6 @@ set(LLVM_LINK_COMPONENTS
   Support
   )

-llvm_add_library(LLVMTableGenGlobalISel STATIC
+llvm_add_library(LLVMTableGenGlobalISel OBJECT
   CodeExpander.cpp
   )

From 3a0e85180cc2a38fe7503e96944154360e702c7f Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Thu, 3 Oct 2019 10:53:10 +0000
Subject: [PATCH 40/82] [Alignment][NFC] Allow constexpr Align

Summary:
This patch is part of a series to introduce an Alignment type.
See this thread for context: http://lists.llvm.org/pipermail/llvm-dev/2019-July/133851.html
See this patch for the introduction of the type: https://reviews.llvm.org/D64790

Reviewers: courbet

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D68329

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373580 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Support/Alignment.h | 26 ++++++++++++++++++----
 include/llvm/Support/MathExtras.h | 9 ++++++++
 lib/Target/AArch64/AArch64StackTagging.cpp | 2 +-
 unittests/Support/AlignmentTest.cpp | 10 +++++++++
 unittests/Support/MathExtrasTest.cpp | 19 ++++++++++++++++
 5 files changed, 61 insertions(+), 5 deletions(-)

diff --git a/include/llvm/Support/Alignment.h b/include/llvm/Support/Alignment.h
index 3d8a4235b0e6..f94d7cdc9a14 100644
--- a/include/llvm/Support/Alignment.h
+++ b/include/llvm/Support/Alignment.h
@@ -58,10 +58,10 @@ struct Align {
   constexpr Align() = default;
   /// Do not perform checks in case of copy/move construct/assign, because the
   /// checks have been performed when building `Other`.
-  Align(const Align &Other) = default;
-  Align &operator=(const Align &Other) = default;
-  Align(Align &&Other) = default;
-  Align &operator=(Align &&Other) = default;
+  constexpr Align(const Align &Other) = default;
+  constexpr Align &operator=(const Align &Other) = default;
+  constexpr Align(Align &&Other) = default;
+  constexpr Align &operator=(Align &&Other) = default;

   explicit Align(uint64_t Value) {
     assert(Value > 0 && "Value must not be 0");
@@ -80,6 +80,24 @@ struct Align {
   /// would be better than
   /// `if (A > Align(1))`
   constexpr static const Align None() { return Align(); }
+
+  /// This function is useful when initializing constexpr Align constants.
+  /// e.g. static constexpr Align kAlign16 = Align::Constant<16>();
+  /// Most compilers (clang, gcc, icc) will be able to compute `ShiftValue`
+  /// at compile time with `Align::Align(uint64_t Value)` but to be
+  /// able to use Align as a constexpr constant use this method.
+  /// FIXME: When LLVM is C++17 ready `Align::Align(uint64_t Value)`
+  /// can be constexpr and we can dispatch between runtime (Log2_64) vs
+  /// compile time (CTLog2) versions using constexpr-if. Then this
+  /// function is no more necessary and we can add user defined literals
+  /// for convenience.
+  template <uint64_t kValue> constexpr static Align Constant() {
+    static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
+                  "Not a valid alignment");
+    Align A;
+    A.ShiftValue = CTLog2<kValue>();
+    return A;
+  }
 };

 /// Treats the value 0 as a 1, so Align is always at least 1.
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index 775d19a698f4..9570ae67a9d1 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -532,6 +532,15 @@ inline double Log2(double Value) {
 #endif
 }

+/// Return the compile time log base 2 of the specified Value.
+/// `kValue` has to be a power of two.
+template <uint64_t kValue> static constexpr inline uint8_t CTLog2() {
+  static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue),
+                "Value is not a valid power of 2");
+  return 1 + CTLog2<(kValue >> 1)>();
+}
+template <> constexpr inline uint8_t CTLog2<1>() { return 0; }
+
 /// Return the floor log base 2 of the specified value, -1 if the value is zero.
 /// (32 bit edition.)
 /// Ex. Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2
diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp
index 0c52711a8d7e..55c7afbd69f7 100644
--- a/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -62,7 +62,7 @@ static cl::opt<bool> ClMergeInit(
 static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
                                      cl::init(40), cl::Hidden);

-static const Align kTagGranuleSize = Align(16);
+static constexpr Align kTagGranuleSize = Align::Constant<16>();

 namespace {
diff --git a/unittests/Support/AlignmentTest.cpp b/unittests/Support/AlignmentTest.cpp
index 0b1435912b93..3d35a612b832 100644
--- a/unittests/Support/AlignmentTest.cpp
+++ b/unittests/Support/AlignmentTest.cpp
@@ -44,6 +44,16 @@ TEST(AlignmentTest, ValidCTors) {
   }
 }

+TEST(AlignmentTest, CompileTimeConstant) {
+  EXPECT_EQ(Align::Constant<1>(), Align(1));
+  EXPECT_EQ(Align::Constant<2>(), Align(2));
+  EXPECT_EQ(Align::Constant<4>(), Align(4));
+  EXPECT_EQ(Align::Constant<8>(), Align(8));
+  EXPECT_EQ(Align::Constant<16>(), Align(16));
+  EXPECT_EQ(Align::Constant<32>(), Align(32));
+  EXPECT_EQ(Align::Constant<64>(), Align(64));
+}
+
 TEST(AlignmentTest, CheckMaybeAlignHasValue) {
   EXPECT_TRUE(MaybeAlign(1));
   EXPECT_TRUE(MaybeAlign(1).hasValue());
diff --git a/unittests/Support/MathExtrasTest.cpp b/unittests/Support/MathExtrasTest.cpp
index 01c83c9e14d3..00d037ad110b 100644
--- a/unittests/Support/MathExtrasTest.cpp
+++ b/unittests/Support/MathExtrasTest.cpp
@@ -203,6 +203,25 @@ TEST(MathExtras, PowerOf2Floor) {
   EXPECT_EQ(4U, PowerOf2Floor(7U));
 }

+TEST(MathExtras, CTLog2) {
+  EXPECT_EQ(CTLog2<1ULL << 0>(), 0);
+  EXPECT_EQ(CTLog2<1ULL << 1>(), 1);
+  EXPECT_EQ(CTLog2<1ULL << 2>(), 2);
+  EXPECT_EQ(CTLog2<1ULL << 3>(), 3);
+  EXPECT_EQ(CTLog2<1ULL << 4>(), 4);
+  EXPECT_EQ(CTLog2<1ULL << 5>(), 5);
+  EXPECT_EQ(CTLog2<1ULL << 6>(), 6);
+  EXPECT_EQ(CTLog2<1ULL << 7>(), 7);
+  EXPECT_EQ(CTLog2<1ULL << 8>(), 8);
+  EXPECT_EQ(CTLog2<1ULL << 9>(), 9);
+  EXPECT_EQ(CTLog2<1ULL << 10>(), 10);
+  EXPECT_EQ(CTLog2<1ULL << 11>(), 11);
+  EXPECT_EQ(CTLog2<1ULL << 12>(), 12);
+  EXPECT_EQ(CTLog2<1ULL << 13>(), 13);
+  EXPECT_EQ(CTLog2<1ULL << 14>(), 14);
+  EXPECT_EQ(CTLog2<1ULL << 15>(), 15);
+}
+
 TEST(MathExtras, ByteSwap_32) {
   EXPECT_EQ(0x44332211u, ByteSwap_32(0x11223344));
   EXPECT_EQ(0xDDCCBBAAu, ByteSwap_32(0xAABBCCDD));

From ae2b7d76de7a3a9da17bedb40a2bc67566df83a7 Mon Sep 17 00:00:00 2001
From: Kristina Brooks
Date: Thu, 3 Oct 2019 11:04:48 +0000
Subject: [PATCH 41/82] Revert 373551 (CodeExpander.cpp CMake issue)

Fix buildbots and revert the CodeExpander commit.
(See http://lists.llvm.org/pipermail/llvm-commits/Week-of-Mon-20190930/699857.html )

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373581 91177308-0d34-0410-b5e6-96231b3b80d8
---
 unittests/CMakeLists.txt | 3 +-
 unittests/TableGen/CMakeLists.txt | 11 --
 unittests/TableGen/CodeExpanderTest.cpp | 203 ---------------------
 utils/TableGen/CMakeLists.txt | 3 -
 utils/TableGen/GICombinerEmitter.cpp | 5 -
 utils/TableGen/GlobalISel/CMakeLists.txt | 7 -
 utils/TableGen/GlobalISel/CodeExpander.cpp | 93 ----------
 utils/TableGen/GlobalISel/CodeExpander.h | 55 ------
 utils/TableGen/GlobalISel/CodeExpansions.h | 43 -----
 9 files changed, 1 insertion(+), 422 deletions(-)
 delete mode 100644 unittests/TableGen/CMakeLists.txt
 delete mode 100644 unittests/TableGen/CodeExpanderTest.cpp
 delete mode 100644 utils/TableGen/GlobalISel/CMakeLists.txt
 delete mode 100644 utils/TableGen/GlobalISel/CodeExpander.cpp
 delete mode 100644 utils/TableGen/GlobalISel/CodeExpander.h
 delete mode 100644 utils/TableGen/GlobalISel/CodeExpansions.h

diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 9384bdad0434..6bb2fb8eb923 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -31,9 +31,8 @@ add_subdirectory(Remarks)
 add_subdirectory(Passes)
 add_subdirectory(ProfileData)
 add_subdirectory(Support)
-add_subdirectory(TableGen)
-add_subdirectory(Target)
 add_subdirectory(TextAPI)
+add_subdirectory(Target)
 add_subdirectory(Transforms)
 add_subdirectory(XRay)
 add_subdirectory(tools)
diff --git a/unittests/TableGen/CMakeLists.txt b/unittests/TableGen/CMakeLists.txt
deleted file mode 100644
index d90955786f86..000000000000
--- a/unittests/TableGen/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  TableGen
-  Support
-  )
-
-add_llvm_unittest(TableGenTests
-  CodeExpanderTest.cpp
-  $<TARGET_OBJECTS:obj.LLVMTableGenGlobalISel>
-  )
-
-include_directories(${CMAKE_SOURCE_DIR}/utils/TableGen)
diff --git a/unittests/TableGen/CodeExpanderTest.cpp b/unittests/TableGen/CodeExpanderTest.cpp
deleted file mode 100644
index 75b9b7373707..000000000000
--- a/unittests/TableGen/CodeExpanderTest.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-//===- llvm/unittest/TableGen/CodeExpanderTest.cpp - Tests ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "GlobalISel/CodeExpander.h"
-#include "GlobalISel/CodeExpansions.h"
-
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/TableGen/Error.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-
-static StringRef bufferize(StringRef Str) {
-  std::unique_ptr<MemoryBuffer> Buffer =
-      MemoryBuffer::getMemBufferCopy(Str, "TestBuffer");
-  StringRef StrBufferRef = Buffer->getBuffer();
-  SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
-  return StrBufferRef;
-}
-
-class RAIIDiagnosticChecker {
-  std::string EmittedDiags;
-  raw_string_ostream OS;
-  std::vector<SMDiagnostic> Expected;
-  std::vector<SMDiagnostic> Received;
-
-public:
-  RAIIDiagnosticChecker() : OS(EmittedDiags) {
-    SrcMgr.setDiagHandler(handler, this);
-  }
-  ~RAIIDiagnosticChecker() {
-    SrcMgr.setDiagHandler(nullptr);
-    EXPECT_EQ(Received.size(), Expected.size());
-    for (unsigned i = 0; i < Received.size() && i < Expected.size(); ++i) {
-      EXPECT_EQ(Received[i].getLoc(), Expected[i].getLoc());
-      EXPECT_EQ(Received[i].getFilename(), Expected[i].getFilename());
-      EXPECT_EQ(Received[i].getKind(), Expected[i].getKind());
-      EXPECT_EQ(Received[i].getLineNo(), Expected[i].getLineNo());
-      EXPECT_EQ(Received[i].getColumnNo(), Expected[i].getColumnNo());
-      EXPECT_EQ(Received[i].getMessage(), Expected[i].getMessage());
-      EXPECT_EQ(Received[i].getLineContents(), Expected[i].getLineContents());
-      EXPECT_EQ(Received[i].getRanges(), Expected[i].getRanges());
-    }
-
-    if (testing::Test::HasFailure())
-      errs() << "Emitted diagnostic:\n" << OS.str();
-  }
-
-  void expect(SMDiagnostic D) { Expected.push_back(D); }
-
-  void diag(const SMDiagnostic &D) {
-    Received.push_back(D);
-  }
-
-  static void handler(const SMDiagnostic &D, void *Context) {
-    RAIIDiagnosticChecker *Self = static_cast<RAIIDiagnosticChecker *>(Context);
-    Self->diag(D);
-    SrcMgr.setDiagHandler(nullptr);
-    SrcMgr.PrintMessage(Self->OS, D);
-    SrcMgr.setDiagHandler(handler, Context);
-  };
-};
-
-TEST(CodeExpander, NoExpansions) {
-  std::string Result;
-  raw_string_ostream OS(Result);
-  CodeExpansions Expansions;
-
-  RAIIDiagnosticChecker DiagChecker;
-  CodeExpander("No expansions", Expansions, SMLoc(), false).emit(OS);
-  EXPECT_EQ(OS.str(), "No expansions");
-}
-
-// Indentation is applied to all lines except the first
-TEST(CodeExpander, Indentation) {
-  std::string Result;
-  raw_string_ostream OS(Result);
-  CodeExpansions Expansions;
-
-  RAIIDiagnosticChecker DiagChecker;
-  CodeExpander("No expansions\nsecond line\nthird line", Expansions, SMLoc(),
-               false, "  ")
-      .emit(OS);
-  EXPECT_EQ(OS.str(), "No expansions\n  second line\n  third line");
-}
-
-// \ is an escape character that removes special meanings from the next
-// character.
-TEST(CodeExpander, Escape) {
-  std::string Result;
-  raw_string_ostream OS(Result);
-  CodeExpansions Expansions;
-
-  RAIIDiagnosticChecker DiagChecker;
-  CodeExpander("\\\\\\a\\$", Expansions, SMLoc(), false).emit(OS);
-  EXPECT_EQ(OS.str(), "\\a$");
-}
-
-// $foo is not an expansion. It should warn though.
-TEST(CodeExpander, NotAnExpansion) { - std::string Result; - raw_string_ostream OS(Result); - CodeExpansions Expansions; - - RAIIDiagnosticChecker DiagChecker; - StringRef In = bufferize(" $foo"); - CodeExpander(" $foo", Expansions, SMLoc::getFromPointer(In.data()), false) - .emit(OS); - EXPECT_EQ(OS.str(), " $foo"); - DiagChecker.expect(SMDiagnostic( - SrcMgr, SMLoc::getFromPointer(In.data() + 1), "TestBuffer", 1, 1, - SourceMgr::DK_Warning, "Assuming missing escape character", " $foo", {})); -} - -// \$foo is not an expansion but shouldn't warn as it's using the escape. -TEST(CodeExpander, EscapedNotAnExpansion) { - std::string Result; - raw_string_ostream OS(Result); - CodeExpansions Expansions; - - RAIIDiagnosticChecker DiagChecker; - CodeExpander("\\$foo", Expansions, SMLoc(), false).emit(OS); - EXPECT_EQ(OS.str(), "$foo"); -} - -// \${foo is not an expansion but shouldn't warn as it's using the escape. -TEST(CodeExpander, EscapedUnterminatedExpansion) { - std::string Result; - raw_string_ostream OS(Result); - CodeExpansions Expansions; - - RAIIDiagnosticChecker DiagChecker; - CodeExpander("\\${foo", Expansions, SMLoc(), false).emit(OS); - EXPECT_EQ(OS.str(), "${foo"); -} - -// \${foo is not an expansion but shouldn't warn as it's using the escape. -TEST(CodeExpander, EscapedExpansion) { - std::string Result; - raw_string_ostream OS(Result); - CodeExpansions Expansions; - - RAIIDiagnosticChecker DiagChecker; - CodeExpander("\\${foo}", Expansions, SMLoc(), false).emit(OS); - EXPECT_EQ(OS.str(), "${foo}"); -} - -// ${foo} is an undefined expansion and should error. -TEST(CodeExpander, UndefinedExpansion) { - std::string Result; - raw_string_ostream OS(Result); - CodeExpansions Expansions; - Expansions.declare("bar", "expansion"); - - RAIIDiagnosticChecker DiagChecker; - CodeExpander("${foo}${bar}", Expansions, SMLoc(), false).emit(OS); - EXPECT_EQ(OS.str(), "expansion"); - DiagChecker.expect( - SMDiagnostic(SrcMgr, SMLoc(), "", 0, -1, SourceMgr::DK_Error, - "Attempting to expand an undeclared variable foo", "", {})); -} - -// ${foo} is an undefined expansion and should error. When given a valid -// location for the start of the buffer it should correctly point at the -// expansion being performed. -TEST(CodeExpander, UndefinedExpansionWithLoc) { - std::string Result; - raw_string_ostream OS(Result); - CodeExpansions Expansions; - Expansions.declare("bar", "expansion"); - - RAIIDiagnosticChecker DiagChecker; - StringRef In = bufferize("Padding ${foo}${bar}"); - CodeExpander(In, Expansions, SMLoc::getFromPointer(In.data()), false) - .emit(OS); - EXPECT_EQ(OS.str(), "Padding expansion"); - DiagChecker.expect(SMDiagnostic( - SrcMgr, SMLoc::getFromPointer(In.data() + 8), "TestBuffer", 1, 8, - SourceMgr::DK_Error, "Attempting to expand an undeclared variable foo", - "Padding ${foo}${bar}", {})); -} - -// ${bar is an unterminated expansion. Warn and implicitly terminate it. 
-TEST(CodeExpander, UnterminatedExpansion) {
-  std::string Result;
-  raw_string_ostream OS(Result);
-  CodeExpansions Expansions;
-  Expansions.declare("bar", "expansion");
-
-  RAIIDiagnosticChecker DiagChecker;
-  StringRef In = bufferize(" ${bar");
-  CodeExpander(In, Expansions, SMLoc::getFromPointer(In.data()), false)
-      .emit(OS);
-  EXPECT_EQ(OS.str(), " expansion");
-  DiagChecker.expect(SMDiagnostic(SrcMgr, SMLoc::getFromPointer(In.data() + 1),
-                                  "TestBuffer", 1, 1, SourceMgr::DK_Warning,
-                                  "Unterminated expansion", " ${bar", {}));
-}
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 8a79d5757b2a..d97f9359f54d 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -1,5 +1,3 @@
-add_subdirectory(GlobalISel)
-
 set(LLVM_LINK_COMPONENTS Support)

 add_tablegen(llvm-tblgen LLVM
@@ -51,6 +49,5 @@ add_tablegen(llvm-tblgen LLVM
   X86RecognizableInstr.cpp
   WebAssemblyDisassemblerEmitter.cpp
   CTagsEmitter.cpp
-  $<TARGET_OBJECTS:obj.LLVMTableGenGlobalISel>
   )
 set_target_properties(llvm-tblgen PROPERTIES FOLDER "Tablegenning")
diff --git a/utils/TableGen/GICombinerEmitter.cpp b/utils/TableGen/GICombinerEmitter.cpp
index c2b64bcfb7c5..a85462b5aa89 100644
--- a/utils/TableGen/GICombinerEmitter.cpp
+++ b/utils/TableGen/GICombinerEmitter.cpp
@@ -26,11 +26,6 @@ cl::OptionCategory
 static cl::list<std::string>
     SelectedCombiners("combiners", cl::desc("Emit the specified combiners"),
                       cl::cat(GICombinerEmitterCat), cl::CommaSeparated);
-static cl::opt<bool> ShowExpansions(
-    "gicombiner-show-expansions",
-    cl::desc("Use C++ comments to indicate occurence of code expansion"),
-    cl::cat(GICombinerEmitterCat));
-
 namespace {
 class GICombinerEmitter {
   StringRef Name;
diff --git a/utils/TableGen/GlobalISel/CMakeLists.txt b/utils/TableGen/GlobalISel/CMakeLists.txt
deleted file mode 100644
index cd6453482403..000000000000
--- a/utils/TableGen/GlobalISel/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-set(LLVM_LINK_COMPONENTS
-  Support
-  )
-
-llvm_add_library(LLVMTableGenGlobalISel OBJECT
-  CodeExpander.cpp
-  )
diff --git a/utils/TableGen/GlobalISel/CodeExpander.cpp b/utils/TableGen/GlobalISel/CodeExpander.cpp
deleted file mode 100644
index d59a9b8e3b65..000000000000
--- a/utils/TableGen/GlobalISel/CodeExpander.cpp
+++ /dev/null
@@ -1,93 +0,0 @@
-//===- CodeExpander.cpp - Expand variables in a string --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Expand the variables in a string.
-// -//===----------------------------------------------------------------------===// - -#include "CodeExpander.h" -#include "CodeExpansions.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/TableGen/Error.h" - -using namespace llvm; - -void CodeExpander::emit(raw_ostream &OS) const { - StringRef Current = Code; - - while (!Current.empty()) { - size_t Pos = Current.find_first_of("$\n\\"); - if (Pos == StringRef::npos) { - OS << Current; - Current = ""; - continue; - } - - OS << Current.substr(0, Pos); - Current = Current.substr(Pos); - - if (Current.startswith("\n")) { - OS << "\n" << Indent; - Current = Current.drop_front(1); - continue; - } - - if (Current.startswith("\\$") || Current.startswith("\\\\")) { - OS << Current[1]; - Current = Current.drop_front(2); - continue; - } - - if (Current.startswith("\\")) { - Current = Current.drop_front(1); - continue; - } - - if (Current.startswith("${")) { - StringRef StartVar = Current; - Current = Current.drop_front(2); - StringRef Var; - std::tie(Var, Current) = Current.split("}"); - - // Warn if we split because no terminator was found. - StringRef EndVar = StartVar.drop_front(2 /* ${ */ + Var.size()); - if (EndVar.empty()) { - size_t LocOffset = StartVar.data() - Code.data(); - PrintWarning( - Loc.size() > 0 && Loc[0].isValid() - ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) - : SMLoc(), - "Unterminated expansion"); - } - - auto ValueI = Expansions.find(Var); - if (ValueI == Expansions.end()) { - size_t LocOffset = StartVar.data() - Code.data(); - PrintError(Loc.size() > 0 && Loc[0].isValid() - ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) - : SMLoc(), - "Attempting to expand an undeclared variable " + Var); - } - if (ShowExpansions) - OS << "/*$" << Var << "{*/"; - OS << Expansions.lookup(Var); - if (ShowExpansions) - OS << "/*}*/"; - continue; - } - - size_t LocOffset = Current.data() - Code.data(); - PrintWarning(Loc.size() > 0 && Loc[0].isValid() - ? SMLoc::getFromPointer(Loc[0].getPointer() + LocOffset) - : SMLoc(), - "Assuming missing escape character"); - OS << "$"; - Current = Current.drop_front(1); - } -} diff --git a/utils/TableGen/GlobalISel/CodeExpander.h b/utils/TableGen/GlobalISel/CodeExpander.h deleted file mode 100644 index bd6946de5925..000000000000 --- a/utils/TableGen/GlobalISel/CodeExpander.h +++ /dev/null @@ -1,55 +0,0 @@ -//===- CodeExpander.h - Expand variables in a string ----------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file Expand the variables in a string. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_UTILS_TABLEGEN_CODEEXPANDER_H -#define LLVM_UTILS_TABLEGEN_CODEEXPANDER_H - -#include "llvm/ADT/ArrayRef.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/Support/SMLoc.h" - -namespace llvm { -class CodeExpansions; -class raw_ostream; - -/// Emit the given code with all '${foo}' placeholders expanded to their -/// replacements. 
-///
-/// It's an error to use an undefined expansion and expansion-like output that
-/// needs to be emitted verbatim can be escaped as '\${foo}'
-///
-/// The emitted code can be given a custom indent to enable both indentation by
-/// an arbitrary amount of whitespace and emission of the code as a comment.
-class CodeExpander {
-  StringRef Code;
-  const CodeExpansions &Expansions;
-  const ArrayRef<SMLoc> &Loc;
-  bool ShowExpansions;
-  StringRef Indent;
-
-public:
-  CodeExpander(StringRef Code, const CodeExpansions &Expansions,
-               const ArrayRef<SMLoc> &Loc, bool ShowExpansions,
-               StringRef Indent = "  ")
-      : Code(Code), Expansions(Expansions), Loc(Loc),
-        ShowExpansions(ShowExpansions), Indent(Indent) {}
-
-  void emit(raw_ostream &OS) const;
-};
-
-inline raw_ostream &operator<<(raw_ostream &OS, const CodeExpander &Expander) {
-  Expander.emit(OS);
-  return OS;
-}
-} // end namespace llvm
-
-#endif // ifndef LLVM_UTILS_TABLEGEN_CODEEXPANDER_H
diff --git a/utils/TableGen/GlobalISel/CodeExpansions.h b/utils/TableGen/GlobalISel/CodeExpansions.h
deleted file mode 100644
index bb890ec8f57e..000000000000
--- a/utils/TableGen/GlobalISel/CodeExpansions.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//===- CodeExpansions.h - Record expansions for CodeExpander --------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file Record the expansions to use in a CodeExpander.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/StringMap.h"
-
-#ifndef LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H
-#define LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H
-namespace llvm {
-class CodeExpansions {
-public:
-  using const_iterator = StringMap<std::string>::const_iterator;
-
-protected:
-  StringMap<std::string> Expansions;
-
-public:
-  void declare(StringRef Name, StringRef Expansion) {
-    bool Inserted = Expansions.try_emplace(Name, Expansion).second;
-    assert(Inserted && "Declared variable twice");
-    (void)Inserted;
-  }
-
-  std::string lookup(StringRef Variable) const {
-    return Expansions.lookup(Variable);
-  }
-
-  const_iterator begin() const { return Expansions.begin(); }
-  const_iterator end() const { return Expansions.end(); }
-  const_iterator find(StringRef Variable) const {
-    return Expansions.find(Variable);
-  }
-};
-} // end namespace llvm
-#endif // ifndef LLVM_UTILS_TABLEGEN_CODEEXPANSIONS_H

From 594ed1cca96111b65b94d14f4ce06c21ccdb0aa6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 3 Oct 2019 11:21:46 +0000
Subject: [PATCH 42/82] Fix uninitialized variable warning.
NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373582 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index dd2d4ec118aa..4d2c9dad7099 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -11339,7 +11339,7 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
   SmallVector<uint8_t, 16> Opcodes;

   auto parseOne = [&]() -> bool {
-    const MCExpr *OE;
+    const MCExpr *OE = nullptr;
     SMLoc OpcodeLoc = getLexer().getLoc();
     if (check(getLexer().is(AsmToken::EndOfStatement) ||
               Parser.parseExpression(OE),

From 98389cce9dc0098dcef412fec023391868dd5566 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 3 Oct 2019 11:22:00 +0000
Subject: [PATCH 43/82] Fix uninitialized variable warning. NFCI

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373583 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 7af0de7f647e..4fb409f020d9 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -5276,7 +5276,7 @@ bool AArch64AsmParser::parseDirectiveInst(SMLoc Loc) {
   auto parseOp = [&]() -> bool {
     SMLoc L = getLoc();
-    const MCExpr *Expr;
+    const MCExpr *Expr = nullptr;
     if (check(getParser().parseExpression(Expr), L, "expected expression"))
       return true;
     const MCConstantExpr *Value = dyn_cast_or_null<MCConstantExpr>(Expr);

From 3e9f78170eb0c485d07e5006987835c1917e81f2 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Thu, 3 Oct 2019 11:33:50 +0000
Subject: [PATCH 44/82] [AArch64] Static (de)allocation of SVE stack objects.

Adds support to AArch64FrameLowering to allocate fixed-stack SVE objects.

The focus of this patch is purely to allow the stack frame to
allocate/deallocate space for scalable SVE objects. More dynamic
allocation (at compile-time, i.e. determining placement of SVE objects
on the stack), or resolving frame-index references that include
scalable-sized offsets, are left for subsequent patches.

SVE objects are allocated in the stack frame as a separate region below
the callee-save area, and above the alignment gap. This is done so that
the SVE objects can be accessed directly from the FP at (runtime)
VL-based offsets to benefit from using the VL-scaled addressing modes.

The layout looks as follows:

     +-------------+
     | stack arg   |
     +-------------+
     | Callee Saves|
     |   X29, X30  | (if available)
     |-------------| <- FP (if available)
     |     :       |
     |  SVE area   |
     |     :       |
     +-------------+
     |/////////////| alignment gap.
     |     :       |
     |  Stack objs |
     |     :       |
     +-------------+ <- SP after call and frame-setup

SVE and non-SVE stack objects are distinguished using different
StackIDs. The offsets for objects with TargetStackID::SVEVector should
be interpreted as purely scalable offsets within their respective SVE
region.
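Conceptually, such an offset is a fixed byte part plus a part that is
scaled by the runtime vector length. A simplified C++ sketch of that
bookkeeping (illustrative only, not the in-tree StackOffset class; the
names SVEOffset and resolve are made up here):

  #include <cassert>
  #include <cstdint>

  struct SVEOffset {
    int64_t Bytes = 0;         // fixed-size part of the offset
    int64_t ScalableBytes = 0; // multiplied by (VL in bits) / 128 at runtime
  };

  // Resolve to a concrete byte offset once the vector length is known.
  int64_t resolve(const SVEOffset &O, unsigned VLBits) {
    assert(VLBits >= 128 && VLBits % 128 == 0 && "invalid SVE vector length");
    return O.Bytes + O.ScalableBytes * (VLBits / 128);
  }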
Reviewers: thegameg, rovka, t.p.northover, efriedma, rengolin, greened

Reviewed By: efriedma

Differential Revision: https://reviews.llvm.org/D61437

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373585 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/CodeGen/MIRYamlMapping.h | 1 +
 include/llvm/CodeGen/TargetFrameLowering.h | 1 +
 lib/Target/AArch64/AArch64FrameLowering.cpp | 77 ++++++++++-
 lib/Target/AArch64/AArch64FrameLowering.h | 11 ++
 lib/Target/AArch64/AArch64InstrInfo.cpp | 31 ++++-
 .../AArch64/AArch64MachineFunctionInfo.h | 16 +++
 lib/Target/AArch64/AArch64StackOffset.h | 49 +++++--
 lib/Target/AMDGPU/SIFrameLowering.cpp | 2 +
 test/CodeGen/AArch64/framelayout-sve.mir | 121 ++++++++++++++++++
 unittests/Target/AArch64/TestStackOffset.cpp | 75 ++++++++++-
 10 files changed, 369 insertions(+), 15 deletions(-)
 create mode 100644 test/CodeGen/AArch64/framelayout-sve.mir

diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h
index 94e76a75e8da..069d0aa45095 100644
--- a/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/include/llvm/CodeGen/MIRYamlMapping.h
@@ -314,6 +314,7 @@ struct ScalarEnumerationTraits<TargetStackID::Value> {
   static void enumeration(yaml::IO &IO, TargetStackID::Value &ID) {
     IO.enumCase(ID, "default", TargetStackID::Default);
     IO.enumCase(ID, "sgpr-spill", TargetStackID::SGPRSpill);
+    IO.enumCase(ID, "sve-vec", TargetStackID::SVEVector);
     IO.enumCase(ID, "noalloc", TargetStackID::NoAlloc);
   }
 };
diff --git a/include/llvm/CodeGen/TargetFrameLowering.h b/include/llvm/CodeGen/TargetFrameLowering.h
index 284f7ba64dba..6e4a723b426f 100644
--- a/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/include/llvm/CodeGen/TargetFrameLowering.h
@@ -28,6 +28,7 @@ namespace TargetStackID {
   enum Value {
     Default = 0,
     SGPRSpill = 1,
+    SVEVector = 2,
     NoAlloc = 255
   };
 }
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 8357b763179d..c42c16bc1aad 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -55,6 +55,10 @@
 // | callee-saved fp/simd/SVE regs     |
 // |                                   |
 // |-----------------------------------|
+// |                                   |
+// | SVE stack objects                 |
+// |                                   |
+// |-----------------------------------|
 // |.empty.space.to.make.part.below....|
 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
 // |.the.standard.16-byte.alignment....| compile time; if present)
@@ -202,6 +206,12 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
   return DefaultSafeSPDisplacement;
 }

+/// Returns the size of the entire SVE stackframe (calleesaves + spills).
+static StackOffset getSVEStackSize(const MachineFunction &MF) {
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  return {(int64_t)AFI->getStackSizeSVE(), MVT::nxv1i8};
+}
+
 bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
   if (!EnableRedZone)
     return false;
@@ -214,7 +224,8 @@
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   unsigned NumBytes = AFI->getLocalStackSize();

-  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128);
+  return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
+           getSVEStackSize(MF));
 }

 /// hasFP - Return true if the specified function should have a dedicated frame
@@ -456,6 +467,11 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
   if (canUseRedZone(MF))
     return false;

+  // When there is an SVE area on the stack, always allocate the
+  // callee-saves and spills/locals separately.
+  if (getSVEStackSize(MF))
+    return false;
+
   return true;
 }
@@ -870,6 +886,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
   // Ideally it should match SP value after prologue.
   AFI->setTaggedBasePointerOffset(MFI.getStackSize());

+  const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
   // getStackSize() includes all the locals in its size calculation. We don't
   // include these locals when computing the stack size of a funclet, as they
   // are allocated in the parent's stack frame and accessed via the frame
@@ -880,6 +898,8 @@
       : (int)MFI.getStackSize();
   if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
     assert(!HasFP && "unexpected function without stack frame but with FP");
+    assert(!SVEStackSize &&
+           "unexpected function without stack frame but with SVE objects");
     // All of the stack allocation is for locals.
     AFI->setLocalStackSize(NumBytes);
     if (!NumBytes)
@@ -926,6 +946,7 @@
   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
   bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
   if (CombineSPBump) {
+    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                     {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false,
                     NeedsWinCFI, &HasWinCFI);
@@ -1083,6 +1104,9 @@
     NumBytes = 0;
   }

+  emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII,
+                  MachineInstr::FrameSetup);
+
   // Allocate space for the rest of the frame.
   if (NumBytes) {
     const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
@@ -1431,8 +1455,11 @@
         .setMIFlag(MachineInstr::FrameDestroy);
   }

+  const StackOffset &SVEStackSize = getSVEStackSize(MF);
+
   // If there is a single SP update, insert it before the ret and we're done.
   if (CombineSPBump) {
+    assert(!SVEStackSize && "Cannot combine SP bump with SVE");
     emitFrameOffset(MBB, MBB.getFirstTerminator(), DL, AArch64::SP, AArch64::SP,
                     {NumBytes + (int64_t)AfterCSRPopSize, MVT::i8}, TII,
                     MachineInstr::FrameDestroy, false, NeedsWinCFI, &HasWinCFI);
@@ -1446,6 +1473,12 @@
   NumBytes -= PrologueSaveSize;
   assert(NumBytes >= 0 && "Negative stack allocation size!?");

+  // Deallocate the SVE area.
+  if (SVEStackSize)
+    if (!AFI->isStackRealigned())
+      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize,
+                      TII, MachineInstr::FrameDestroy);
+
   if (!hasFP(MF)) {
     bool RedZone = canUseRedZone(MF);
     // If this was a redzone leaf function, we don't need to restore the
@@ -1595,6 +1628,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
   bool isCSR = !isFixed &&
                ObjectOffset >= -((int)AFI->getCalleeSavedStackSize());

+  const StackOffset &SVEStackSize = getSVEStackSize(MF);
+  if (SVEStackSize)
+    llvm_unreachable("Accessing frame indices in presence of SVE "
+                     "not yet supported");
+
   // Use frame pointer to reference fixed objects. Use it for locals if
   // there are VLAs or a dynamically realigned SP (and thus the SP isn't
   // reliable as a base). Make sure useFPForScavengingIndex() does the
@@ -2175,8 +2213,19 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
                  << ' ' << printReg(Reg, RegInfo);
              dbgs() << "\n";);

+  bool HasSVEStackObjects = [&MFI]() {
+    for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
+      if (MFI.getStackID(I) == TargetStackID::SVEVector &&
+          MFI.getObjectOffset(I) < 0)
+        return true;
+    // Note: We don't take allocatable stack objects into
+    // account yet, because allocation for those is not yet
+    // implemented.
+    return false;
+  }();
+
   // If any callee-saved registers are used, the frame cannot be eliminated.
-  bool CanEliminateFrame = SavedRegs.count() == 0;
+  bool CanEliminateFrame = (SavedRegs.count() == 0) && !HasSVEStackObjects;

   // The CSR spill slots have not been allocated yet, so estimateStackSize
   // won't include them.
@@ -2239,12 +2288,34 @@ bool AArch64FrameLowering::enableStackSlotScavenging(

 void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
     MachineFunction &MF, RegScavenger *RS) const {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
+         "Upwards growing stack unsupported");
+
+  // Process all fixed stack SVE objects.
+  int64_t Offset = 0;
+  for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) {
+    unsigned StackID = MFI.getStackID(I);
+    if (StackID == TargetStackID::SVEVector) {
+      int64_t FixedOffset = -MFI.getObjectOffset(I);
+      if (FixedOffset > Offset)
+        Offset = FixedOffset;
+    }
+  }
+
+  unsigned MaxAlign = getStackAlignment();
+  uint64_t SVEStackSize = alignTo(Offset, MaxAlign);
+
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  AFI->setStackSizeSVE(SVEStackSize);
+  assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+
   // If this function isn't doing Win64-style C++ EH, we don't need to do
   // anything.
if (!MF.hasEHFunclets()) return; const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); - MachineFrameInfo &MFI = MF.getFrameInfo(); WinEHFuncInfo &EHInfo = *MF.getWinEHFuncInfo(); MachineBasicBlock &MBB = MF.front(); diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h index 7ed20d24607f..99d868a95a70 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.h +++ b/lib/Target/AArch64/AArch64FrameLowering.h @@ -87,6 +87,17 @@ class AArch64FrameLowering : public TargetFrameLowering { int FI) const override; int getSEHFrameIndexOffset(const MachineFunction &MF, int FI) const; + bool isSupportedStackID(TargetStackID::Value ID) const override { + switch (ID) { + default: + return false; + case TargetStackID::Default: + case TargetStackID::SVEVector: + case TargetStackID::NoAlloc: + return true; + } + } + private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, unsigned StackBumpBytes) const; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 097a8ba0ae19..1cc3177b26a7 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3046,6 +3046,16 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, MaxEncoding = 0xfff; ShiftSize = 12; break; + case AArch64::ADDVL_XXI: + case AArch64::ADDPL_XXI: + MaxEncoding = 31; + ShiftSize = 0; + if (Offset < 0) { + MaxEncoding = 32; + Sign = -1; + Offset = -Offset; + } + break; default: llvm_unreachable("Unsupported opcode"); } @@ -3117,8 +3127,8 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, StackOffset Offset, const TargetInstrInfo *TII, MachineInstr::MIFlag Flag, bool SetNZCV, bool NeedsWinCFI, bool *HasWinCFI) { - int64_t Bytes; - Offset.getForFrameOffset(Bytes); + int64_t Bytes, NumPredicateVectors, NumDataVectors; + Offset.getForFrameOffset(Bytes, NumPredicateVectors, NumDataVectors); // First emit non-scalable frame offsets, or a simple 'mov'. if (Bytes || (!Offset && SrcReg != DestReg)) { @@ -3133,6 +3143,23 @@ void llvm::emitFrameOffset(MachineBasicBlock &MBB, NeedsWinCFI, HasWinCFI); SrcReg = DestReg; } + + assert(!(SetNZCV && (NumPredicateVectors || NumDataVectors)) && + "SetNZCV not supported with SVE vectors"); + assert(!(NeedsWinCFI && (NumPredicateVectors || NumDataVectors)) && + "WinCFI not supported with SVE vectors"); + + if (NumDataVectors) { + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumDataVectors, + AArch64::ADDVL_XXI, TII, Flag, NeedsWinCFI, nullptr); + SrcReg = DestReg; + } + + if (NumPredicateVectors) { + assert(DestReg != AArch64::SP && "Unaligned access to SP"); + emitFrameOffsetAdj(MBB, MBBI, DL, DestReg, SrcReg, NumPredicateVectors, + AArch64::ADDPL_XXI, TII, Flag, NeedsWinCFI, nullptr); + } } MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl( diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 0efeeb272ec1..a7d0a742573d 100644 --- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -95,6 +95,13 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { /// returned struct in a register. This field holds the virtual register into /// which the sret argument is passed. unsigned SRetReturnReg = 0; + /// SVE stack size (for predicates and data vectors) are maintained here + /// rather than in FrameInfo, as the placement and Stack IDs are target + /// specific. 
+ uint64_t StackSizeSVE = 0; + + /// HasCalculatedStackSizeSVE indicates whether StackSizeSVE is valid. + bool HasCalculatedStackSizeSVE = false; /// Has a value when it is known whether or not the function uses a /// redzone, and no value otherwise. @@ -131,6 +138,15 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { ArgumentStackToRestore = bytes; } + bool hasCalculatedStackSizeSVE() const { return HasCalculatedStackSizeSVE; } + + void setStackSizeSVE(uint64_t S) { + HasCalculatedStackSizeSVE = true; + StackSizeSVE = S; + } + + uint64_t getStackSizeSVE() const { return StackSizeSVE; } + bool hasStackFrame() const { return HasStackFrame; } void setHasStackFrame(bool s) { HasStackFrame = s; } diff --git a/lib/Target/AArch64/AArch64StackOffset.h b/lib/Target/AArch64/AArch64StackOffset.h index 5f5cdfa2fad1..13f12a6c9c30 100644 --- a/lib/Target/AArch64/AArch64StackOffset.h +++ b/lib/Target/AArch64/AArch64StackOffset.h @@ -35,32 +35,38 @@ namespace llvm { /// vector and a 64bit GPR. class StackOffset { int64_t Bytes; + int64_t ScalableBytes; explicit operator int() const; public: using Part = std::pair; - StackOffset() : Bytes(0) {} + StackOffset() : Bytes(0), ScalableBytes(0) {} StackOffset(int64_t Offset, MVT::SimpleValueType T) : StackOffset() { - assert(!MVT(T).isScalableVector() && "Scalable types not supported"); + assert(MVT(T).getSizeInBits() % 8 == 0 && + "Offset type is not a multiple of bytes"); *this += Part(Offset, T); } - StackOffset(const StackOffset &Other) : Bytes(Other.Bytes) {} + StackOffset(const StackOffset &Other) + : Bytes(Other.Bytes), ScalableBytes(Other.ScalableBytes) {} StackOffset &operator=(const StackOffset &) = default; StackOffset &operator+=(const StackOffset::Part &Other) { - assert(Other.second.getSizeInBits() % 8 == 0 && - "Offset type is not a multiple of bytes"); - Bytes += Other.first * (Other.second.getSizeInBits() / 8); + int64_t OffsetInBytes = Other.first * (Other.second.getSizeInBits() / 8); + if (Other.second.isScalableVector()) + ScalableBytes += OffsetInBytes; + else + Bytes += OffsetInBytes; return *this; } StackOffset &operator+=(const StackOffset &Other) { Bytes += Other.Bytes; + ScalableBytes += Other.ScalableBytes; return *this; } @@ -72,6 +78,7 @@ class StackOffset { StackOffset &operator-=(const StackOffset &Other) { Bytes -= Other.Bytes; + ScalableBytes -= Other.ScalableBytes; return *this; } @@ -88,16 +95,42 @@ class StackOffset { return Res; } + /// Returns the scalable part of the offset in bytes. + int64_t getScalableBytes() const { return ScalableBytes; } + /// Returns the non-scalable part of the offset in bytes. int64_t getBytes() const { return Bytes; } /// Returns the offset in parts to which this frame offset can be /// decomposed for the purpose of describing a frame offset. /// For non-scalable offsets this is simply its byte size. - void getForFrameOffset(int64_t &ByteSized) const { ByteSized = Bytes; } + void getForFrameOffset(int64_t &NumBytes, int64_t &NumPredicateVectors, + int64_t &NumDataVectors) const { + assert(isValid() && "Invalid frame offset"); + + NumBytes = Bytes; + NumDataVectors = 0; + NumPredicateVectors = ScalableBytes / 2; + // This method is used to get the offsets to adjust the frame offset. + // If the function requires ADDPL to be used and needs more than two ADDPL + // instructions, part of the offset is folded into NumDataVectors so that it + // uses ADDVL for part of it, reducing the number of ADDPL instructions. 
+ if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 || + NumPredicateVectors > 62) { + NumDataVectors = NumPredicateVectors / 8; + NumPredicateVectors -= NumDataVectors * 8; + } + } /// Returns whether the offset is known zero. - explicit operator bool() const { return Bytes; } + explicit operator bool() const { return Bytes || ScalableBytes; } + + bool isValid() const { + // The smallest scalable element supported by scaled SVE addressing + // modes are predicates, which are 2 scalable bytes in size. So the scalable + // byte offset must always be a multiple of 2. + return ScalableBytes % 2 == 0; + } }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 22f035e7f3e6..ed07ed100a19 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -673,6 +673,8 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const { case TargetStackID::NoAlloc: case TargetStackID::SGPRSpill: return true; + case TargetStackID::SVEVector: + return false; } llvm_unreachable("Invalid TargetStackID::Value"); } diff --git a/test/CodeGen/AArch64/framelayout-sve.mir b/test/CodeGen/AArch64/framelayout-sve.mir new file mode 100644 index 000000000000..9009a6a29bf6 --- /dev/null +++ b/test/CodeGen/AArch64/framelayout-sve.mir @@ -0,0 +1,121 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# +# Test allocation and deallocation of SVE objects on the stack, +# as well as using a combination of scalable and non-scalable +# offsets to access the SVE on the stack. +# +# SVE objects are allocated below the (scalar) callee saves, +# and above spills/locals and the alignment gap, e.g. +# +# +-------------+ +# | stack arg | +# +-------------+ <- SP before call +# | Callee Saves| +# | Frame record| (if available) +# |-------------| <- FP (if available) +# | SVE area | +# +-------------+ +# |/////////////| alignment gap. +# | : | +# | Stack objs | +# | : | +# +-------------+ <- SP after call and frame-setup +# +--- | + + define void @test_allocate_sve() nounwind { entry: unreachable } + define void @test_allocate_sve_gpr_callee_saves() nounwind { entry: unreachable } + define void @test_allocate_sve_gpr_realigned() nounwind { entry: unreachable } + +... +# +----------+ +# | %fixed- | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | stack.0 | // to be materialized with 2*ADDVL (<=> 2 * n * 16bytes) +# +----------+ +# | %stack.0 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_sve +# CHECK: stackSize: 16 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 + +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + RET_ReallyLR +--- +... 
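A note before the next test case, for readers following the offset logic: the ADDVL/ADDPL folding in getForFrameOffset() above is self-contained enough to restate. A minimal standalone sketch (names mirror this patch; the helper itself is illustrative and not part of the tree):

    // Mirrors StackOffset::getForFrameOffset() above: split a scalable byte
    // offset into ADDVL units (1 VL == 16 scalable bytes) and ADDPL units
    // (1 PL == 2 scalable bytes). PL counts outside [-64, 62] would need
    // more than two ADDPL instructions (the immediate encodes up to 31, or
    // 32 when negated), so whole VLs are peeled off into the ADDVL count.
    static void decomposeScalableOffset(int64_t ScalableBytes,
                                        int64_t &NumDataVectors,
                                        int64_t &NumPredicateVectors) {
      NumDataVectors = 0;
      NumPredicateVectors = ScalableBytes / 2;
      if (NumPredicateVectors % 8 == 0 || NumPredicateVectors < -64 ||
          NumPredicateVectors > 62) {
        NumDataVectors = NumPredicateVectors / 8;
        NumPredicateVectors -= NumDataVectors * 8;
      }
    }

For instance, 16 scalable bytes give NumDataVectors == 1 and no ADDPL, 18 scalable bytes give a single ADDPL #9, and 130 scalable bytes give ADDVL #8 plus ADDPL #1 -- the same triples the TestStackOffset.cpp cases further down check.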
+# +----------+ +# | x20, x21 | // callee saves +# +----------+ +# | %fixed- | // scalable objects +# | stack.0 | +# +----------+ +# | %stack.0 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_sve_gpr_callee_saves +# CHECK: stackSize: 32 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup STPXpre killed $x21, killed $x20, $sp, -2 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $x20 = IMPLICIT_DEF +# CHECK-NEXT: $x21 = IMPLICIT_DEF +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 16, 0 +# CHECK-NEXT: $sp, $x21, $x20 = frame-destroy LDPXpost $sp, 2 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve_gpr_callee_saves +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + $x20 = IMPLICIT_DEF + $x21 = IMPLICIT_DEF + RET_ReallyLR +--- +... +# +----------+ +# | lr, fp | // frame record +# +----------+ <- FP +# | %fixed- | // scalable objects +# | stack.0 | +# +----------+ +# |//////////| // alignment gap +# | %stack.0 | // not scalable +# +----------+ <- SP +# CHECK-LABEL: name: test_allocate_sve_gpr_realigned +# CHECK: stackSize: 32 + +# CHECK: bb.0.entry: +# CHECK-NEXT: $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 16, 0 +# CHECK-NEXT: $sp = ANDXri killed $[[TMP]] +# CHECK-NEXT: $sp = frame-destroy ADDXri $fp, 0, 0 +# CHECK-NEXT: $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 +# CHECK-NEXT: RET_ReallyLR +name: test_allocate_sve_gpr_realigned +fixedStack: + - { id: 0, stack-id: sve-vec, size: 18, alignment: 2, offset: -18 } +stack: + - { id: 0, stack-id: default, size: 16, alignment: 32 } +body: | + bb.0.entry: + RET_ReallyLR +--- diff --git a/unittests/Target/AArch64/TestStackOffset.cpp b/unittests/Target/AArch64/TestStackOffset.cpp index 240cec9f2d0b..c85135ef6605 100644 --- a/unittests/Target/AArch64/TestStackOffset.cpp +++ b/unittests/Target/AArch64/TestStackOffset.cpp @@ -20,6 +20,15 @@ TEST(StackOffset, MixedSize) { StackOffset C(2, MVT::v4i64); EXPECT_EQ(64, C.getBytes()); + + StackOffset D(2, MVT::nxv4i64); + EXPECT_EQ(64, D.getScalableBytes()); + + StackOffset E(2, MVT::v4i64); + EXPECT_EQ(0, E.getScalableBytes()); + + StackOffset F(2, MVT::nxv4i64); + EXPECT_EQ(0, F.getBytes()); } TEST(StackOffset, Add) { @@ -31,6 +40,11 @@ TEST(StackOffset, Add) { StackOffset D(1, MVT::i32); D += A; EXPECT_EQ(12, D.getBytes()); + + StackOffset E(1, MVT::nxv1i32); + StackOffset F = C + E; + EXPECT_EQ(12, F.getBytes()); + EXPECT_EQ(4, F.getScalableBytes()); } TEST(StackOffset, Sub) { @@ -42,6 +56,12 @@ TEST(StackOffset, Sub) { StackOffset D(1, MVT::i64); D -= A; EXPECT_EQ(0, D.getBytes()); + + C += StackOffset(2, MVT::nxv1i32); + StackOffset E = StackOffset(1, MVT::nxv1i32); + StackOffset F = C - E; + EXPECT_EQ(4, F.getBytes()); + EXPECT_EQ(4, F.getScalableBytes()); } TEST(StackOffset, isZero) { @@ -49,12 +69,63 @@ TEST(StackOffset, isZero) { StackOffset B(0, MVT::i32); EXPECT_TRUE(!A); EXPECT_TRUE(!(A + B)); + + StackOffset C(0, MVT::nxv1i32); + EXPECT_TRUE(!(A + C)); + + StackOffset D(1, MVT::nxv1i32); + EXPECT_FALSE(!(A + D)); +} + +TEST(StackOffset, isValid) { + EXPECT_FALSE(StackOffset(1, MVT::nxv8i1).isValid()); + EXPECT_TRUE(StackOffset(2, 
MVT::nxv8i1).isValid()); + +#ifndef NDEBUG +#ifdef GTEST_HAS_DEATH_TEST + EXPECT_DEATH(StackOffset(1, MVT::i1), + "Offset type is not a multiple of bytes"); + EXPECT_DEATH(StackOffset(1, MVT::nxv1i1), + "Offset type is not a multiple of bytes"); +#endif // defined GTEST_HAS_DEATH_TEST +#endif // not defined NDEBUG } TEST(StackOffset, getForFrameOffset) { StackOffset A(1, MVT::i64); StackOffset B(1, MVT::i32); - int64_t ByteSized; - (A + B).getForFrameOffset(ByteSized); + StackOffset C(1, MVT::nxv4i32); + + // If all offsets can be materialized with only ADDVL, + // make sure PLSized is 0. + int64_t ByteSized, VLSized, PLSized; + (A + B + C).getForFrameOffset(ByteSized, PLSized, VLSized); EXPECT_EQ(12, ByteSized); + EXPECT_EQ(1, VLSized); + EXPECT_EQ(0, PLSized); + + // If we need an ADDPL to materialize the offset, and the number of scalable + // bytes fits the ADDPL immediate, fold the scalable bytes to fit in PLSized. + StackOffset D(1, MVT::nxv16i1); + (C + D).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(0, VLSized); + EXPECT_EQ(9, PLSized); + + StackOffset E(4, MVT::nxv4i32); + StackOffset F(1, MVT::nxv16i1); + (E + F).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(0, VLSized); + EXPECT_EQ(33, PLSized); + + // If the offset requires an ADDPL instruction to materialize, and would + // require more than two instructions, decompose it into both + // ADDVL (n x 16 bytes) and ADDPL (n x 2 bytes) instructions. + StackOffset G(8, MVT::nxv4i32); + StackOffset H(1, MVT::nxv16i1); + (G + H).getForFrameOffset(ByteSized, PLSized, VLSized); + EXPECT_EQ(0, ByteSized); + EXPECT_EQ(8, VLSized); + EXPECT_EQ(1, PLSized); } From 9d6154ec05deeb3a521e07393738eac00ba02128 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 3 Oct 2019 11:57:39 +0000 Subject: [PATCH 45/82] gn build: Revert 373554 "gn build: (manually) merge r373551" r373551 was reverted in r373581. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373586 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/gn/secondary/llvm/unittests/BUILD.gn          |  1 -
 utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn | 13 -------------
 utils/gn/secondary/llvm/utils/TableGen/BUILD.gn     |  1 -
 .../llvm/utils/TableGen/GlobalISel/BUILD.gn         |  8 --------
 4 files changed, 23 deletions(-)
 delete mode 100644 utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn
 delete mode 100644 utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn

diff --git a/utils/gn/secondary/llvm/unittests/BUILD.gn b/utils/gn/secondary/llvm/unittests/BUILD.gn
index 3d607dd77471..f03456dca4a8 100644
--- a/utils/gn/secondary/llvm/unittests/BUILD.gn
+++ b/utils/gn/secondary/llvm/unittests/BUILD.gn
@@ -34,7 +34,6 @@ group("unittests") {
     "Remarks:RemarksTests",
     "Support:SupportTests",
     "Support/DynamicLibrary:DynamicLibraryTests",
-    "TableGen:TableGenTests",
     "TextAPI:TextAPITests",
     "Transforms/IPO:IPOTests",
     "Transforms/Scalar:ScalarTests",
diff --git a/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn b/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn
deleted file mode 100644
index 3f128f8ce2aa..000000000000
--- a/utils/gn/secondary/llvm/unittests/TableGen/BUILD.gn
+++ /dev/null
@@ -1,13 +0,0 @@
-import("//llvm/utils/unittest/unittest.gni")
-
-unittest("TableGenTests") {
-  deps = [
-    "//llvm/lib/Support",
-    "//llvm/lib/TableGen",
-    "//llvm/utils/TableGen/GlobalISel",
-  ]
-  include_dirs = [ "//llvm/utils/TableGen" ]
-  sources = [
-    "CodeExpanderTest.cpp",
-  ]
-}
diff --git a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
index 952b2f916062..9f5043faeed8 100644
--- a/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
+++ b/utils/gn/secondary/llvm/utils/TableGen/BUILD.gn
@@ -4,7 +4,6 @@ executable("llvm-tblgen") {
     "//llvm/lib/MC",
     "//llvm/lib/Support",
     "//llvm/lib/TableGen",
-    "//llvm/utils/TableGen/GlobalISel",
   ]
   sources = [
     "AsmMatcherEmitter.cpp",
diff --git a/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn b/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn
deleted file mode 100644
index fe703cf9cad8..000000000000
--- a/utils/gn/secondary/llvm/utils/TableGen/GlobalISel/BUILD.gn
+++ /dev/null
@@ -1,8 +0,0 @@
-source_set("GlobalISel") {
-  deps = [
-    "//llvm/lib/Support",
-  ]
-  sources = [
-    "CodeExpander.cpp",
-  ]
-}

From 9d51eb9dfcdd24998c791eaac1b1e44293feea2e Mon Sep 17 00:00:00 2001
From: Simon Atanasyan
Date: Thu, 3 Oct 2019 12:06:56 +0000
Subject: [PATCH 46/82] [llvm-readobj][mips] Do not show an error if GOT is missing

It is not an error if a file does not contain a GOT.
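A self-contained analogue of the behavioural change (hypothetical names and types; the real code in ELFDumper.cpp below uses findSectionByName and, previously, reportError):

    #include <iostream>
    #include <map>
    #include <string>

    using SectionMap = std::map<std::string, std::string>;

    // Dumping the GOT of a static binary: a missing .got section now means
    // "nothing to print" rather than a fatal error.
    void dumpStaticGot(const SectionMap &Sections) {
      auto It = Sections.find(".got");
      if (It == Sections.end())
        return; // was: error: '<file>': Cannot find .got section
      std::cout << "Primary GOT: " << It->second << '\n';
    }

    int main() {
      dumpStaticGot({});                      // no output, success
      dumpStaticGot({{".got", "<entries>"}}); // prints the GOT
    }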
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373587 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-readobj/mips-got.test | 8 --------
 tools/llvm-readobj/ELFDumper.cpp      | 2 +-
 2 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/test/tools/llvm-readobj/mips-got.test b/test/tools/llvm-readobj/mips-got.test
index e6e21ad6aca2..6fc162ae5c6a 100644
--- a/test/tools/llvm-readobj/mips-got.test
+++ b/test/tools/llvm-readobj/mips-got.test
@@ -1,5 +1,3 @@
-RUN: not llvm-readobj --mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \
-RUN:   FileCheck %s -DFILE=%p/Inputs/relocs.obj.elf-mips -check-prefix GOT-OBJ
 RUN: llvm-readobj --mips-plt-got %p/Inputs/dynamic-table-exe.mips | \
 RUN:   FileCheck %s -check-prefix GOT-EXE
 RUN: llvm-readobj --mips-plt-got %p/Inputs/dynamic-table-so.mips | \
@@ -11,8 +9,6 @@ RUN:   FileCheck %s -check-prefix GOT-EMPTY
 RUN: llvm-readobj --mips-plt-got %p/Inputs/got-static.exe.mips | \
 RUN:   FileCheck %s -check-prefix GOT-STATIC

-RUN: not llvm-readelf --mips-plt-got %p/Inputs/relocs.obj.elf-mips 2>&1 | \
-RUN:   FileCheck %s -DFILE=%p/Inputs/relocs.obj.elf-mips -check-prefix GNU-GOT-OBJ
 RUN: llvm-readelf --mips-plt-got %p/Inputs/dynamic-table-exe.mips | \
 RUN:   FileCheck %s --strict-whitespace -check-prefix GNU-GOT-EXE
 RUN: llvm-readelf --mips-plt-got %p/Inputs/dynamic-table-so.mips | \
@@ -24,8 +20,6 @@ RUN:   FileCheck %s --strict-whitespace -check-prefix GNU-GOT-EMPTY
 RUN: llvm-readelf --mips-plt-got %p/Inputs/got-static.exe.mips | \
 RUN:   FileCheck %s --strict-whitespace -check-prefix GNU-GOT-STATIC

-GOT-OBJ: error: '[[FILE]]': Cannot find .got section
-
 GOT-EXE:      Primary GOT {
 GOT-EXE-NEXT:   Canonical gp value: 0x418880
 GOT-EXE-NEXT:   Reserved entries [
@@ -380,8 +374,6 @@ GOT-STATIC-NEXT:   }
 GOT-STATIC-NEXT:  ]
 GOT-STATIC-NEXT: }

-GNU-GOT-OBJ: error: '[[FILE]]': Cannot find .got section
-
 GNU-GOT-EXE: Primary GOT:
 GNU-GOT-EXE-NEXT:  Canonical gp value: 00418880

diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index 1b3e8f4851df..2a5918486853 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -2315,7 +2315,7 @@ MipsGOTParser<ELFT>::MipsGOTParser(const ELFO *Obj, StringRef FileName,
   if (IsStatic) {
     GotSec = findSectionByName(*Obj, FileName, ".got");
     if (!GotSec)
-      reportError(createError("Cannot find .got section"), FileName);
+      return;

     ArrayRef<uint8_t> Content =
         unwrapOrError(FileName, Obj->getSectionContents(GotSec));

From 65819da738997845049c90bb85b6e85ebe72b649 Mon Sep 17 00:00:00 2001
From: Simon Atanasyan
Date: Thu, 3 Oct 2019 12:07:07 +0000
Subject: [PATCH 47/82] [llvm-readobj][mips] Display MIPS specific info under --arch-specific flag

Old options `--mips-plt-got`, `--mips-abi-flags`, `--mips-reginfo`, and
`--mips-options` will be deleted in a separate patch.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373588 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../llvm-readobj/mips-got-overlapped.test | 4 ++-- test/tools/llvm-readobj/mips-got.test | 20 +++++++++---------- test/tools/llvm-readobj/mips-options-sec.test | 2 +- test/tools/llvm-readobj/mips-plt.test | 4 ++-- test/tools/llvm-readobj/mips-reginfo.test | 2 +- tools/llvm-readobj/llvm-readobj.cpp | 9 ++++++++- 6 files changed, 24 insertions(+), 17 deletions(-) diff --git a/test/tools/llvm-readobj/mips-got-overlapped.test b/test/tools/llvm-readobj/mips-got-overlapped.test index 85c4fe2d67c1..881c63b79a4f 100644 --- a/test/tools/llvm-readobj/mips-got-overlapped.test +++ b/test/tools/llvm-readobj/mips-got-overlapped.test @@ -1,9 +1,9 @@ -# Check that llvm-readobj --mips-plt-got correctly shows .got section +# Check that llvm-readobj -A correctly shows .got section # content if there are some other zero-sized sections with the same # address as the .got. got-over.exe.elf-mips has zero-sized .data # section at the same offset .got section. -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-over.exe.elf-mips | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/got-over.exe.elf-mips | FileCheck %s GOT-OBJ: Cannot find PLTGOT dynamic table tag. diff --git a/test/tools/llvm-readobj/mips-got.test b/test/tools/llvm-readobj/mips-got.test index 6fc162ae5c6a..8ed35d4b68e2 100644 --- a/test/tools/llvm-readobj/mips-got.test +++ b/test/tools/llvm-readobj/mips-got.test @@ -1,23 +1,23 @@ -RUN: llvm-readobj --mips-plt-got %p/Inputs/dynamic-table-exe.mips | \ +RUN: llvm-readobj -A %p/Inputs/dynamic-table-exe.mips | \ RUN: FileCheck %s -check-prefix GOT-EXE -RUN: llvm-readobj --mips-plt-got %p/Inputs/dynamic-table-so.mips | \ +RUN: llvm-readobj -A %p/Inputs/dynamic-table-so.mips | \ RUN: FileCheck %s -check-prefix GOT-SO -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-tls.so.elf-mips64el | \ +RUN: llvm-readobj -A %p/Inputs/got-tls.so.elf-mips64el | \ RUN: FileCheck %s -check-prefix GOT-TLS -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-empty.exe.mipsel | \ +RUN: llvm-readobj -A %p/Inputs/got-empty.exe.mipsel | \ RUN: FileCheck %s -check-prefix GOT-EMPTY -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-static.exe.mips | \ +RUN: llvm-readobj -A %p/Inputs/got-static.exe.mips | \ RUN: FileCheck %s -check-prefix GOT-STATIC -RUN: llvm-readelf --mips-plt-got %p/Inputs/dynamic-table-exe.mips | \ +RUN: llvm-readelf -A %p/Inputs/dynamic-table-exe.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-EXE -RUN: llvm-readelf --mips-plt-got %p/Inputs/dynamic-table-so.mips | \ +RUN: llvm-readelf -A %p/Inputs/dynamic-table-so.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-SO -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-tls.so.elf-mips64el | \ +RUN: llvm-readelf -A %p/Inputs/got-tls.so.elf-mips64el | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-TLS -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-empty.exe.mipsel | \ +RUN: llvm-readelf -A %p/Inputs/got-empty.exe.mipsel | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-EMPTY -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-static.exe.mips | \ +RUN: llvm-readelf -A %p/Inputs/got-static.exe.mips | \ RUN: FileCheck %s --strict-whitespace -check-prefix GNU-GOT-STATIC GOT-EXE: Primary GOT { diff --git a/test/tools/llvm-readobj/mips-options-sec.test b/test/tools/llvm-readobj/mips-options-sec.test index 64b3f0e91795..3636d56cfe6e 100644 --- a/test/tools/llvm-readobj/mips-options-sec.test +++ 
b/test/tools/llvm-readobj/mips-options-sec.test @@ -1,4 +1,4 @@ -RUN: llvm-readobj --mips-options %p/Inputs/options.obj.elf-mipsel | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/options.obj.elf-mipsel | FileCheck %s CHECK: MIPS Options { CHECK-NEXT: ODK_REGINFO { diff --git a/test/tools/llvm-readobj/mips-plt.test b/test/tools/llvm-readobj/mips-plt.test index b130a67d0443..4e40ca6aa2c1 100644 --- a/test/tools/llvm-readobj/mips-plt.test +++ b/test/tools/llvm-readobj/mips-plt.test @@ -1,5 +1,5 @@ -RUN: llvm-readobj --mips-plt-got %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s -RUN: llvm-readelf --mips-plt-got %p/Inputs/got-plt.exe.elf-mipsel | FileCheck --check-prefix=GNU %s +RUN: llvm-readobj -A %p/Inputs/got-plt.exe.elf-mipsel | FileCheck %s +RUN: llvm-readelf -A %p/Inputs/got-plt.exe.elf-mipsel | FileCheck --check-prefix=GNU %s CHECK: PLT GOT { CHECK-NEXT: Reserved entries [ diff --git a/test/tools/llvm-readobj/mips-reginfo.test b/test/tools/llvm-readobj/mips-reginfo.test index 7571d4c56bf0..20177a99d8cb 100644 --- a/test/tools/llvm-readobj/mips-reginfo.test +++ b/test/tools/llvm-readobj/mips-reginfo.test @@ -1,4 +1,4 @@ -RUN: llvm-readobj --mips-reginfo %p/Inputs/reginfo.obj.elf-mipsel | FileCheck %s +RUN: llvm-readobj -A %p/Inputs/reginfo.obj.elf-mipsel | FileCheck %s CHECK: MIPS RegInfo { CHECK-NEXT: GP: 0x7FEF diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 5919a7eed3e3..2e3041fed183 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -519,9 +519,16 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer, if (Obj->isELF()) { if (opts::ELFLinkerOptions) Dumper->printELFLinkerOptions(); - if (opts::ArchSpecificInfo) + if (opts::ArchSpecificInfo) { if (Obj->getArch() == llvm::Triple::arm) Dumper->printAttributes(); + else if (isMipsArch(Obj->getArch())) { + Dumper->printMipsABIFlags(); + Dumper->printMipsOptions(); + Dumper->printMipsReginfo(); + Dumper->printMipsPLTGOT(); + } + } if (isMipsArch(Obj->getArch())) { if (opts::MipsPLTGOT) Dumper->printMipsPLTGOT(); From 329896719f4b6a86946a8794e0be158a712fa709 Mon Sep 17 00:00:00 2001 From: Simon Atanasyan Date: Thu, 3 Oct 2019 12:08:04 +0000 Subject: [PATCH 48/82] [mips] Use llvm-readobj `-A` flag in test cases. 
NFC git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373589 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/Mips/micromips-ase-function-attribute.ll | 2 +- test/MC/Mips/crc/module-crc.s | 2 +- test/MC/Mips/crc/module-nocrc.s | 2 +- test/MC/Mips/ginv/module-ginv.s | 2 +- test/MC/Mips/ginv/module-noginv.s | 2 +- test/MC/Mips/micromips-ase-directive.s | 2 +- test/MC/Mips/mips_abi_flags_xx.s | 8 ++++---- test/MC/Mips/mips_abi_flags_xx_set.s | 2 +- test/MC/Mips/module-hardfloat.s | 2 +- test/MC/Mips/module-softfloat.s | 2 +- test/MC/Mips/mt/abiflag.s | 2 +- test/MC/Mips/mt/module-directive.s | 2 +- test/MC/Mips/mt/set-directive.s | 2 +- test/MC/Mips/virt/module-novirt.s | 2 +- test/MC/Mips/virt/module-virt.s | 2 +- test/Object/Mips/abi-flags.yaml | 2 +- test/tools/llvm-readobj/mips-abiflags.test | 4 ++-- 17 files changed, 21 insertions(+), 21 deletions(-) diff --git a/test/CodeGen/Mips/micromips-ase-function-attribute.ll b/test/CodeGen/Mips/micromips-ase-function-attribute.ll index fe82b7c5b6cf..cd78166d372a 100644 --- a/test/CodeGen/Mips/micromips-ase-function-attribute.ll +++ b/test/CodeGen/Mips/micromips-ase-function-attribute.ll @@ -1,5 +1,5 @@ ; RUN: llc -mtriple=mips-unknown-linux -filetype=obj %s -o - | \ -; RUN: llvm-readobj --mips-abi-flags | \ +; RUN: llvm-readobj -A | \ ; RUN: FileCheck --check-prefix=ASE-MICROMIPS %s define void @_Z3foov() #0 { diff --git a/test/MC/Mips/crc/module-crc.s b/test/MC/Mips/crc/module-crc.s index 92c428e67ff9..66c54647cf44 100644 --- a/test/MC/Mips/crc/module-crc.s +++ b/test/MC/Mips/crc/module-crc.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r6 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module crc diff --git a/test/MC/Mips/crc/module-nocrc.s b/test/MC/Mips/crc/module-nocrc.s index c67279194c8e..193ed360b574 100644 --- a/test/MC/Mips/crc/module-nocrc.s +++ b/test/MC/Mips/crc/module-nocrc.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r6 -filetype=obj -o - -mattr=+crc | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module nocrc diff --git a/test/MC/Mips/ginv/module-ginv.s b/test/MC/Mips/ginv/module-ginv.s index 07f1bc4d40e9..8adcd90b23f7 100644 --- a/test/MC/Mips/ginv/module-ginv.s +++ b/test/MC/Mips/ginv/module-ginv.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r6 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module ginv diff --git a/test/MC/Mips/ginv/module-noginv.s b/test/MC/Mips/ginv/module-noginv.s index 2ed4fd9c314b..611d72c52d56 100644 --- a/test/MC/Mips/ginv/module-noginv.s +++ b/test/MC/Mips/ginv/module-noginv.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r6 -filetype=obj -o - -mattr=+ginv | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module noginv diff --git a/test/MC/Mips/micromips-ase-directive.s b/test/MC/Mips/micromips-ase-directive.s index f3ac60057dc5..fef40ecc3eeb 100644 --- a/test/MC/Mips/micromips-ase-directive.s +++ b/test/MC/Mips/micromips-ase-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc -triple=mips-unknown-linux -filetype=obj %s 
-o - | \ -# RUN: llvm-readobj --mips-abi-flags | \ +# RUN: llvm-readobj -A | \ # RUN: FileCheck --check-prefix=ASE-MICROMIPS %s .set micromips diff --git a/test/MC/Mips/mips_abi_flags_xx.s b/test/MC/Mips/mips_abi_flags_xx.s index 94101ae0c8f5..f8386b49774f 100644 --- a/test/MC/Mips/mips_abi_flags_xx.s +++ b/test/MC/Mips/mips_abi_flags_xx.s @@ -2,19 +2,19 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R1,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mattr=fpxx -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R1,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips-unknown-linux-gnu -mcpu=mips32r6 -mattr=fpxx -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-32R6,CHECK-OBJ-MIPS # RUN: llvm-mc /dev/null -triple mips64-unknown-linux-gnu -mcpu=octeon -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefixes=CHECK-OBJ,CHECK-OBJ-64R2,CHECK-OBJ-OCTEON # CHECK-ASM: .module fp=xx diff --git a/test/MC/Mips/mips_abi_flags_xx_set.s b/test/MC/Mips/mips_abi_flags_xx_set.s index f2445eba7774..8e4e2dbcf534 100644 --- a/test/MC/Mips/mips_abi_flags_xx_set.s +++ b/test/MC/Mips/mips_abi_flags_xx_set.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --sections --section-data --section-relocations --mips-abi-flags - | \ +# RUN: llvm-readobj --sections --section-data --section-relocations -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module fp=xx diff --git a/test/MC/Mips/module-hardfloat.s b/test/MC/Mips/module-hardfloat.s index f29fbc09353c..5738a09a91b9 100644 --- a/test/MC/Mips/module-hardfloat.s +++ b/test/MC/Mips/module-hardfloat.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -triple mips-unknown-linux-gnu -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module hardfloat diff --git a/test/MC/Mips/module-softfloat.s b/test/MC/Mips/module-softfloat.s index 77e62e38e201..94ab7be63dcc 100644 --- a/test/MC/Mips/module-softfloat.s +++ b/test/MC/Mips/module-softfloat.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32 -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module softfloat diff --git a/test/MC/Mips/mt/abiflag.s b/test/MC/Mips/mt/abiflag.s index 2d03c5d1106c..d067c55587c9 100644 --- a/test/MC/Mips/mt/abiflag.s +++ b/test/MC/Mips/mt/abiflag.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 
-mattr=+mt -filetype=obj -o - \ -# RUN: | llvm-readobj --mips-abi-flags | FileCheck %s +# RUN: | llvm-readobj -A | FileCheck %s # Test that the usage of the MT ASE is recorded in .MIPS.abiflags diff --git a/test/MC/Mips/mt/module-directive.s b/test/MC/Mips/mt/module-directive.s index 0d9ab97b4550..1bbe91147545 100644 --- a/test/MC/Mips/mt/module-directive.s +++ b/test/MC/Mips/mt/module-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags | FileCheck --check-prefix=CHECK-OBJ %s +# RUN: llvm-readobj -A | FileCheck --check-prefix=CHECK-OBJ %s # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \ # RUN: FileCheck --check-prefix=CHECK-ASM %s diff --git a/test/MC/Mips/mt/set-directive.s b/test/MC/Mips/mt/set-directive.s index 9088655d8c5d..5d18486059d4 100644 --- a/test/MC/Mips/mt/set-directive.s +++ b/test/MC/Mips/mt/set-directive.s @@ -1,5 +1,5 @@ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags | FileCheck %s --check-prefix=CHECK-OBJ +# RUN: llvm-readobj -A | FileCheck %s --check-prefix=CHECK-OBJ # RUN: llvm-mc < %s -arch=mips -mcpu=mips32r2 -filetype=asm -o - | \ # RUN: FileCheck %s --check-prefix=CHECK-ASM diff --git a/test/MC/Mips/virt/module-novirt.s b/test/MC/Mips/virt/module-novirt.s index 0f531dbbc80b..6b953d0c5857 100644 --- a/test/MC/Mips/virt/module-novirt.s +++ b/test/MC/Mips/virt/module-novirt.s @@ -2,7 +2,7 @@ # RUN: FileCheck %s -check-prefix=CHECK-ASM # # RUN: llvm-mc %s -arch=mips -mcpu=mips32r5 -filetype=obj -o - -mattr=+virt | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module novirt diff --git a/test/MC/Mips/virt/module-virt.s b/test/MC/Mips/virt/module-virt.s index ae38b83d8486..1fb035df8783 100644 --- a/test/MC/Mips/virt/module-virt.s +++ b/test/MC/Mips/virt/module-virt.s @@ -3,7 +3,7 @@ # # RUN: llvm-mc %s -triple=mips-unknown-linux-gnu -mcpu=mips32r5 \ # RUN: -filetype=obj -o - | \ -# RUN: llvm-readobj --mips-abi-flags - | \ +# RUN: llvm-readobj -A - | \ # RUN: FileCheck %s -check-prefix=CHECK-OBJ # CHECK-ASM: .module virt diff --git a/test/Object/Mips/abi-flags.yaml b/test/Object/Mips/abi-flags.yaml index b5142fd3303a..ce8234a9a0db 100644 --- a/test/Object/Mips/abi-flags.yaml +++ b/test/Object/Mips/abi-flags.yaml @@ -1,5 +1,5 @@ # RUN: yaml2obj %s > %t -# RUN: llvm-readobj --mips-abi-flags %t | FileCheck -check-prefix=OBJ %s +# RUN: llvm-readobj -A %t | FileCheck -check-prefix=OBJ %s # RUN: obj2yaml %t | FileCheck -check-prefix=YAML %s # OBJ: MIPS ABI Flags { diff --git a/test/tools/llvm-readobj/mips-abiflags.test b/test/tools/llvm-readobj/mips-abiflags.test index c06d147397eb..f014c10340fb 100644 --- a/test/tools/llvm-readobj/mips-abiflags.test +++ b/test/tools/llvm-readobj/mips-abiflags.test @@ -1,6 +1,6 @@ -RUN: llvm-readobj --mips-abi-flags %p/Inputs/abiflags.obj.elf-mipsel | \ +RUN: llvm-readobj -A %p/Inputs/abiflags.obj.elf-mipsel | \ RUN: FileCheck -check-prefix=EL64 %s -RUN: llvm-readobj --mips-abi-flags %p/Inputs/abiflags.obj.elf-mips | \ +RUN: llvm-readobj -A %p/Inputs/abiflags.obj.elf-mips | \ RUN: FileCheck -check-prefix=BE32 %s EL64: MIPS ABI Flags { From 43e1f17ba6bb933bd1b814323cb75ab72b32efe0 Mon Sep 17 00:00:00 2001 From: Simon Atanasyan Date: Thu, 3 Oct 2019 12:08:11 +0000 Subject: [PATCH 49/82] [llvm-readobj][mips] Remove non-standard --misp-xxx flags llvm-readobj "non-standard" flags `--mips-plt-got`, 
`--mips-abi-flags`, `--mips-reginfo`, and `--mips-options` are superseded by the `--arch-specific` flag and can be removed now. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373590 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/llvm-readobj/llvm-readobj.cpp | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp index 2e3041fed183..dfcda9996310 100644 --- a/tools/llvm-readobj/llvm-readobj.cpp +++ b/tools/llvm-readobj/llvm-readobj.cpp @@ -237,23 +237,6 @@ namespace opts { cl::alias ArchSpecifcInfoShort("A", cl::desc("Alias for --arch-specific"), cl::aliasopt(ArchSpecificInfo), cl::NotHidden); - // --mips-plt-got - cl::opt - MipsPLTGOT("mips-plt-got", - cl::desc("Display the MIPS GOT and PLT GOT sections")); - - // --mips-abi-flags - cl::opt MipsABIFlags("mips-abi-flags", - cl::desc("Display the MIPS.abiflags section")); - - // --mips-reginfo - cl::opt MipsReginfo("mips-reginfo", - cl::desc("Display the MIPS .reginfo section")); - - // --mips-options - cl::opt MipsOptions("mips-options", - cl::desc("Display the MIPS .MIPS.options section")); - // --coff-imports cl::opt COFFImports("coff-imports", cl::desc("Display the PE/COFF import table")); @@ -529,16 +512,6 @@ static void dumpObject(const ObjectFile *Obj, ScopedPrinter &Writer, Dumper->printMipsPLTGOT(); } } - if (isMipsArch(Obj->getArch())) { - if (opts::MipsPLTGOT) - Dumper->printMipsPLTGOT(); - if (opts::MipsABIFlags) - Dumper->printMipsABIFlags(); - if (opts::MipsReginfo) - Dumper->printMipsReginfo(); - if (opts::MipsOptions) - Dumper->printMipsOptions(); - } if (opts::SectionGroups) Dumper->printGroupSections(); if (opts::HashHistogram) From 09d1a7a6cbcaf171c23c6171badee17c1de736cf Mon Sep 17 00:00:00 2001 From: Simon Atanasyan Date: Thu, 3 Oct 2019 12:08:26 +0000 Subject: [PATCH 50/82] [mips] Push `fixup_Mips_LO16` fixup for `jialc` and `jic` instructions git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373591 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 7 +++++-- test/MC/Mips/micromips32r6/relocations.s | 10 ++++++++++ test/MC/Mips/mips32r6/relocations.s | 10 ++++++++++ test/MC/Mips/mips64r6/relocations.s | 10 ++++++++++ 4 files changed, 35 insertions(+), 2 deletions(-) diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 759a7fdb32b8..142e9cebb79e 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -485,8 +485,11 @@ getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo, assert(MO.isExpr() && "getJumpOffset16OpValue expects only expressions or an immediate"); - // TODO: Push fixup. - return 0; + const MCExpr *Expr = MO.getExpr(); + Mips::Fixups FixupKind = + isMicroMips(STI) ? 
Mips::fixup_MICROMIPS_LO16 : Mips::fixup_Mips_LO16; + Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind))); + return 0; } /// getJumpTargetOpValue - Return binary encoding of the jump diff --git a/test/MC/Mips/micromips32r6/relocations.s b/test/MC/Mips/micromips32r6/relocations.s index 7e8f3f6107e6..615b445a0faa 100644 --- a/test/MC/Mips/micromips32r6/relocations.s +++ b/test/MC/Mips/micromips32r6/relocations.s @@ -26,6 +26,12 @@ # CHECK-FIXUP: bnezc $3, bar # encoding: [0xa0,0b011AAAAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar-4, kind: fixup_MICROMIPS_PC21_S1 +# CHECK-FIXUP: jialc $5, bar # encoding: [0x80,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_MICROMIPS_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: [0xa0,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_MICROMIPS_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. #------------------------------------------------------------------------------ @@ -37,6 +43,8 @@ # CHECK-ELF: 0x10 R_MICROMIPS_PC19_S2 bar 0x0 # CHECK-ELF: 0x14 R_MICROMIPS_PC21_S1 bar 0x0 # CHECK-ELF: 0x18 R_MICROMIPS_PC21_S1 bar 0x0 +# CHECK-ELF: 0x1C R_MICROMIPS_LO16 bar 0x0 +# CHECK-ELF: 0x20 R_MICROMIPS_LO16 bar 0x0 # CHECK-ELF: ] balc bar @@ -46,3 +54,5 @@ lwpc $2,bar beqzc $3, bar bnezc $3, bar + jialc $5, bar + jic $5, bar diff --git a/test/MC/Mips/mips32r6/relocations.s b/test/MC/Mips/mips32r6/relocations.s index 3f42ee8f4717..8095fb156ec9 100644 --- a/test/MC/Mips/mips32r6/relocations.s +++ b/test/MC/Mips/mips32r6/relocations.s @@ -40,6 +40,12 @@ # CHECK-FIXUP: lwpc $2, bar # encoding: [0xec,0b01001AAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2 +# CHECK-FIXUP: jialc $5, bar # encoding: [0xf8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: [0xd8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. #------------------------------------------------------------------------------ @@ -55,6 +61,8 @@ # CHECK-ELF: 0x20 R_MIPS_PCLO16 bar 0x0 # CHECK-ELF: 0x24 R_MIPS_PC19_S2 bar 0x0 # CHECK-ELF: 0x28 R_MIPS_PC19_S2 bar 0x0 +# CHECK-ELF: 0x2C R_MIPS_LO16 bar 0x0 +# CHECK-ELF: 0x30 R_MIPS_LO16 bar 0x0 # CHECK-ELF: ] addiupc $2,bar @@ -68,3 +76,5 @@ addiu $2, $2, %pcrel_lo(bar) lapc $2,bar lwpc $2,bar + jialc $5, bar + jic $5, bar diff --git a/test/MC/Mips/mips64r6/relocations.s b/test/MC/Mips/mips64r6/relocations.s index 4f4efda07c69..5e70f44b96e1 100644 --- a/test/MC/Mips/mips64r6/relocations.s +++ b/test/MC/Mips/mips64r6/relocations.s @@ -47,6 +47,12 @@ # CHECK-FIXUP: lwupc $2, bar # encoding: [0xec,0b01010AAA,A,A] # CHECK-FIXUP: # fixup A - offset: 0, # CHECK-FIXUP: value: bar, kind: fixup_MIPS_PC19_S2 +# CHECK-FIXUP: jialc $5, bar # encoding: [0xf8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 +# CHECK-FIXUP: jic $5, bar # encoding: [0xd8,0x05,A,A] +# CHECK-FIXUP: # fixup A - offset: 0, +# CHECK-FIXUP: value: bar, kind: fixup_Mips_LO16 #------------------------------------------------------------------------------ # Check that the appropriate relocations were created. 
#------------------------------------------------------------------------------
@@ -64,6 +70,8 @@
 # CHECK-ELF:     0x28 R_MIPS_PC18_S3/R_MIPS_NONE/R_MIPS_NONE bar 0x0
 # CHECK-ELF:     0x2C R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0
 # CHECK-ELF:     0x30 R_MIPS_PC19_S2/R_MIPS_NONE/R_MIPS_NONE bar 0x0
+# CHECK-ELF:     0x34 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0
+# CHECK-ELF:     0x38 R_MIPS_LO16/R_MIPS_NONE/R_MIPS_NONE bar 0x0
 # CHECK-ELF: ]

   addiupc $2,bar
@@ -79,3 +87,5 @@
   ldpc $2,bar
   lwpc $2,bar
   lwupc $2,bar
+  jialc $5, bar
+  jic $5, bar

From f57ee998e4a45e8e231c7a3d55fc8eb227d4b5f8 Mon Sep 17 00:00:00 2001
From: George Rimar
Date: Thu, 3 Oct 2019 13:13:23 +0000
Subject: [PATCH 51/82] [llvm-readobj] - Stop using a precompiled binary in all.test

Having a precompiled binary here is excessive.
I also added a few missing tags.

Differential revision: https://reviews.llvm.org/D68386

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373594 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/tools/llvm-readobj/all.test | 39 ++++++++++++++++++++------------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/test/tools/llvm-readobj/all.test b/test/tools/llvm-readobj/all.test
index 17c5a007adfa..2fef5b842277 100644
--- a/test/tools/llvm-readobj/all.test
+++ b/test/tools/llvm-readobj/all.test
@@ -1,16 +1,25 @@
-RUN: llvm-readobj -a %p/Inputs/trivial.obj.elf-i386 \
-RUN:   | FileCheck %s -check-prefix ALL
-RUN: llvm-readobj --all %p/Inputs/trivial.obj.elf-i386 \
-RUN:   | FileCheck %s -check-prefix ALL
+# RUN: yaml2obj %s -o %t.o
+# RUN: llvm-readobj -a %t.o | FileCheck %s --check-prefix ALL
+# RUN: llvm-readobj --all %t.o | FileCheck %s --check-prefix ALL

-ALL: Format: ELF32-i386
-ALL: Arch: i386
-ALL: AddressSize: 32bit
-ALL: LoadName:
-ALL: ElfHeader {
-ALL: Sections [
-ALL: Relocations [
-ALL: Symbols [
-ALL: ProgramHeaders [
-ALL: Notes [
-ALL: StackSizes [
+# ALL: Format: ELF32-i386
+# ALL: Arch: i386
+# ALL: AddressSize: 32bit
+# ALL: LoadName:
+# ALL: ElfHeader {
+# ALL: Sections [
+# ALL: Relocations [
+# ALL: Symbols [
+# ALL: ProgramHeaders [
+# ALL: Version symbols {
+# ALL: SHT_GNU_verdef {
+# ALL: SHT_GNU_verneed {
+# ALL: Notes [
+# ALL: StackSizes [
+
+--- !ELF
+FileHeader:
+  Class:   ELFCLASS32
+  Data:    ELFDATA2LSB
+  Type:    ET_REL
+  Machine: EM_386

From 34ace4c17af9405368e8a025830165d1052e2fda Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Thu, 3 Oct 2019 13:17:21 +0000
Subject: [PATCH 52/82] [Alignment][NFC] Remove StoreInst::setAlignment(unsigned)

Summary:
This patch is part of a series to introduce an Alignment type.
See this thread for context: http://lists.llvm.org/pipermail/llvm-dev/2019-July/133851.html See this patch for the introduction of the type: https://reviews.llvm.org/D64790 Reviewers: courbet, bollu, jdoerfert Subscribers: hiraditya, asbirlea, cfe-commits, llvm-commits Tags: #clang, #llvm Differential Revision: https://reviews.llvm.org/D68268 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373595 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/IR/IRBuilder.h | 2 +- include/llvm/IR/Instructions.h | 2 -- lib/CodeGen/AtomicExpandPass.cpp | 2 +- lib/IR/Core.cpp | 2 +- lib/IR/Instructions.cpp | 8 ++----- lib/Transforms/IPO/Attributor.cpp | 2 +- .../InstCombine/InstCombineAtomicRMW.cpp | 2 +- .../InstCombine/InstCombineCalls.cpp | 21 +++++++++---------- .../InstCombineLoadStoreAlloca.cpp | 12 +++++------ .../Scalar/AlignmentFromAssumptions.cpp | 2 +- lib/Transforms/Scalar/GVNHoist.cpp | 4 ++-- lib/Transforms/Scalar/LICM.cpp | 2 +- lib/Transforms/Scalar/SROA.cpp | 2 +- lib/Transforms/Utils/SimplifyCFG.cpp | 6 +++--- lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 15 files changed, 32 insertions(+), 39 deletions(-) diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h index fa053fa70291..c69815066249 100644 --- a/include/llvm/IR/IRBuilder.h +++ b/include/llvm/IR/IRBuilder.h @@ -1649,7 +1649,7 @@ class IRBuilder : public IRBuilderBase, public Inserter { StoreInst *CreateAlignedStore(Value *Val, Value *Ptr, unsigned Align, bool isVolatile = false) { StoreInst *SI = CreateStore(Val, Ptr, isVolatile); - SI->setAlignment(Align); + SI->setAlignment(MaybeAlign(Align)); return SI; } diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 71cf9fc38d83..eaaf50646462 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -374,8 +374,6 @@ class StoreInst : public Instruction { return 0; } - // FIXME: Remove once migration to Align is over. - void setAlignment(unsigned Align); void setAlignment(MaybeAlign Align); /// Returns the ordering constraint of this store instruction. 
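To make the mechanical rewrites below easier to follow, here is a compressed model of what `MaybeAlign` expresses compared to the removed `unsigned` overload. This is an illustrative simplification, not LLVM code; the real `Align`/`MaybeAlign` live in llvm/Support/Alignment.h and carry considerably more API:

    #include <cassert>
    #include <cstdint>
    #include <optional>

    // Simplified stand-ins for llvm::Align / llvm::MaybeAlign.
    struct Align {
      uint64_t Value;
      explicit Align(uint64_t V) : Value(V) {
        assert(V != 0 && (V & (V - 1)) == 0 &&
               "alignment must be a nonzero power of two");
      }
    };
    using MaybeAlign = std::optional<Align>;

    // The legacy convention encoded "unknown" as 0. Wrapping at the API
    // boundary makes that state explicit, which is what the mechanical
    // setAlignment(MaybeAlign(...)) rewrites in this patch do.
    inline MaybeAlign fromLegacy(uint64_t A) {
      return A ? MaybeAlign(Align(A)) : std::nullopt;
    }

Call sites that have already proven a nonzero value construct `Align` directly (as in the SimplifyCFG and SLPVectorizer hunks further down), while call sites that may still legitimately see 0 wrap the value in `MaybeAlign`; that is exactly the split the rewrites in this patch make.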
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index cc3379f13b4d..27b298dcf6af 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -469,7 +469,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) { Value *NewAddr = Builder.CreateBitCast(Addr, PT); StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr); - NewSI->setAlignment(SI->getAlignment()); + NewSI->setAlignment(MaybeAlign(SI->getAlignment())); NewSI->setVolatile(SI->isVolatile()); NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n"); diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp index a599a0899876..c548c56211ae 100644 --- a/lib/IR/Core.cpp +++ b/lib/IR/Core.cpp @@ -2014,7 +2014,7 @@ void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) { else if (LoadInst *LI = dyn_cast(P)) LI->setAlignment(MaybeAlign(Bytes)); else if (StoreInst *SI = dyn_cast(P)) - SI->setAlignment(Bytes); + SI->setAlignment(MaybeAlign(Bytes)); else llvm_unreachable( "only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment"); diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index 0f000623bdb5..de1317ea9d3f 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -1397,7 +1397,7 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); - setAlignment(Align); + setAlignment(MaybeAlign(Align)); setAtomic(Order, SSID); AssertOK(); } @@ -1413,15 +1413,11 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Op<0>() = val; Op<1>() = addr; setVolatile(isVolatile); - setAlignment(Align); + setAlignment(MaybeAlign(Align)); setAtomic(Order, SSID); AssertOK(); } -void StoreInst::setAlignment(unsigned Align) { - setAlignment(llvm::MaybeAlign(Align)); -} - void StoreInst::setAlignment(MaybeAlign Align) { assert((!Align || *Align <= MaximumAlignment) && "Alignment is greater than MaximumAlignment!"); diff --git a/lib/Transforms/IPO/Attributor.cpp b/lib/Transforms/IPO/Attributor.cpp index 1455a906103a..58ce91c807dd 100644 --- a/lib/Transforms/IPO/Attributor.cpp +++ b/lib/Transforms/IPO/Attributor.cpp @@ -2467,7 +2467,7 @@ struct AAAlignImpl : AAAlign { if (SI->getAlignment() < getAssumedAlign()) { STATS_DECLTRACK(AAAlign, Store, "Number of times alignemnt added to a store"); - SI->setAlignment(getAssumedAlign()); + SI->setAlignment(Align(getAssumedAlign())); Changed = ChangeStatus::CHANGED; } } else if (auto *LI = dyn_cast(U.getUser())) { diff --git a/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp index 2cfd3f5bb17f..825f4b468b0a 100644 --- a/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp +++ b/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp @@ -124,7 +124,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) { auto *SI = new StoreInst(RMWI.getValOperand(), RMWI.getPointerOperand(), &RMWI); SI->setAtomic(Ordering, RMWI.getSyncScopeID()); - SI->setAlignment(DL.getABITypeAlignment(RMWI.getType())); + SI->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType()))); return eraseInstFromFunction(RMWI); } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index bc458ebf6521..8d4b0dc0a7a7 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -185,7 +185,8 @@ Instruction 
*InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy); LoadInst *L = Builder.CreateLoad(IntType, Src); // Alignment from the mem intrinsic will be better, so use it. - L->setAlignment(MaybeAlign(CopySrcAlign)); + L->setAlignment( + MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) L->setMetadata(LLVMContext::MD_tbaa, CopyMD); MDNode *LoopMemParallelMD = @@ -198,7 +199,8 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { StoreInst *S = Builder.CreateStore(L, Dest); // Alignment from the mem intrinsic will be better, so use it. - S->setAlignment(CopyDstAlign); + S->setAlignment( + MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead. if (CopyMD) S->setMetadata(LLVMContext::MD_tbaa, CopyMD); if (LoopMemParallelMD) @@ -223,9 +225,10 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) { } Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { - unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); - if (MI->getDestAlignment() < Alignment) { - MI->setDestAlignment(Alignment); + const unsigned KnownAlignment = + getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT); + if (MI->getDestAlignment() < KnownAlignment) { + MI->setDestAlignment(KnownAlignment); return MI; } @@ -243,13 +246,9 @@ Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) { ConstantInt *FillC = dyn_cast(MI->getValue()); if (!LenC || !FillC || !FillC->getType()->isIntegerTy(8)) return nullptr; - uint64_t Len = LenC->getLimitedValue(); - Alignment = MI->getDestAlignment(); + const uint64_t Len = LenC->getLimitedValue(); assert(Len && "0-sized memory setting should be removed already."); - - // Alignment 0 is identity for alignment 1 for memset, but not store. - if (Alignment == 0) - Alignment = 1; + const Align Alignment = assumeAligned(MI->getDestAlignment()); // If it is an atomic and alignment is less than the size then we will // introduce the unaligned memory access which will be later transformed diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index eb01b4b7d7d1..4c5e1cc43760 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1356,15 +1356,15 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { return eraseInstFromFunction(SI); // Attempt to improve the alignment. - unsigned KnownAlign = getOrEnforceKnownAlignment( - Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT); - unsigned StoreAlign = SI.getAlignment(); - unsigned EffectiveStoreAlign = - StoreAlign != 0 ? StoreAlign : DL.getABITypeAlignment(Val->getType()); + const Align KnownAlign = Align(getOrEnforceKnownAlignment( + Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT)); + const MaybeAlign StoreAlign = MaybeAlign(SI.getAlignment()); + const Align EffectiveStoreAlign = + StoreAlign ? *StoreAlign : Align(DL.getABITypeAlignment(Val->getType())); if (KnownAlign > EffectiveStoreAlign) SI.setAlignment(KnownAlign); - else if (StoreAlign == 0) + else if (!StoreAlign) SI.setAlignment(EffectiveStoreAlign); // Try to canonicalize the stored type. 
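The visitStoreInst hunk just above encodes a rule that is easy to state in isolation. A hedged restatement as a pure function (alignments as raw byte counts, with 0 meaning "unspecified" as in the legacy API; illustrative only):

    #include <algorithm>
    #include <cstdint>

    // Pick the alignment a store should carry after instcombine: a store
    // with no explicit alignment is treated as ABI-aligned for its type,
    // and the alignment is only ever upgraded, never weakened.
    uint64_t pickStoreAlignment(uint64_t KnownAlign, uint64_t StoreAlign,
                                uint64_t ABITypeAlign) {
      const uint64_t Effective = StoreAlign ? StoreAlign : ABITypeAlign;
      return std::max(KnownAlign, Effective);
    }

The patch writes the result back through two separate setAlignment branches, but the net value stored is exactly this maximum.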
diff --git a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp index 2d135b41279f..0e9f03a06061 100644 --- a/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp +++ b/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp @@ -329,7 +329,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) { SI->getPointerOperand(), SE); if (NewAlignment > SI->getAlignment()) { - SI->setAlignment(NewAlignment); + SI->setAlignment(MaybeAlign(NewAlignment)); ++NumStoreAlignChanged; } } else if (MemIntrinsic *MI = dyn_cast(J)) { diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp index 79b901ac0db8..1f01ba2fbfc6 100644 --- a/lib/Transforms/Scalar/GVNHoist.cpp +++ b/lib/Transforms/Scalar/GVNHoist.cpp @@ -894,8 +894,8 @@ class GVNHoist { ++NumLoadsRemoved; } else if (auto *ReplacementStore = dyn_cast(Repl)) { ReplacementStore->setAlignment( - std::min(ReplacementStore->getAlignment(), - cast(I)->getAlignment())); + MaybeAlign(std::min(ReplacementStore->getAlignment(), + cast(I)->getAlignment()))); ++NumStoresRemoved; } else if (auto *ReplacementAlloca = dyn_cast(Repl)) { ReplacementAlloca->setAlignment( diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp index 5c2176f873bc..2e13e8e4150d 100644 --- a/lib/Transforms/Scalar/LICM.cpp +++ b/lib/Transforms/Scalar/LICM.cpp @@ -1790,7 +1790,7 @@ class LoopPromoter : public LoadAndStorePromoter { StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos); if (UnorderedAtomic) NewSI->setOrdering(AtomicOrdering::Unordered); - NewSI->setAlignment(Alignment); + NewSI->setAlignment(MaybeAlign(Alignment)); NewSI->setDebugLoc(DL); if (AATags) NewSI->setAAMetadata(AATags); diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index cec65ba76eda..c1e935fda7f8 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -3127,7 +3127,7 @@ class llvm::sroa::AllocaSliceRewriter Value *Op = SI->getOperand(0); StoreAlign = DL.getABITypeAlignment(Op->getType()); } - SI->setAlignment(std::min(StoreAlign, getSliceAlign())); + SI->setAlignment(MaybeAlign(std::min(StoreAlign, getSliceAlign()))); continue; } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index 7352ce83adb4..279a844f9e44 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -3087,15 +3087,15 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB, // store that doesn't execute. if (MinAlignment != 0) { // Choose the minimum of all non-zero alignments. - SI->setAlignment(MinAlignment); + SI->setAlignment(Align(MinAlignment)); } else if (MaxAlignment != 0) { // Choose the minimal alignment between the non-zero alignment and the ABI // default alignment for the type of the stored value. - SI->setAlignment(std::min(MaxAlignment, TypeAlignment)); + SI->setAlignment(Align(std::min(MaxAlignment, TypeAlignment))); } else { // If both alignments are zero, use ABI default alignment for the type of // the stored value. 
-    SI->setAlignment(TypeAlignment);
+    SI->setAlignment(Align(TypeAlignment));
   }

   QStore->eraseFromParent();
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 196dbe12b876..99428c6c5dee 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -4024,7 +4024,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
       if (!Alignment)
         Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
-      ST->setAlignment(Alignment);
+      ST->setAlignment(Align(Alignment));
       Value *V = propagateMetadata(ST, E->Scalars);
       if (NeedToShuffleReuses) {
         V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),

From 3441df011233cecd0ba5d2fd8b24a5d75925df0f Mon Sep 17 00:00:00 2001
From: Djordje Todorovic
Date: Thu, 3 Oct 2019 13:18:14 +0000
Subject: [PATCH 53/82] [llvm-locstats] Copy the script only when needed; NFC

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373596 91177308-0d34-0410-b5e6-96231b3b80d8
---
 utils/llvm-locstats/CMakeLists.txt | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/utils/llvm-locstats/CMakeLists.txt b/utils/llvm-locstats/CMakeLists.txt
index aa5aeca149f0..a919023e141e 100644
--- a/utils/llvm-locstats/CMakeLists.txt
+++ b/utils/llvm-locstats/CMakeLists.txt
@@ -1,7 +1,12 @@
 if (LLVM_BUILD_UTILS AND LLVM_BUILD_TOOLS)
-  add_custom_target(llvm-locstats ALL
-    COMMAND ${CMAKE_COMMAND} -E copy ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats
+  add_custom_command(
+    OUTPUT ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats
+    DEPENDS ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py
+    COMMAND ${CMAKE_COMMAND} -E copy ${LLVM_MAIN_SRC_DIR}/utils/llvm-locstats/llvm-locstats.py ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats
     COMMENT "Copying llvm-locstats into ${LLVM_TOOLS_BINARY_DIR}"
     )
+  add_custom_target(llvm-locstats ALL
+    DEPENDS ${LLVM_TOOLS_BINARY_DIR}/llvm-locstats
+    )
   set_target_properties(llvm-locstats PROPERTIES FOLDER "Tools")
 endif()

From befd1c1098cf794d147edffc598239961d923004 Mon Sep 17 00:00:00 2001
From: Roman Lebedev
Date: Thu, 3 Oct 2019 13:36:00 +0000
Subject: [PATCH 54/82] [NFC][InstCombine] Some tests for sub-of-negatible pattern

As we have previously established, `sub` is an outcast, and should be
considered non-canonical iff it can be converted to `add`.
It can be converted to `add` if its second operand can be negated.
So far we mostly only do that for constants and negation itself, but we
should be more thorough.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373597 91177308-0d34-0410-b5e6-96231b3b80d8
---
 .../InstCombine/sub-of-negatible.ll | 292 ++++++++++++++++++
 1 file changed, 292 insertions(+)
 create mode 100644 test/Transforms/InstCombine/sub-of-negatible.ll

diff --git a/test/Transforms/InstCombine/sub-of-negatible.ll b/test/Transforms/InstCombine/sub-of-negatible.ll
new file mode 100644
index 000000000000..2d9910352683
--- /dev/null
+++ b/test/Transforms/InstCombine/sub-of-negatible.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt %s -instcombine -S | FileCheck %s
+
+declare void @use8(i8)
+
+; Constant can be freely negated.
+define i8 @t0(i8 %x) { +; CHECK-LABEL: @t0( +; CHECK-NEXT: [[T0:%.*]] = add i8 [[X:%.*]], 42 +; CHECK-NEXT: ret i8 [[T0]] +; + %t0 = sub i8 %x, -42 + ret i8 %t0 +} + +; Negation can be negated for free +define i8 @t1(i8 %x, i8 %y) { +; CHECK-LABEL: @t1( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = add i8 [[X:%.*]], [[Y]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} + +; Shift-left can be negated if all uses can be updated +define i8 @t2(i8 %x, i8 %y) { +; CHECK-LABEL: @t2( +; CHECK-NEXT: [[T0:%.*]] = shl i8 -42, [[Y:%.*]] +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = shl i8 -42, %y + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n2(i8 %x, i8 %y) { +; CHECK-LABEL: @n2( +; CHECK-NEXT: [[T0:%.*]] = shl i8 -42, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = shl i8 -42, %y + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @t3(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t3( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = shl i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = shl i8 %t0, %y + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n3(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n3( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = shl i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = shl i8 %t0, %y + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Select can be negated if all it's operands can be negated and all the users of select can be updated +define i8 @t4(i8 %x, i1 %y) { +; CHECK-LABEL: @t4( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 44 +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 44 + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n4(i8 %x, i1 %y) { +; CHECK-LABEL: @n4( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 44 +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 44 + call void @use8(i8 %t0) + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @n5(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @n5( +; CHECK-NEXT: [[T0:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 [[Z:%.*]] +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[X:%.*]], [[T0]] +; CHECK-NEXT: ret i8 [[T1]] +; + %t0 = select i1 %y, i8 -42, i8 %z + %t1 = sub i8 %x, %t0 + ret i8 %t1 +} +define i8 @t6(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @t6( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 -42, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = select i1 %y, i8 -42, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @t7(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @t7( +; CHECK-NEXT: [[T0:%.*]] = shl i8 1, [[Z:%.*]] +; 
CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 0, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = shl i8 1, %z + %t1 = select i1 %y, i8 0, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n8(i8 %x, i1 %y, i8 %z) { +; CHECK-LABEL: @n8( +; CHECK-NEXT: [[T0:%.*]] = shl i8 1, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = select i1 [[Y:%.*]], i8 0, i8 [[T0]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = shl i8 1, %z + call void @use8(i8 %t0) + %t1 = select i1 %y, i8 0, i8 %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Subtraction can be negated if the first operand can be negated +; x - (y - z) -> x - y + z -> x + (-y) + z +define i8 @t9(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t9( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T11:%.*]] = add i8 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[T11]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %t0, %y + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n10(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n10( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 [[T0]], [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %t0, %y + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n11(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n11( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = add i8 [[Y:%.*]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %z + call void @use8(i8 %t0) + %t1 = sub i8 %y, %t0 + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} + +; Addition can be negated if both operands can be negated +; x - (y + z) -> x - y - z -> x + ((-y) + (-z))) +define i8 @t12(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t12( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] +; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T3]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 0, %z + call void @use8(i8 %t1) + %t2 = add i8 %t0, %t1 + %t3 = sub i8 %x, %t2 + ret i8 %t3 +} +define i8 @n13(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n13( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T11:%.*]] = sub i8 [[Y]], [[Z:%.*]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[T11]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = add i8 %t0, %z + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n14(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n14( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = sub i8 0, [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i8 [[Y]], [[Z]] +; CHECK-NEXT: [[T2:%.*]] = sub i8 0, [[TMP1]] +; CHECK-NEXT: call void @use8(i8 [[T2]]) +; CHECK-NEXT: [[T3:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T3]] +; + %t0 = 
sub i8 0, %y + call void @use8(i8 %t0) + %t1 = sub i8 0, %z + call void @use8(i8 %t1) + %t2 = add i8 %t0, %t1 + call void @use8(i8 %t2) + %t3 = sub i8 %x, %t2 + ret i8 %t3 +} + +; Multiplication can be negated if either one of operands can be negated +; x - (y * z) -> x + ((-y) * z) or x + ((-z) * y) +define i8 @t15(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @t15( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[TMP1:%.*]] = mul i8 [[Z:%.*]], [[Y]] +; CHECK-NEXT: [[T2:%.*]] = add i8 [[TMP1]], [[X:%.*]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = mul i8 %t0, %z + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} +define i8 @n16(i8 %x, i8 %y, i8 %z) { +; CHECK-LABEL: @n16( +; CHECK-NEXT: [[T0:%.*]] = sub i8 0, [[Y:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T0]]) +; CHECK-NEXT: [[T1:%.*]] = mul i8 [[T0]], [[Z:%.*]] +; CHECK-NEXT: call void @use8(i8 [[T1]]) +; CHECK-NEXT: [[T2:%.*]] = sub i8 [[X:%.*]], [[T1]] +; CHECK-NEXT: ret i8 [[T2]] +; + %t0 = sub i8 0, %y + call void @use8(i8 %t0) + %t1 = mul i8 %t0, %z + call void @use8(i8 %t1) + %t2 = sub i8 %x, %t1 + ret i8 %t2 +} From 265ff646af74fcac6bceaf043c32a8403d2d7eb0 Mon Sep 17 00:00:00 2001 From: George Rimar Date: Thu, 3 Oct 2019 13:57:08 +0000 Subject: [PATCH 55/82] [yaml2obj/obj2yaml] - Add support for SHT_LLVM_ADDRSIG sections. SHT_LLVM_ADDRSIG is described here: https://llvm.org/docs/Extensions.html#sht-llvm-addrsig-section-address-significance-table This patch teaches tools to dump them and to parse the YAML declarations of such sections. Differential revision: https://reviews.llvm.org/D68333 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373598 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ObjectYAML/ELFYAML.h | 27 ++- lib/ObjectYAML/ELFEmitter.cpp | 30 +++ lib/ObjectYAML/ELFYAML.cpp | 37 +++ .../tools/llvm-readobj/elf-section-types.test | 1 + .../obj2yaml/elf-llvm-addrsig-section.yaml | 98 ++++++++ .../yaml2obj/elf-llvm-addrsig-section.yaml | 213 ++++++++++++++++++ tools/obj2yaml/elf2yaml.cpp | 74 +++++- 7 files changed, 469 insertions(+), 11 deletions(-) create mode 100644 test/tools/obj2yaml/elf-llvm-addrsig-section.yaml create mode 100644 test/tools/yaml2obj/elf-llvm-addrsig-section.yaml diff --git a/include/llvm/ObjectYAML/ELFYAML.h b/include/llvm/ObjectYAML/ELFYAML.h index 1662d06bf91e..592b5021bd69 100644 --- a/include/llvm/ObjectYAML/ELFYAML.h +++ b/include/llvm/ObjectYAML/ELFYAML.h @@ -137,7 +137,8 @@ struct Section { StackSizes, SymtabShndxSection, Symver, - MipsABIFlags + MipsABIFlags, + Addrsig }; SectionKind Kind; StringRef Name; @@ -256,6 +257,25 @@ struct VerneedSection : Section { } }; +struct AddrsigSymbol { + AddrsigSymbol(StringRef N) : Name(N), Index(None) {} + AddrsigSymbol(llvm::yaml::Hex32 Ndx) : Name(None), Index(Ndx) {} + AddrsigSymbol() : Name(None), Index(None) {} + + Optional Name; + Optional Index; +}; + +struct AddrsigSection : Section { + Optional Content; + Optional> Symbols; + + AddrsigSection() : Section(SectionKind::Addrsig) {} + static bool classof(const Section *S) { + return S->Kind == SectionKind::Addrsig; + } +}; + struct SymverSection : Section { std::vector Entries; @@ -362,6 +382,7 @@ struct Object { } // end namespace ELFYAML } // end namespace llvm +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::AddrsigSymbol) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::StackSizeEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::DynamicEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::ProgramHeader) @@ -518,6 
+539,10 @@ template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::VernauxEntry &E); }; +template <> struct MappingTraits { + static void mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym); +}; + template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::Relocation &Rel); }; diff --git a/lib/ObjectYAML/ELFEmitter.cpp b/lib/ObjectYAML/ELFEmitter.cpp index c85cf4c924f0..3f3b27c5bfad 100644 --- a/lib/ObjectYAML/ELFEmitter.cpp +++ b/lib/ObjectYAML/ELFEmitter.cpp @@ -174,6 +174,10 @@ template class ELFState { void writeSectionContent(Elf_Shdr &SHeader, const ELFYAML::HashSection &Section, ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA); + ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH); public: @@ -423,6 +427,8 @@ void ELFState::initSectionHeaders(std::vector &SHeaders, writeSectionContent(SHeader, *S, CBA); } else if (auto S = dyn_cast(Sec)) { writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); } else { llvm_unreachable("Unknown section type"); } @@ -990,6 +996,30 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, Section.Content->writeAsBinary(OS); } +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + unsigned Link = 0; + if (Section.Link.empty() && SN2I.lookup(".symtab", Link)) + SHeader.sh_link = Link; + + if (Section.Content) { + SHeader.sh_size = writeContent(OS, Section.Content, None); + return; + } + + for (const ELFYAML::AddrsigSymbol &Sym : *Section.Symbols) { + uint64_t Val = + Sym.Name ? 
toSymbolIndex(*Sym.Name, Section.Name, /*IsDynamic=*/false) + : (uint32_t)*Sym.Index; + SHeader.sh_size += encodeULEB128(Val, OS); + } +} + template void ELFState::buildSectionIndex() { for (unsigned I = 0, E = Doc.Sections.size(); I != E; ++I) { StringRef Name = Doc.Sections[I]->Name; diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index 0dd6854cfee0..e295a000ef8a 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -1071,6 +1071,12 @@ static void sectionMapping(IO &IO, ELFYAML::SymtabShndxSection &Section) { IO.mapRequired("Entries", Section.Entries); } +static void sectionMapping(IO &IO, ELFYAML::AddrsigSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Content", Section.Content); + IO.mapOptional("Symbols", Section.Symbols); +} + void MappingTraits::mapping( IO &IO, ELFYAML::SectionOrType §ionOrType) { IO.mapRequired("SectionOrType", sectionOrType.sectionNameOrType); @@ -1161,6 +1167,11 @@ void MappingTraits>::mapping( Section.reset(new ELFYAML::SymtabShndxSection()); sectionMapping(IO, *cast(Section.get())); break; + case ELF::SHT_LLVM_ADDRSIG: + if (!IO.outputting()) + Section.reset(new ELFYAML::AddrsigSection()); + sectionMapping(IO, *cast(Section.get())); + break; default: if (!IO.outputting()) { StringRef Name; @@ -1233,6 +1244,26 @@ StringRef MappingTraits>::validate( return {}; } + if (const auto *Sec = dyn_cast(Section.get())) { + if (!Sec->Symbols && !Sec->Content) + return "one of \"Symbols\" or \"Content\" must be specified"; + + if (Sec->Content) { + if (Sec->Symbols) + return "\"Content\" and \"Symbols\" cannot be used together"; + return {}; + } + + if (!Sec->Symbols) + return {}; + + for (const ELFYAML::AddrsigSymbol &AS : *Sec->Symbols) + if (AS.Index && AS.Name) + return "\"Index\" and \"Name\" cannot be used together when defining a " + "symbol"; + return {}; + } + return {}; } @@ -1340,6 +1371,12 @@ void MappingTraits::mapping(IO &IO, ELFYAML::Object &Object) { IO.setContext(nullptr); } +void MappingTraits::mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("Name", Sym.Name); + IO.mapOptional("Index", Sym.Index); +} + LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_AFL_REG) LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_ABI_FP) LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_EXT) diff --git a/test/tools/llvm-readobj/elf-section-types.test b/test/tools/llvm-readobj/elf-section-types.test index aad9f43c8a30..20b881249c7f 100644 --- a/test/tools/llvm-readobj/elf-section-types.test +++ b/test/tools/llvm-readobj/elf-section-types.test @@ -196,6 +196,7 @@ Sections: Type: SHT_LLVM_CALL_GRAPH_PROFILE - Name: llvm_addrsig Type: SHT_LLVM_ADDRSIG + Symbols: - Name: .deplibs Type: SHT_LLVM_DEPENDENT_LIBRARIES - Name: .llvm_sympart.f diff --git a/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml b/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml new file mode 100644 index 000000000000..6f21c3212bd9 --- /dev/null +++ b/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml @@ -0,0 +1,98 @@ +## Check how obj2yaml dumps the SHT_LLVM_ADDRSIG section. + +## Check that when possible obj2yaml tries to produce the "Name" tag when +## dumping entries of the SHT_LLVM_ADDRSIG section. It falls back to producing +## the "Index" tag when it can't match a symbol index with a symbol table entry. 
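## NOTE (editorial summary of dumpAddrsigSection in tools/obj2yaml/elf2yaml.cpp,
## added later in this patch): the dumper walks the ULEB128-encoded entries and
## resolves each index through the section's sh_link symbol table. An entry is
## printed as "Name" when a non-empty symbol name is found and as "Index"
## otherwise; if decoding fails, the whole section falls back to raw "Content".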
+ +# RUN: yaml2obj --docnum=1 %s -o %t1 +# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=NAME + +# NAME: - Name: .llvm_addrsig +# NAME-NEXT: Type: SHT_LLVM_ADDRSIG +# NAME-NEXT: Link: .symtab +# NAME-NEXT: Symbols: +# NAME-NEXT: - Name: foo +# NAME-NEXT: - Name: bar +# NAME-NEXT: - Index: 0x00000003 +# NAME-NEXT: - Index: 0xFFFFFFFF +# NAME: - Name: .llvm_addrsig_unlinked +# NAME-NEXT: Type: SHT_LLVM_ADDRSIG +# NAME-NEXT: Symbols: +# NAME-NEXT: - Index: 0x00000001 +# NAME-NEXT: - Index: 0x00000002 +# NAME-NEXT: - Index: 0x00000003 +# NAME-NEXT: - Index: 0xFFFFFFFF + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 1 + - Index: 2 + - Index: 3 + - Index: 0xFFFFFFFF + - Name: .llvm_addrsig_unlinked + Type: SHT_LLVM_ADDRSIG + Link: 0 + Symbols: + - Index: 1 + - Index: 2 + - Index: 3 + - Index: 0xFFFFFFFF +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: bar + Type: STT_FUNC + Binding: STB_GLOBAL + +## Check that obj2yaml dumps the SHT_LLVM_ADDRSIG section +## data using the "Content" tag when at least one of the entries is broken, +## e.g. because the entry contains a malformed uleb128 value. + +# RUN: yaml2obj --docnum=2 %s -o %t2 +# RUN: obj2yaml %t2 | FileCheck %s --check-prefix=INVALID-ENTRY + +# INVALID-ENTRY: - Name: .llvm_addrsig +# INVALID-ENTRY-NEXT: Type: SHT_LLVM_ADDRSIG +# INVALID-ENTRY-NEXT: Link: .symtab +# INVALID-ENTRY-NEXT: Content: FFFFFFFFFF + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "FFFFFFFFFF" + +## obj2yaml produces a "Symbols" tag when describing an empty SHT_LLVM_ADDRSIG section. + +# RUN: yaml2obj --docnum=3 %s -o %t3 +# RUN: obj2yaml %t3 | FileCheck %s --check-prefix=EMPTY + +# EMPTY: - Name: .llvm_addrsig +# EMPTY-NEXT: Type: SHT_LLVM_ADDRSIG +# EMPTY-NEXT: Link: .symtab +# EMPTY-NEXT: Symbols: [] + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "" diff --git a/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml new file mode 100644 index 000000000000..98496d30fa84 --- /dev/null +++ b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml @@ -0,0 +1,213 @@ +## Check how yaml2obj produces SHT_LLVM_ADDRSIG sections. + +## Check we can describe SHT_LLVM_ADDRSIG using the "Symbols" tag. We can define +## symbols either using names or indexes. 
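## NOTE (editorial worked example): yaml2obj writes each entry as a ULEB128-
## encoded symbol index (see the AddrsigSection emitter in
## lib/ObjectYAML/ELFEmitter.cpp above, which calls encodeULEB128). The
## docnum=4 section data checked below, "00 FF01 C4E6888901 FFFFFFFF0F",
## decodes entry by entry as:
##   0x00000000 -> 00
##   0x000000FF -> FF 01
##   0x11223344 -> C4 E6 88 89 01   (7-bit groups, LSB first, MSB = "more")
##   0xFFFFFFFF -> FF FF FF FF 0F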
+ +# RUN: yaml2obj --docnum=1 %s -o %t1 +# RUN: llvm-readobj --sections --section-data %t1 | FileCheck %s --check-prefix=SYMBOLS + +# SYMBOLS: Section { +# SYMBOLS: Index: 1 +# SYMBOLS-NEXT: Name: .llvm_addrsig +# SYMBOLS-NEXT: Type: SHT_LLVM_ADDRSIG +# SYMBOLS-NEXT: Flags [ +# SYMBOLS-NEXT: ] +# SYMBOLS-NEXT: Address: 0x0 +# SYMBOLS-NEXT: Offset: 0x40 +# SYMBOLS-NEXT: Size: 4 +# SYMBOLS-NEXT: Link: 2 +# SYMBOLS-NEXT: Info: 0 +# SYMBOLS-NEXT: AddressAlignment: 0 +# SYMBOLS-NEXT: EntrySize: 0 +# SYMBOLS-NEXT: SectionData ( +# SYMBOLS-NEXT: 0000: 01020102 +# SYMBOLS-NEXT: ) +# SYMBOLS-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar + - Index: 1 + - Index: 2 +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: bar + Type: STT_FUNC + Binding: STB_GLOBAL + +## We can't specify both "Index" and "Name" when defining a symbol. + +# RUN: not yaml2obj --docnum=2 %s 2>&1 | FileCheck %s --check-prefix=INDEX-NAME + +# INDEX-NAME: error: "Index" and "Name" cannot be used together when defining a symbol + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + Index: 1 +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + +## Check we report an error if an unknown symbol is referenced in the +## SHT_LLVM_ADDRSIG section description. + +# RUN: not yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-UNKNOWN + +# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'foo' by YAML section '.llvm_addrsig' +# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'bar' by YAML section '.llvm_addrsig' + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar + +## Check we can specify any arbitrary symbol indices. + +# RUN: yaml2obj --docnum=4 %s -o %t4 +# RUN: llvm-readobj --sections --section-data %t4 | FileCheck %s --check-prefix=SYMBOL-INDEX + +# SYMBOL-INDEX: Type: SHT_LLVM_ADDRSIG +# SYMBOL-INDEX: SectionData ( +# SYMBOL-INDEX-NEXT: 0000: 00FF01C4 E6888901 FFFFFFFF 0F +# SYMBOL-INDEX-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 0 + - Index: 255 + - Index: 0x11223344 +## 0xFFFFFFFF is a maximum allowed index value. + - Index: 0xFFFFFFFF + +## Check that the maximum symbol index size is 32 bits. + +# RUN: not yaml2obj --docnum=5 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-INDEX-OVERFLOW + +# SYMBOL-INDEX-OVERFLOW: error: out of range hex32 number + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 0x1122334455 + +## Check we can use the "Content" tag to specify any data for SHT_LLVM_ADDRSIG sections. 
+ +# RUN: yaml2obj --docnum=6 %s -o %t6 +# RUN: llvm-readobj --sections --section-data %t6 | FileCheck %s --check-prefix=CONTENT + +# CONTENT: Type: SHT_LLVM_ADDRSIG +# CONTENT: Size: +# CONTENT-SAME: 5 +# CONTENT: SectionData ( +# CONTENT-NEXT: 0000: 11223344 55 +# CONTENT-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "1122334455" + +## Either "Content" or "Symbols" must be specifed for SHT_LLVM_ADDRSIG sections. + +# RUN: not yaml2obj --docnum=7 %s 2>&1 | FileCheck %s --check-prefix=NO-TAGS + +# NO-TAGS: error: one of "Symbols" or "Content" must be specified + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + +## "Content" and "Symbols" cannot be used together to describe the SHT_LLVM_ADDRSIG section. + +# RUN: not yaml2obj --docnum=8 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS + +# CONTENT-SYMBOLS: error: "Content" and "Symbols" cannot be used together + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "" + Symbols: + +## Check we can set an arbitrary sh_link value for SHT_LLVM_ADDRSIG sections. + +# RUN: yaml2obj --docnum=9 %s -o %t9 +# RUN: llvm-readobj --sections %t9 | FileCheck %s --check-prefix=LINK + +# LINK: Name: .llvm_addrsig +# LINK: Link: +# LINK-SAME: 123{{$}} + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Link: 123 + Content: "" diff --git a/tools/obj2yaml/elf2yaml.cpp b/tools/obj2yaml/elf2yaml.cpp index c4b6eb79d18c..fa6a6a4b31e8 100644 --- a/tools/obj2yaml/elf2yaml.cpp +++ b/tools/obj2yaml/elf2yaml.cpp @@ -41,6 +41,7 @@ class ELFDumper { Expected getUniquedSymbolName(const Elf_Sym *Sym, StringRef StrTable, const Elf_Shdr *SymTab); + Expected getSymbolName(uint32_t SymtabNdx, uint32_t SymbolNdx); const object::ELFFile &Obj; ArrayRef ShndxTable; @@ -56,6 +57,7 @@ class ELFDumper { Error dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab, ELFYAML::Relocation &R); + Expected dumpAddrsigSection(const Elf_Shdr *Shdr); Expected dumpDynamicSection(const Elf_Shdr *Shdr); Expected dumpRelocSection(const Elf_Shdr *Shdr); Expected @@ -284,6 +286,13 @@ template Expected ELFDumper::dump() { Y->Sections.emplace_back(*SecOrErr); break; } + case ELF::SHT_LLVM_ADDRSIG: { + Expected SecOrErr = dumpAddrsigSection(&Sec); + if (!SecOrErr) + return SecOrErr.takeError(); + Y->Sections.emplace_back(*SecOrErr); + break; + } case ELF::SHT_NULL: { // We only dump the SHT_NULL section at index 0 when it // has at least one non-null field, because yaml2obj @@ -519,6 +528,45 @@ ELFDumper::dumpStackSizesSection(const Elf_Shdr *Shdr) { return S.release(); } +template +Expected +ELFDumper::dumpAddrsigSection(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); + + auto ContentOrErr = Obj.getSectionContents(Shdr); + if (!ContentOrErr) + return ContentOrErr.takeError(); + + ArrayRef Content = *ContentOrErr; + DataExtractor::Cursor Cur(0); + DataExtractor Data(Content, Obj.isLE(), /*AddressSize=*/0); + std::vector Symbols; + while (Cur && Cur.tell() < Content.size()) { + uint64_t SymNdx = Data.getULEB128(Cur); + if (!Cur) + break; + + Expected 
SymbolName = getSymbolName(Shdr->sh_link, SymNdx); + if (!SymbolName || SymbolName->empty()) { + consumeError(SymbolName.takeError()); + Symbols.emplace_back(SymNdx); + continue; + } + + Symbols.emplace_back(*SymbolName); + } + + if (Cur) { + S->Symbols = std::move(Symbols); + return S.release(); + } + + S->Content = yaml::BinaryRef(Content); + return S.release(); +} + template Expected ELFDumper::dumpDynamicSection(const Elf_Shdr *Shdr) { @@ -791,25 +839,31 @@ ELFDumper::dumpVerneedSection(const Elf_Shdr *Shdr) { } template -Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { - auto S = std::make_unique(); - if (Error E = dumpCommonSection(Shdr, *S)) - return std::move(E); - - auto SymtabOrErr = Obj.getSection(Shdr->sh_link); +Expected ELFDumper::getSymbolName(uint32_t SymtabNdx, + uint32_t SymbolNdx) { + auto SymtabOrErr = Obj.getSection(SymtabNdx); if (!SymtabOrErr) return SymtabOrErr.takeError(); - // Get symbol with index sh_info which name is the signature of the group. + const Elf_Shdr *Symtab = *SymtabOrErr; - auto SymOrErr = Obj.getSymbol(Symtab, Shdr->sh_info); + auto SymOrErr = Obj.getSymbol(Symtab, SymbolNdx); if (!SymOrErr) return SymOrErr.takeError(); + auto StrTabOrErr = Obj.getStringTableForSymtab(*Symtab); if (!StrTabOrErr) return StrTabOrErr.takeError(); + return getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); +} + +template +Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); - Expected SymbolName = - getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); + // Get symbol with index sh_info. This symbol's name is the signature of the group. + Expected SymbolName = getSymbolName(Shdr->sh_link, Shdr->sh_info); if (!SymbolName) return SymbolName.takeError(); S->Signature = *SymbolName; From 12eaff5ef92dcaf33da29e1cca5ff454a59a4997 Mon Sep 17 00:00:00 2001 From: George Rimar Date: Thu, 3 Oct 2019 14:04:47 +0000 Subject: [PATCH 56/82] Revert r373598 "[yaml2obj/obj2yaml] - Add support for SHT_LLVM_ADDRSIG sections." 
It broke BB: http://lab.llvm.org:8011/builders/clang-x86_64-debian-fast/builds/18655/steps/test/logs/stdio git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373599 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ObjectYAML/ELFYAML.h | 27 +-- lib/ObjectYAML/ELFEmitter.cpp | 30 --- lib/ObjectYAML/ELFYAML.cpp | 37 --- .../tools/llvm-readobj/elf-section-types.test | 1 - .../obj2yaml/elf-llvm-addrsig-section.yaml | 98 -------- .../yaml2obj/elf-llvm-addrsig-section.yaml | 213 ------------------ tools/obj2yaml/elf2yaml.cpp | 74 +----- 7 files changed, 11 insertions(+), 469 deletions(-) delete mode 100644 test/tools/obj2yaml/elf-llvm-addrsig-section.yaml delete mode 100644 test/tools/yaml2obj/elf-llvm-addrsig-section.yaml diff --git a/include/llvm/ObjectYAML/ELFYAML.h b/include/llvm/ObjectYAML/ELFYAML.h index 592b5021bd69..1662d06bf91e 100644 --- a/include/llvm/ObjectYAML/ELFYAML.h +++ b/include/llvm/ObjectYAML/ELFYAML.h @@ -137,8 +137,7 @@ struct Section { StackSizes, SymtabShndxSection, Symver, - MipsABIFlags, - Addrsig + MipsABIFlags }; SectionKind Kind; StringRef Name; @@ -257,25 +256,6 @@ struct VerneedSection : Section { } }; -struct AddrsigSymbol { - AddrsigSymbol(StringRef N) : Name(N), Index(None) {} - AddrsigSymbol(llvm::yaml::Hex32 Ndx) : Name(None), Index(Ndx) {} - AddrsigSymbol() : Name(None), Index(None) {} - - Optional Name; - Optional Index; -}; - -struct AddrsigSection : Section { - Optional Content; - Optional> Symbols; - - AddrsigSection() : Section(SectionKind::Addrsig) {} - static bool classof(const Section *S) { - return S->Kind == SectionKind::Addrsig; - } -}; - struct SymverSection : Section { std::vector Entries; @@ -382,7 +362,6 @@ struct Object { } // end namespace ELFYAML } // end namespace llvm -LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::AddrsigSymbol) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::StackSizeEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::DynamicEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::ProgramHeader) @@ -539,10 +518,6 @@ template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::VernauxEntry &E); }; -template <> struct MappingTraits { - static void mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym); -}; - template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::Relocation &Rel); }; diff --git a/lib/ObjectYAML/ELFEmitter.cpp b/lib/ObjectYAML/ELFEmitter.cpp index 3f3b27c5bfad..c85cf4c924f0 100644 --- a/lib/ObjectYAML/ELFEmitter.cpp +++ b/lib/ObjectYAML/ELFEmitter.cpp @@ -174,10 +174,6 @@ template class ELFState { void writeSectionContent(Elf_Shdr &SHeader, const ELFYAML::HashSection &Section, ContiguousBlobAccumulator &CBA); - void writeSectionContent(Elf_Shdr &SHeader, - const ELFYAML::AddrsigSection &Section, - ContiguousBlobAccumulator &CBA); - ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH); public: @@ -427,8 +423,6 @@ void ELFState::initSectionHeaders(std::vector &SHeaders, writeSectionContent(SHeader, *S, CBA); } else if (auto S = dyn_cast(Sec)) { writeSectionContent(SHeader, *S, CBA); - } else if (auto S = dyn_cast(Sec)) { - writeSectionContent(SHeader, *S, CBA); } else { llvm_unreachable("Unknown section type"); } @@ -996,30 +990,6 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, Section.Content->writeAsBinary(OS); } -template -void ELFState::writeSectionContent(Elf_Shdr &SHeader, - const ELFYAML::AddrsigSection &Section, - ContiguousBlobAccumulator &CBA) { - raw_ostream &OS = - CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); - - unsigned Link = 0; - if 
(Section.Link.empty() && SN2I.lookup(".symtab", Link)) - SHeader.sh_link = Link; - - if (Section.Content) { - SHeader.sh_size = writeContent(OS, Section.Content, None); - return; - } - - for (const ELFYAML::AddrsigSymbol &Sym : *Section.Symbols) { - uint64_t Val = - Sym.Name ? toSymbolIndex(*Sym.Name, Section.Name, /*IsDynamic=*/false) - : (uint32_t)*Sym.Index; - SHeader.sh_size += encodeULEB128(Val, OS); - } -} - template void ELFState::buildSectionIndex() { for (unsigned I = 0, E = Doc.Sections.size(); I != E; ++I) { StringRef Name = Doc.Sections[I]->Name; diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index e295a000ef8a..0dd6854cfee0 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -1071,12 +1071,6 @@ static void sectionMapping(IO &IO, ELFYAML::SymtabShndxSection &Section) { IO.mapRequired("Entries", Section.Entries); } -static void sectionMapping(IO &IO, ELFYAML::AddrsigSection &Section) { - commonSectionMapping(IO, Section); - IO.mapOptional("Content", Section.Content); - IO.mapOptional("Symbols", Section.Symbols); -} - void MappingTraits::mapping( IO &IO, ELFYAML::SectionOrType §ionOrType) { IO.mapRequired("SectionOrType", sectionOrType.sectionNameOrType); @@ -1167,11 +1161,6 @@ void MappingTraits>::mapping( Section.reset(new ELFYAML::SymtabShndxSection()); sectionMapping(IO, *cast(Section.get())); break; - case ELF::SHT_LLVM_ADDRSIG: - if (!IO.outputting()) - Section.reset(new ELFYAML::AddrsigSection()); - sectionMapping(IO, *cast(Section.get())); - break; default: if (!IO.outputting()) { StringRef Name; @@ -1244,26 +1233,6 @@ StringRef MappingTraits>::validate( return {}; } - if (const auto *Sec = dyn_cast(Section.get())) { - if (!Sec->Symbols && !Sec->Content) - return "one of \"Symbols\" or \"Content\" must be specified"; - - if (Sec->Content) { - if (Sec->Symbols) - return "\"Content\" and \"Symbols\" cannot be used together"; - return {}; - } - - if (!Sec->Symbols) - return {}; - - for (const ELFYAML::AddrsigSymbol &AS : *Sec->Symbols) - if (AS.Index && AS.Name) - return "\"Index\" and \"Name\" cannot be used together when defining a " - "symbol"; - return {}; - } - return {}; } @@ -1371,12 +1340,6 @@ void MappingTraits::mapping(IO &IO, ELFYAML::Object &Object) { IO.setContext(nullptr); } -void MappingTraits::mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym) { - assert(IO.getContext() && "The IO context is not initialized"); - IO.mapOptional("Name", Sym.Name); - IO.mapOptional("Index", Sym.Index); -} - LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_AFL_REG) LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_ABI_FP) LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_EXT) diff --git a/test/tools/llvm-readobj/elf-section-types.test b/test/tools/llvm-readobj/elf-section-types.test index 20b881249c7f..aad9f43c8a30 100644 --- a/test/tools/llvm-readobj/elf-section-types.test +++ b/test/tools/llvm-readobj/elf-section-types.test @@ -196,7 +196,6 @@ Sections: Type: SHT_LLVM_CALL_GRAPH_PROFILE - Name: llvm_addrsig Type: SHT_LLVM_ADDRSIG - Symbols: - Name: .deplibs Type: SHT_LLVM_DEPENDENT_LIBRARIES - Name: .llvm_sympart.f diff --git a/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml b/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml deleted file mode 100644 index 6f21c3212bd9..000000000000 --- a/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml +++ /dev/null @@ -1,98 +0,0 @@ -## Check how obj2yaml dumps the SHT_LLVM_ADDRSIG section. - -## Check that when possible obj2yaml tries to produce the "Name" tag when -## dumping entries of the SHT_LLVM_ADDRSIG section. 
It falls back to producing -## the "Index" tag when it can't match a symbol index with a symbol table entry. - -# RUN: yaml2obj --docnum=1 %s -o %t1 -# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=NAME - -# NAME: - Name: .llvm_addrsig -# NAME-NEXT: Type: SHT_LLVM_ADDRSIG -# NAME-NEXT: Link: .symtab -# NAME-NEXT: Symbols: -# NAME-NEXT: - Name: foo -# NAME-NEXT: - Name: bar -# NAME-NEXT: - Index: 0x00000003 -# NAME-NEXT: - Index: 0xFFFFFFFF -# NAME: - Name: .llvm_addrsig_unlinked -# NAME-NEXT: Type: SHT_LLVM_ADDRSIG -# NAME-NEXT: Symbols: -# NAME-NEXT: - Index: 0x00000001 -# NAME-NEXT: - Index: 0x00000002 -# NAME-NEXT: - Index: 0x00000003 -# NAME-NEXT: - Index: 0xFFFFFFFF - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Symbols: - - Index: 1 - - Index: 2 - - Index: 3 - - Index: 0xFFFFFFFF - - Name: .llvm_addrsig_unlinked - Type: SHT_LLVM_ADDRSIG - Link: 0 - Symbols: - - Index: 1 - - Index: 2 - - Index: 3 - - Index: 0xFFFFFFFF -Symbols: - - Name: foo - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: bar - Type: STT_FUNC - Binding: STB_GLOBAL - -## Check that obj2yaml dumps the SHT_LLVM_ADDRSIG section -## data using the "Content" tag when at least one of the entries is broken, -## e.g. because the entry contains a malformed uleb128 value. - -# RUN: yaml2obj --docnum=2 %s -o %t2 -# RUN: obj2yaml %t2 | FileCheck %s --check-prefix=INVALID-ENTRY - -# INVALID-ENTRY: - Name: .llvm_addrsig -# INVALID-ENTRY-NEXT: Type: SHT_LLVM_ADDRSIG -# INVALID-ENTRY-NEXT: Link: .symtab -# INVALID-ENTRY-NEXT: Content: FFFFFFFFFF - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Content: "FFFFFFFFFF" - -## obj2yaml produces a "Symbols" tag when describing an empty SHT_LLVM_ADDRSIG section. - -# RUN: yaml2obj --docnum=3 %s -o %t3 -# RUN: obj2yaml %t3 | FileCheck %s --check-prefix=EMPTY - -# EMPTY: - Name: .llvm_addrsig -# EMPTY-NEXT: Type: SHT_LLVM_ADDRSIG -# EMPTY-NEXT: Link: .symtab -# EMPTY-NEXT: Symbols: [] - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Content: "" diff --git a/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml deleted file mode 100644 index 98496d30fa84..000000000000 --- a/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml +++ /dev/null @@ -1,213 +0,0 @@ -## Check how yaml2obj produces SHT_LLVM_ADDRSIG sections. - -## Check we can describe SHT_LLVM_ADDRSIG using the "Symbols" tag. We can define -## symbols either using names or indexes. 
- -# RUN: yaml2obj --docnum=1 %s -o %t1 -# RUN: llvm-readobj --sections --section-data %t1 | FileCheck %s --check-prefix=SYMBOLS - -# SYMBOLS: Section { -# SYMBOLS: Index: 1 -# SYMBOLS-NEXT: Name: .llvm_addrsig -# SYMBOLS-NEXT: Type: SHT_LLVM_ADDRSIG -# SYMBOLS-NEXT: Flags [ -# SYMBOLS-NEXT: ] -# SYMBOLS-NEXT: Address: 0x0 -# SYMBOLS-NEXT: Offset: 0x40 -# SYMBOLS-NEXT: Size: 4 -# SYMBOLS-NEXT: Link: 2 -# SYMBOLS-NEXT: Info: 0 -# SYMBOLS-NEXT: AddressAlignment: 0 -# SYMBOLS-NEXT: EntrySize: 0 -# SYMBOLS-NEXT: SectionData ( -# SYMBOLS-NEXT: 0000: 01020102 -# SYMBOLS-NEXT: ) -# SYMBOLS-NEXT: } - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Symbols: - - Name: foo - - Name: bar - - Index: 1 - - Index: 2 -Symbols: - - Name: foo - Type: STT_FUNC - Binding: STB_GLOBAL - - Name: bar - Type: STT_FUNC - Binding: STB_GLOBAL - -## We can't specify both "Index" and "Name" when defining a symbol. - -# RUN: not yaml2obj --docnum=2 %s 2>&1 | FileCheck %s --check-prefix=INDEX-NAME - -# INDEX-NAME: error: "Index" and "Name" cannot be used together when defining a symbol - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Symbols: - - Name: foo - Index: 1 -Symbols: - - Name: foo - Type: STT_FUNC - Binding: STB_GLOBAL - -## Check we report an error if an unknown symbol is referenced in the -## SHT_LLVM_ADDRSIG section description. - -# RUN: not yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-UNKNOWN - -# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'foo' by YAML section '.llvm_addrsig' -# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'bar' by YAML section '.llvm_addrsig' - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Symbols: - - Name: foo - - Name: bar - -## Check we can specify any arbitrary symbol indices. - -# RUN: yaml2obj --docnum=4 %s -o %t4 -# RUN: llvm-readobj --sections --section-data %t4 | FileCheck %s --check-prefix=SYMBOL-INDEX - -# SYMBOL-INDEX: Type: SHT_LLVM_ADDRSIG -# SYMBOL-INDEX: SectionData ( -# SYMBOL-INDEX-NEXT: 0000: 00FF01C4 E6888901 FFFFFFFF 0F -# SYMBOL-INDEX-NEXT: ) - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Symbols: - - Index: 0 - - Index: 255 - - Index: 0x11223344 -## 0xFFFFFFFF is a maximum allowed index value. - - Index: 0xFFFFFFFF - -## Check that the maximum symbol index size is 32 bits. - -# RUN: not yaml2obj --docnum=5 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-INDEX-OVERFLOW - -# SYMBOL-INDEX-OVERFLOW: error: out of range hex32 number - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Symbols: - - Index: 0x1122334455 - -## Check we can use the "Content" tag to specify any data for SHT_LLVM_ADDRSIG sections. 
- -# RUN: yaml2obj --docnum=6 %s -o %t6 -# RUN: llvm-readobj --sections --section-data %t6 | FileCheck %s --check-prefix=CONTENT - -# CONTENT: Type: SHT_LLVM_ADDRSIG -# CONTENT: Size: -# CONTENT-SAME: 5 -# CONTENT: SectionData ( -# CONTENT-NEXT: 0000: 11223344 55 -# CONTENT-NEXT: ) - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Content: "1122334455" - -## Either "Content" or "Symbols" must be specifed for SHT_LLVM_ADDRSIG sections. - -# RUN: not yaml2obj --docnum=7 %s 2>&1 | FileCheck %s --check-prefix=NO-TAGS - -# NO-TAGS: error: one of "Symbols" or "Content" must be specified - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - -## "Content" and "Symbols" cannot be used together to describe the SHT_LLVM_ADDRSIG section. - -# RUN: not yaml2obj --docnum=8 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS - -# CONTENT-SYMBOLS: error: "Content" and "Symbols" cannot be used together - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Content: "" - Symbols: - -## Check we can set an arbitrary sh_link value for SHT_LLVM_ADDRSIG sections. - -# RUN: yaml2obj --docnum=9 %s -o %t9 -# RUN: llvm-readobj --sections %t9 | FileCheck %s --check-prefix=LINK - -# LINK: Name: .llvm_addrsig -# LINK: Link: -# LINK-SAME: 123{{$}} - ---- !ELF -FileHeader: - Class: ELFCLASS64 - Data: ELFDATA2LSB - Type: ET_DYN - Machine: EM_X86_64 -Sections: - - Name: .llvm_addrsig - Type: SHT_LLVM_ADDRSIG - Link: 123 - Content: "" diff --git a/tools/obj2yaml/elf2yaml.cpp b/tools/obj2yaml/elf2yaml.cpp index fa6a6a4b31e8..c4b6eb79d18c 100644 --- a/tools/obj2yaml/elf2yaml.cpp +++ b/tools/obj2yaml/elf2yaml.cpp @@ -41,7 +41,6 @@ class ELFDumper { Expected getUniquedSymbolName(const Elf_Sym *Sym, StringRef StrTable, const Elf_Shdr *SymTab); - Expected getSymbolName(uint32_t SymtabNdx, uint32_t SymbolNdx); const object::ELFFile &Obj; ArrayRef ShndxTable; @@ -57,7 +56,6 @@ class ELFDumper { Error dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab, ELFYAML::Relocation &R); - Expected dumpAddrsigSection(const Elf_Shdr *Shdr); Expected dumpDynamicSection(const Elf_Shdr *Shdr); Expected dumpRelocSection(const Elf_Shdr *Shdr); Expected @@ -286,13 +284,6 @@ template Expected ELFDumper::dump() { Y->Sections.emplace_back(*SecOrErr); break; } - case ELF::SHT_LLVM_ADDRSIG: { - Expected SecOrErr = dumpAddrsigSection(&Sec); - if (!SecOrErr) - return SecOrErr.takeError(); - Y->Sections.emplace_back(*SecOrErr); - break; - } case ELF::SHT_NULL: { // We only dump the SHT_NULL section at index 0 when it // has at least one non-null field, because yaml2obj @@ -528,45 +519,6 @@ ELFDumper::dumpStackSizesSection(const Elf_Shdr *Shdr) { return S.release(); } -template -Expected -ELFDumper::dumpAddrsigSection(const Elf_Shdr *Shdr) { - auto S = std::make_unique(); - if (Error E = dumpCommonSection(Shdr, *S)) - return std::move(E); - - auto ContentOrErr = Obj.getSectionContents(Shdr); - if (!ContentOrErr) - return ContentOrErr.takeError(); - - ArrayRef Content = *ContentOrErr; - DataExtractor::Cursor Cur(0); - DataExtractor Data(Content, Obj.isLE(), /*AddressSize=*/0); - std::vector Symbols; - while (Cur && Cur.tell() < Content.size()) { - uint64_t SymNdx = Data.getULEB128(Cur); - if (!Cur) - break; - - Expected 
SymbolName = getSymbolName(Shdr->sh_link, SymNdx); - if (!SymbolName || SymbolName->empty()) { - consumeError(SymbolName.takeError()); - Symbols.emplace_back(SymNdx); - continue; - } - - Symbols.emplace_back(*SymbolName); - } - - if (Cur) { - S->Symbols = std::move(Symbols); - return S.release(); - } - - S->Content = yaml::BinaryRef(Content); - return S.release(); -} - template Expected ELFDumper::dumpDynamicSection(const Elf_Shdr *Shdr) { @@ -839,31 +791,25 @@ ELFDumper::dumpVerneedSection(const Elf_Shdr *Shdr) { } template -Expected ELFDumper::getSymbolName(uint32_t SymtabNdx, - uint32_t SymbolNdx) { - auto SymtabOrErr = Obj.getSection(SymtabNdx); +Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); + + auto SymtabOrErr = Obj.getSection(Shdr->sh_link); if (!SymtabOrErr) return SymtabOrErr.takeError(); - + // Get symbol with index sh_info which name is the signature of the group. const Elf_Shdr *Symtab = *SymtabOrErr; - auto SymOrErr = Obj.getSymbol(Symtab, SymbolNdx); + auto SymOrErr = Obj.getSymbol(Symtab, Shdr->sh_info); if (!SymOrErr) return SymOrErr.takeError(); - auto StrTabOrErr = Obj.getStringTableForSymtab(*Symtab); if (!StrTabOrErr) return StrTabOrErr.takeError(); - return getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); -} - -template -Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { - auto S = std::make_unique(); - if (Error E = dumpCommonSection(Shdr, *S)) - return std::move(E); - // Get symbol with index sh_info. This symbol's name is the signature of the group. - Expected SymbolName = getSymbolName(Shdr->sh_link, Shdr->sh_info); + Expected SymbolName = + getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); if (!SymbolName) return SymbolName.takeError(); S->Signature = *SymbolName; From cf3dfcb56e9874f4deef3a9801f29ee5220be743 Mon Sep 17 00:00:00 2001 From: Ehsan Amiri Date: Thu, 3 Oct 2019 14:19:55 +0000 Subject: [PATCH 57/82] [AArch64][SVE] Adding patterns for floating point SVE add instructions. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373600 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AArch64/AArch64SVEInstrInfo.td | 12 ++++++------
 lib/Target/AArch64/SVEInstrFormats.td     | 14 ++++++++------
 2 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/lib/Target/AArch64/AArch64SVEInstrInfo.td b/lib/Target/AArch64/AArch64SVEInstrInfo.td
index d46e905d0fe7..1657a76a685c 100644
--- a/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -138,12 +138,12 @@ let Predicates = [HasSVE] in {
   defm FDIVR_ZPmZ  : sve_fp_2op_p_zds<0b1100, "fdivr">;
   defm FDIV_ZPmZ   : sve_fp_2op_p_zds<0b1101, "fdiv">;
 
-  defm FADD_ZZZ    : sve_fp_3op_u_zd<0b000, "fadd">;
-  defm FSUB_ZZZ    : sve_fp_3op_u_zd<0b001, "fsub">;
-  defm FMUL_ZZZ    : sve_fp_3op_u_zd<0b010, "fmul">;
-  defm FTSMUL_ZZZ  : sve_fp_3op_u_zd<0b011, "ftsmul">;
-  defm FRECPS_ZZZ  : sve_fp_3op_u_zd<0b110, "frecps">;
-  defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts">;
+  defm FADD_ZZZ    : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
+  defm FSUB_ZZZ    : sve_fp_3op_u_zd<0b001, "fsub", null_frag>;
+  defm FMUL_ZZZ    : sve_fp_3op_u_zd<0b010, "fmul", null_frag>;
+  defm FTSMUL_ZZZ  : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>;
+  defm FRECPS_ZZZ  : sve_fp_3op_u_zd<0b110, "frecps", null_frag>;
+  defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>;
 
   defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
 
diff --git a/lib/Target/AArch64/SVEInstrFormats.td b/lib/Target/AArch64/SVEInstrFormats.td
index 1a9784065d5b..e2bd47ee6ae3 100644
--- a/lib/Target/AArch64/SVEInstrFormats.td
+++ b/lib/Target/AArch64/SVEInstrFormats.td
@@ -1219,10 +1219,12 @@ multiclass sve_fp_ftmad<string asm> {
 //===----------------------------------------------------------------------===//
 
 class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
-                      ZPRRegOp zprty>
+                      ZPRRegOp zprty,
+                      ValueType vt, ValueType vt2, SDPatternOperator op>
 : I<(outs zprty:$Zd), (ins zprty:$Zn, zprty:$Zm),
   asm, "\t$Zd, $Zn, $Zm",
-  "", []>, Sched<[]> {
+  "",
+  [(set (vt zprty:$Zd), (op (vt zprty:$Zn), (vt2 zprty:$Zm)))]>, Sched<[]> {
   bits<5> Zd;
   bits<5> Zm;
   bits<5> Zn;
@@ -1236,10 +1238,10 @@ class sve_fp_3op_u_zd<bits<2> sz, bits<3> opc, string asm,
   let Inst{4-0} = Zd;
 }
 
-multiclass sve_fp_3op_u_zd<bits<3> opc, string asm> {
-  def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16>;
-  def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32>;
-  def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_3op_u_zd<bits<3> opc, string asm, SDPatternOperator op> {
+  def _H : sve_fp_3op_u_zd<0b01, opc, asm, ZPR16, nxv8f16, nxv8f16, op>;
+  def _S : sve_fp_3op_u_zd<0b10, opc, asm, ZPR32, nxv4f32, nxv4f32, op>;
+  def _D : sve_fp_3op_u_zd<0b11, opc, asm, ZPR64, nxv2f64, nxv2f64, op>;
 }
 
 //===----------------------------------------------------------------------===//

From fe9dd78fd0c039832b0cd57636876280caf1f95d Mon Sep 17 00:00:00 2001
From: Bardia Mahjour
Date: Thu, 3 Oct 2019 14:20:50 +0000
Subject: [PATCH 58/82] [PGO] Refactor Value Profiling into a plugin-based
 oracle and create a well-defined API for the plugins.

Summary:
This PR creates a utility class called ValueProfileCollector that tells
PGOInstrumentationGen and PGOInstrumentationUse what to value-profile and
where to attach the profile metadata. It then refactors logic scattered in
PGOInstrumentation.cpp into two plugins that plug into the
ValueProfileCollector.
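For illustration, here is a minimal sketch of the plugin contract this
introduces, inferred from the PluginChain documentation in
ValueProfileCollector.cpp below; the class name and the scan logic are
illustrative only (the committed plugins live in ValueProfilePlugins.inc):

  // Hypothetical plugin. Every plugin exposes a static Kind, a Function&
  // constructor, and run(), which appends its candidates to the vector.
  // Assumes llvm/IR/InstIterator.h for instructions(F).
  class ExampleIndirectCallPlugin {
    Function &F;

  public:
    static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;

    ExampleIndirectCallPlugin(Function &Fn) : F(Fn) {}

    void run(std::vector<CandidateInfo> &Candidates) {
      for (Instruction &I : instructions(F)) {
        auto *CI = dyn_cast<CallInst>(&I);
        if (!CI || CI->getCalledFunction()) // keep only indirect calls
          continue;
        // Profile the callee pointer; instrument and annotate at the call.
        Candidates.push_back(CandidateInfo{CI->getCalledValue(), CI, CI});
      }
    }
  };

PluginChain then routes each get(Kind) query to the plugin whose static Kind
matches, so supporting a new kind of value profiling amounts to writing one
such class and listing it in ValueProfilePlugins.inc.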
Authored By: Wael Yehia Reviewer: davidxl, tejohnson, xur Reviewed By: davidxl, tejohnson, xur Subscribers: llvm-commits Tag: #llvm Differential Revision: https://reviews.llvm.org/D67920 Patch By Wael Yehia git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373601 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Transforms/Instrumentation/CMakeLists.txt | 1 + .../Instrumentation/PGOInstrumentation.cpp | 167 +++++------------- .../Instrumentation/ValueProfileCollector.cpp | 78 ++++++++ .../Instrumentation/ValueProfileCollector.h | 79 +++++++++ .../Instrumentation/ValueProfilePlugins.inc | 75 ++++++++ 5 files changed, 280 insertions(+), 120 deletions(-) create mode 100644 lib/Transforms/Instrumentation/ValueProfileCollector.cpp create mode 100644 lib/Transforms/Instrumentation/ValueProfileCollector.h create mode 100644 lib/Transforms/Instrumentation/ValueProfilePlugins.inc diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 78b697f7f940..22190ad7a0ae 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -14,6 +14,7 @@ add_llvm_library(LLVMInstrumentation PGOMemOPSizeOpt.cpp PoisonChecking.cpp SanitizerCoverage.cpp + ValueProfileCollector.cpp ThreadSanitizer.cpp HWAddressSanitizer.cpp diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index e776d59cccb5..3862f19ab7ab 100644 --- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -48,6 +48,7 @@ //===----------------------------------------------------------------------===// #include "CFGMST.h" +#include "ValueProfileCollector.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -61,7 +62,6 @@ #include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/CFG.h" -#include "llvm/Analysis/IndirectCallVisitor.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ProfileSummaryInfo.h" @@ -121,6 +121,7 @@ using namespace llvm; using ProfileCount = Function::ProfileCount; +using VPCandidateInfo = ValueProfileCollector::CandidateInfo; #define DEBUG_TYPE "pgo-instrumentation" @@ -287,6 +288,11 @@ static std::string getBranchCondString(Instruction *TI) { return result; } +static const char *ValueProfKindDescr[] = { +#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, +#include "llvm/ProfileData/InstrProfData.inc" +}; + namespace { /// The select instruction visitor plays three roles specified @@ -349,50 +355,6 @@ struct SelectInstVisitor : public InstVisitor { unsigned getNumOfSelectInsts() const { return NSIs; } }; -/// Instruction Visitor class to visit memory intrinsic calls. -struct MemIntrinsicVisitor : public InstVisitor { - Function &F; - unsigned NMemIs = 0; // Number of memIntrinsics instrumented. - VisitMode Mode = VM_counting; // Visiting mode. - unsigned CurCtrId = 0; // Current counter index. 
- unsigned TotalNumCtrs = 0; // Total number of counters - GlobalVariable *FuncNameVar = nullptr; - uint64_t FuncHash = 0; - PGOUseFunc *UseFunc = nullptr; - std::vector Candidates; - - MemIntrinsicVisitor(Function &Func) : F(Func) {} - - void countMemIntrinsics(Function &Func) { - NMemIs = 0; - Mode = VM_counting; - visit(Func); - } - - void instrumentMemIntrinsics(Function &Func, unsigned TotalNC, - GlobalVariable *FNV, uint64_t FHash) { - Mode = VM_instrument; - TotalNumCtrs = TotalNC; - FuncHash = FHash; - FuncNameVar = FNV; - visit(Func); - } - - std::vector findMemIntrinsics(Function &Func) { - Candidates.clear(); - Mode = VM_annotate; - visit(Func); - return Candidates; - } - - // Visit the IR stream and annotate all mem intrinsic call instructions. - void instrumentOneMemIntrinsic(MemIntrinsic &MI); - - // Visit \p MI instruction and perform tasks according to visit mode. - void visitMemIntrinsic(MemIntrinsic &SI); - - unsigned getNumOfMemIntrinsics() const { return NMemIs; } -}; class PGOInstrumentationGenLegacyPass : public ModulePass { public: @@ -564,13 +526,14 @@ template class FuncPGOInstrumentation { // A map that stores the Comdat group in function F. std::unordered_multimap &ComdatMembers; + ValueProfileCollector VPC; + void computeCFGHash(); void renameComdatFunction(); public: - std::vector> ValueSites; + std::vector> ValueSites; SelectInstVisitor SIVisitor; - MemIntrinsicVisitor MIVisitor; std::string FuncName; GlobalVariable *FuncNameVar; @@ -605,23 +568,21 @@ template class FuncPGOInstrumentation { std::unordered_multimap &ComdatMembers, bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr, BlockFrequencyInfo *BFI = nullptr, bool IsCS = false) - : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), - ValueSites(IPVK_Last + 1), SIVisitor(Func), MIVisitor(Func), - MST(F, BPI, BFI) { + : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func), + ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) { // This should be done before CFG hash computation. 
SIVisitor.countSelects(Func); - MIVisitor.countMemIntrinsics(Func); + ValueSites[IPVK_MemOPSize] = VPC.get(IPVK_MemOPSize); if (!IsCS) { NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfPGOBB += MST.BBInfos.size(); - ValueSites[IPVK_IndirectCallTarget] = findIndirectCalls(Func); + ValueSites[IPVK_IndirectCallTarget] = VPC.get(IPVK_IndirectCallTarget); } else { NumOfCSPGOSelectInsts += SIVisitor.getNumOfSelectInsts(); - NumOfCSPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics(); + NumOfCSPGOMemIntrinsics += ValueSites[IPVK_MemOPSize].size(); NumOfCSPGOBB += MST.BBInfos.size(); } - ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func); FuncName = getPGOFuncName(F); computeCFGHash(); @@ -875,28 +836,36 @@ static void instrumentOneFunc( if (DisableValueProfiling) return; - unsigned NumIndirectCalls = 0; - for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) { - CallSite CS(I); - Value *Callee = CS.getCalledValue(); - LLVM_DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = " - << NumIndirectCalls << "\n"); - IRBuilder<> Builder(I); - assert(Builder.GetInsertPoint() != I->getParent()->end() && - "Cannot get the Instrumentation point"); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), - Builder.getInt64(FuncInfo.FunctionHash), - Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()), - Builder.getInt32(IPVK_IndirectCallTarget), - Builder.getInt32(NumIndirectCalls++)}); - } - NumOfPGOICall += NumIndirectCalls; + NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size(); + + // For each VP Kind, walk the VP candidates and instrument each one. + for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) { + unsigned SiteIndex = 0; + if (Kind == IPVK_MemOPSize && !PGOInstrMemOP) + continue; - // Now instrument memop intrinsic calls. 
- FuncInfo.MIVisitor.instrumentMemIntrinsics( - F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash); + for (VPCandidateInfo Cand : FuncInfo.ValueSites[Kind]) { + LLVM_DEBUG(dbgs() << "Instrument one VP " << ValueProfKindDescr[Kind] + << " site: CallSite Index = " << SiteIndex << "\n"); + + IRBuilder<> Builder(Cand.InsertPt); + assert(Builder.GetInsertPoint() != Cand.InsertPt->getParent()->end() && + "Cannot get the Instrumentation point"); + + Value *ToProfile = nullptr; + if (Cand.V->getType()->isIntegerTy()) + ToProfile = Builder.CreateZExtOrTrunc(Cand.V, Builder.getInt64Ty()); + else if (Cand.V->getType()->isPointerTy()) + ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty()); + assert(ToProfile && "value profiling Value is of unexpected type"); + + Builder.CreateCall( + Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), + {ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy), + Builder.getInt64(FuncInfo.FunctionHash), ToProfile, + Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)}); + } + } // IPVK_First <= Kind <= IPVK_Last } namespace { @@ -1429,43 +1398,6 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) { llvm_unreachable("Unknown visiting mode"); } -void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) { - Module *M = F.getParent(); - IRBuilder<> Builder(&MI); - Type *Int64Ty = Builder.getInt64Ty(); - Type *I8PtrTy = Builder.getInt8PtrTy(); - Value *Length = MI.getLength(); - assert(!isa(Length)); - Builder.CreateCall( - Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile), - {ConstantExpr::getBitCast(FuncNameVar, I8PtrTy), - Builder.getInt64(FuncHash), Builder.CreateZExtOrTrunc(Length, Int64Ty), - Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)}); - ++CurCtrId; -} - -void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) { - if (!PGOInstrMemOP) - return; - Value *Length = MI.getLength(); - // Not instrument constant length calls. - if (dyn_cast(Length)) - return; - - switch (Mode) { - case VM_counting: - NMemIs++; - return; - case VM_instrument: - instrumentOneMemIntrinsic(MI); - return; - case VM_annotate: - Candidates.push_back(&MI); - return; - } - llvm_unreachable("Unknown visiting mode"); -} - // Traverse all valuesites and annotate the instructions for all value kind. void PGOUseFunc::annotateValueSites() { if (DisableValueProfiling) @@ -1478,11 +1410,6 @@ void PGOUseFunc::annotateValueSites() { annotateValueSites(Kind); } -static const char *ValueProfKindDescr[] = { -#define VALUE_PROF_KIND(Enumerator, Value, Descr) Descr, -#include "llvm/ProfileData/InstrProfData.inc" -}; - // Annotate the instructions for a specific value kind. void PGOUseFunc::annotateValueSites(uint32_t Kind) { assert(Kind <= IPVK_Last); @@ -1501,11 +1428,11 @@ void PGOUseFunc::annotateValueSites(uint32_t Kind) { return; } - for (auto &I : ValueSites) { + for (VPCandidateInfo &I : ValueSites) { LLVM_DEBUG(dbgs() << "Read one value site profile (kind = " << Kind << "): Index = " << ValueSiteIndex << " out of " << NumValueSites << "\n"); - annotateValueSite(*M, *I, ProfileRecord, + annotateValueSite(*M, *I.AnnotatedInst, ProfileRecord, static_cast(Kind), ValueSiteIndex, Kind == IPVK_MemOPSize ? 
MaxNumMemOPAnnotations : MaxNumAnnotations); diff --git a/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/lib/Transforms/Instrumentation/ValueProfileCollector.cpp new file mode 100644 index 000000000000..604726d4f40f --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfileCollector.cpp @@ -0,0 +1,78 @@ +//===- ValueProfileCollector.cpp - determine what to value profile --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The implementation of the ValueProfileCollector via ValueProfileCollectorImpl +// +//===----------------------------------------------------------------------===// + +#include "ValueProfilePlugins.inc" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/InitializePasses.h" + +#include + +using namespace llvm; + +namespace { + +/// A plugin-based class that takes an arbitrary number of Plugin types. +/// Each plugin type must satisfy the following API: +/// 1) the constructor must take a `Function &f`. Typically, the plugin would +/// scan the function looking for candidates. +/// 2) contain a member function with the following signature and name: +/// void run(std::vector &Candidates); +/// such that the plugin would append its result into the vector parameter. +/// +/// Plugins are defined in ValueProfilePlugins.inc +template class PluginChain; + +/// The type PluginChainFinal is the final chain of plugins that will be used by +/// ValueProfileCollectorImpl. +using PluginChainFinal = PluginChain; + +template <> class PluginChain<> { +public: + PluginChain(Function &F) {} + void get(InstrProfValueKind K, std::vector &Candidates) {} +}; + +template +class PluginChain : public PluginChain { + PluginT Plugin; + using Base = PluginChain; + +public: + PluginChain(Function &F) : PluginChain(F), Plugin(F) {} + + void get(InstrProfValueKind K, std::vector &Candidates) { + if (K == PluginT::Kind) + Plugin.run(Candidates); + Base::get(K, Candidates); + } +}; + +} // end anonymous namespace + +/// ValueProfileCollectorImpl inherits the API of PluginChainFinal. +class ValueProfileCollector::ValueProfileCollectorImpl : public PluginChainFinal { +public: + using PluginChainFinal::PluginChainFinal; +}; + +ValueProfileCollector::ValueProfileCollector(Function &F) + : PImpl(new ValueProfileCollectorImpl(F)) {} + +ValueProfileCollector::~ValueProfileCollector() = default; + +std::vector +ValueProfileCollector::get(InstrProfValueKind Kind) const { + std::vector Result; + PImpl->get(Kind, Result); + return Result; +} diff --git a/lib/Transforms/Instrumentation/ValueProfileCollector.h b/lib/Transforms/Instrumentation/ValueProfileCollector.h new file mode 100644 index 000000000000..ff883c8d0c77 --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfileCollector.h @@ -0,0 +1,79 @@ +//===- ValueProfileCollector.h - determine what to value profile ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a utility class, ValueProfileCollector, that is used to +// determine what kind of llvm::Value's are worth value-profiling, at which +// point in the program, and which instruction holds the Value Profile metadata. +// Currently, the only users of this utility is the PGOInstrumentation[Gen|Use] +// passes. +//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H +#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H + +#include "llvm/IR/Function.h" +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" +#include "llvm/ProfileData/InstrProf.h" + +namespace llvm { + +/// Utility analysis that determines what values are worth profiling. +/// The actual logic is inside the ValueProfileCollectorImpl, whose job is to +/// populate the Candidates vector. +/// +/// Value profiling an expression means to track the values that this expression +/// takes at runtime and the frequency of each value. +/// It is important to distinguish between two sets of value profiles for a +/// particular expression: +/// 1) The set of values at the point of evaluation. +/// 2) The set of values at the point of use. +/// In some cases, the two sets are identical, but it's not unusual for the two +/// to differ. +/// +/// To elaborate more, consider this C code, and focus on the expression `nn`: +/// void foo(int nn, bool b) { +/// if (b) memcpy(x, y, nn); +/// } +/// The point of evaluation can be as early as the start of the function, and +/// let's say the value profile for `nn` is: +/// total=100; (value,freq) set = {(8,10), (32,50)} +/// The point of use is right before we call memcpy, and since we execute the +/// memcpy conditionally, the value profile of `nn` can be: +/// total=15; (value,freq) set = {(8,10), (4,5)} +/// +/// For this reason, a plugin is responsible for computing the insertion point +/// for each value to be profiled. The `CandidateInfo` structure encapsulates +/// all the information needed for each value profile site. +class ValueProfileCollector { +public: + struct CandidateInfo { + Value *V; // The value to profile. + Instruction *InsertPt; // Insert the VP lib call before this instr. + Instruction *AnnotatedInst; // Where metadata is attached. + }; + + ValueProfileCollector(Function &Fn); + ValueProfileCollector(ValueProfileCollector &&) = delete; + ValueProfileCollector &operator=(ValueProfileCollector &&) = delete; + + ValueProfileCollector(const ValueProfileCollector &) = delete; + ValueProfileCollector &operator=(const ValueProfileCollector &) = delete; + ~ValueProfileCollector(); + + /// returns a list of value profiling candidates of the given kind + std::vector get(InstrProfValueKind Kind) const; + +private: + class ValueProfileCollectorImpl; + std::unique_ptr PImpl; +}; + +} // namespace llvm + +#endif diff --git a/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/lib/Transforms/Instrumentation/ValueProfilePlugins.inc new file mode 100644 index 000000000000..4cc4c6c848c3 --- /dev/null +++ b/lib/Transforms/Instrumentation/ValueProfilePlugins.inc @@ -0,0 +1,75 @@ +//=== ValueProfilePlugins.inc - set of plugins used by ValueProfileCollector =// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains a set of plugin classes used in ValueProfileCollectorImpl. +// Each plugin is responsible for collecting Value Profiling candidates for a +// particular optimization. +// Each plugin must satisfy the interface described in ValueProfileCollector.cpp +// +//===----------------------------------------------------------------------===// + +#include "ValueProfileCollector.h" +#include "llvm/Analysis/IndirectCallVisitor.h" +#include "llvm/IR/InstVisitor.h" + +using namespace llvm; +using CandidateInfo = ValueProfileCollector::CandidateInfo; + +///--------------------------- MemIntrinsicPlugin ------------------------------ +class MemIntrinsicPlugin : public InstVisitor { + Function &F; + std::vector *Candidates; + +public: + static constexpr InstrProfValueKind Kind = IPVK_MemOPSize; + + MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {} + + void run(std::vector &Cs) { + Candidates = &Cs; + visit(F); + Candidates = nullptr; + } + void visitMemIntrinsic(MemIntrinsic &MI) { + Value *Length = MI.getLength(); + // Not instrument constant length calls. + if (dyn_cast(Length)) + return; + + Instruction *InsertPt = &MI; + Instruction *AnnotatedInst = &MI; + Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst}); + } +}; + +///------------------------ IndirectCallPromotionPlugin ------------------------ +class IndirectCallPromotionPlugin { + Function &F; + +public: + static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget; + + IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {} + + void run(std::vector &Candidates) { + std::vector Result = findIndirectCalls(F); + for (Instruction *I : Result) { + Value *Callee = CallSite(I).getCalledValue(); + Instruction *InsertPt = I; + Instruction *AnnotatedInst = I; + Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst}); + } + } +}; + +///----------------------- Registration of the plugins ------------------------- +/// For now, registering a plugin with the ValueProfileCollector is done by +/// adding the plugin type to the VP_PLUGIN_LIST macro. 
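+/// As a purely illustrative sketch (MyNewPlugin is a hypothetical name, not
+/// part of this patch), a new plugin would supply the same three members the
+/// plugins above do, and then be appended to VP_PLUGIN_LIST:
+///
+///   class MyNewPlugin {
+///   public:
+///     // The value kind this plugin collects candidates for.
+///     static constexpr InstrProfValueKind Kind = /* some IPVK_* value */;
+///     // The constructor takes the function to scan for candidates.
+///     MyNewPlugin(Function &Fn);
+///     // Appends this plugin's candidates to the given vector.
+///     void run(std::vector<CandidateInfo> &Candidates);
+///   };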
+#define VP_PLUGIN_LIST \ + MemIntrinsicPlugin, \ + IndirectCallPromotionPlugin From eb8b6d491465700f273c53257fd1b8134e0368bd Mon Sep 17 00:00:00 2001 From: GN Sync Bot Date: Thu, 3 Oct 2019 14:28:27 +0000 Subject: [PATCH 59/82] gn build: Merge r373601 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373603 91177308-0d34-0410-b5e6-96231b3b80d8 --- utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn b/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn index d6c256b52b43..ec7f3d81cba9 100644 --- a/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn +++ b/utils/gn/secondary/llvm/lib/Transforms/Instrumentation/BUILD.gn @@ -26,5 +26,6 @@ static_library("Instrumentation") { "PoisonChecking.cpp", "SanitizerCoverage.cpp", "ThreadSanitizer.cpp", + "ValueProfileCollector.cpp", ] } From da186b07ad70e5878b904b83263ee8f52e13f90e Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 3 Oct 2019 14:34:28 +0000 Subject: [PATCH 60/82] [UpdateTestChecks] add basic support for parsing msp430 asm git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373605 91177308-0d34-0410-b5e6-96231b3b80d8 --- utils/UpdateTestChecks/asm.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/utils/UpdateTestChecks/asm.py b/utils/UpdateTestChecks/asm.py index 1eb354d8a46a..81556d65802c 100644 --- a/utils/UpdateTestChecks/asm.py +++ b/utils/UpdateTestChecks/asm.py @@ -58,6 +58,12 @@ class string: # .Lfunc_end0: (mips64 - NewABI) flags=(re.M | re.S)) +ASM_FUNCTION_MSP430_RE = re.compile( + r'^_?(?P[^:]+):[ \t]*;+[ \t]*@(?P=func)\n[^:]*?' + r'(?P.*?)\n' + r'(\$|\.L)func_end[0-9]+:\n', # $func_end0: + flags=(re.M | re.S)) + ASM_FUNCTION_PPC_RE = re.compile( r'^_?(?P[^:]+):[ \t]*#+[ \t]*@(?P=func)\n' r'.*?' @@ -231,6 +237,16 @@ def scrub_asm_mips(asm, args): asm = common.SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm) return asm +def scrub_asm_msp430(asm, args): + # Scrub runs of whitespace out of the assembly, but leave the leading + # whitespace in place. + asm = common.SCRUB_WHITESPACE_RE.sub(r' ', asm) + # Expand the tabs used for indentation. + asm = string.expandtabs(asm, 2) + # Strip trailing whitespace. + asm = common.SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm) + return asm + def scrub_asm_riscv(asm, args): # Scrub runs of whitespace out of the assembly, but leave the leading # whitespace in place. @@ -315,6 +331,7 @@ def build_function_body_dictionary_for_triple(args, raw_tool_output, triple, pre 'thumbv5-macho': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_MACHO_RE), 'thumbv7-apple-ios' : (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_IOS_RE), 'mips': (scrub_asm_mips, ASM_FUNCTION_MIPS_RE), + 'msp430': (scrub_asm_msp430, ASM_FUNCTION_MSP430_RE), 'ppc32': (scrub_asm_powerpc, ASM_FUNCTION_PPC_RE), 'powerpc': (scrub_asm_powerpc, ASM_FUNCTION_PPC_RE), 'riscv32': (scrub_asm_riscv, ASM_FUNCTION_RISCV_RE), From ffd82b44aee0103df4d5ecaea3ca35bd49059088 Mon Sep 17 00:00:00 2001 From: George Rimar Date: Thu, 3 Oct 2019 14:52:33 +0000 Subject: [PATCH 61/82] Recommit r373598 "[yaml2obj/obj2yaml] - Add support for SHT_LLVM_ADDRSIG sections." Fix: call `consumeError()` for a case missed. Original commit message: SHT_LLVM_ADDRSIG is described here: https://llvm.org/docs/Extensions.html#sht-llvm-addrsig-section-address-significance-table This patch teaches tools to dump them and to parse the YAML declarations of such sections. 
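For background on the encoding involved: an SHT_LLVM_ADDRSIG section body is
simply a sequence of ULEB128-encoded symbol table indices, which is why the
emitter below writes each entry with encodeULEB128 and the dumper reads them
back with getULEB128. As a minimal sketch of producing such a body (the helper
name writeAddrsigBody and its ArrayRef parameter are illustrative, not part of
this patch):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Support/LEB128.h"
  #include "llvm/Support/raw_ostream.h"
  #include <cstdint>

  // Emit an SHT_LLVM_ADDRSIG body: one ULEB128-encoded symbol index per entry.
  static void writeAddrsigBody(llvm::raw_ostream &OS,
                               llvm::ArrayRef<uint32_t> SymIndices) {
    for (uint32_t Idx : SymIndices)
      llvm::encodeULEB128(Idx, OS);
  }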
Differential revision: https://reviews.llvm.org/D68333 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373606 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ObjectYAML/ELFYAML.h | 27 ++- lib/ObjectYAML/ELFEmitter.cpp | 30 +++ lib/ObjectYAML/ELFYAML.cpp | 37 +++ .../tools/llvm-readobj/elf-section-types.test | 1 + .../obj2yaml/elf-llvm-addrsig-section.yaml | 98 ++++++++ .../yaml2obj/elf-llvm-addrsig-section.yaml | 213 ++++++++++++++++++ tools/obj2yaml/elf2yaml.cpp | 75 +++++- 7 files changed, 470 insertions(+), 11 deletions(-) create mode 100644 test/tools/obj2yaml/elf-llvm-addrsig-section.yaml create mode 100644 test/tools/yaml2obj/elf-llvm-addrsig-section.yaml diff --git a/include/llvm/ObjectYAML/ELFYAML.h b/include/llvm/ObjectYAML/ELFYAML.h index 1662d06bf91e..592b5021bd69 100644 --- a/include/llvm/ObjectYAML/ELFYAML.h +++ b/include/llvm/ObjectYAML/ELFYAML.h @@ -137,7 +137,8 @@ struct Section { StackSizes, SymtabShndxSection, Symver, - MipsABIFlags + MipsABIFlags, + Addrsig }; SectionKind Kind; StringRef Name; @@ -256,6 +257,25 @@ struct VerneedSection : Section { } }; +struct AddrsigSymbol { + AddrsigSymbol(StringRef N) : Name(N), Index(None) {} + AddrsigSymbol(llvm::yaml::Hex32 Ndx) : Name(None), Index(Ndx) {} + AddrsigSymbol() : Name(None), Index(None) {} + + Optional Name; + Optional Index; +}; + +struct AddrsigSection : Section { + Optional Content; + Optional> Symbols; + + AddrsigSection() : Section(SectionKind::Addrsig) {} + static bool classof(const Section *S) { + return S->Kind == SectionKind::Addrsig; + } +}; + struct SymverSection : Section { std::vector Entries; @@ -362,6 +382,7 @@ struct Object { } // end namespace ELFYAML } // end namespace llvm +LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::AddrsigSymbol) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::StackSizeEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::DynamicEntry) LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::ELFYAML::ProgramHeader) @@ -518,6 +539,10 @@ template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::VernauxEntry &E); }; +template <> struct MappingTraits { + static void mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym); +}; + template <> struct MappingTraits { static void mapping(IO &IO, ELFYAML::Relocation &Rel); }; diff --git a/lib/ObjectYAML/ELFEmitter.cpp b/lib/ObjectYAML/ELFEmitter.cpp index c85cf4c924f0..3f3b27c5bfad 100644 --- a/lib/ObjectYAML/ELFEmitter.cpp +++ b/lib/ObjectYAML/ELFEmitter.cpp @@ -174,6 +174,10 @@ template class ELFState { void writeSectionContent(Elf_Shdr &SHeader, const ELFYAML::HashSection &Section, ContiguousBlobAccumulator &CBA); + void writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA); + ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH); public: @@ -423,6 +427,8 @@ void ELFState::initSectionHeaders(std::vector &SHeaders, writeSectionContent(SHeader, *S, CBA); } else if (auto S = dyn_cast(Sec)) { writeSectionContent(SHeader, *S, CBA); + } else if (auto S = dyn_cast(Sec)) { + writeSectionContent(SHeader, *S, CBA); } else { llvm_unreachable("Unknown section type"); } @@ -990,6 +996,30 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, Section.Content->writeAsBinary(OS); } +template +void ELFState::writeSectionContent(Elf_Shdr &SHeader, + const ELFYAML::AddrsigSection &Section, + ContiguousBlobAccumulator &CBA) { + raw_ostream &OS = + CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign); + + unsigned Link = 0; + if (Section.Link.empty() && SN2I.lookup(".symtab", Link)) + 
SHeader.sh_link = Link; + + if (Section.Content) { + SHeader.sh_size = writeContent(OS, Section.Content, None); + return; + } + + for (const ELFYAML::AddrsigSymbol &Sym : *Section.Symbols) { + uint64_t Val = + Sym.Name ? toSymbolIndex(*Sym.Name, Section.Name, /*IsDynamic=*/false) + : (uint32_t)*Sym.Index; + SHeader.sh_size += encodeULEB128(Val, OS); + } +} + template void ELFState::buildSectionIndex() { for (unsigned I = 0, E = Doc.Sections.size(); I != E; ++I) { StringRef Name = Doc.Sections[I]->Name; diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index 0dd6854cfee0..e295a000ef8a 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -1071,6 +1071,12 @@ static void sectionMapping(IO &IO, ELFYAML::SymtabShndxSection &Section) { IO.mapRequired("Entries", Section.Entries); } +static void sectionMapping(IO &IO, ELFYAML::AddrsigSection &Section) { + commonSectionMapping(IO, Section); + IO.mapOptional("Content", Section.Content); + IO.mapOptional("Symbols", Section.Symbols); +} + void MappingTraits::mapping( IO &IO, ELFYAML::SectionOrType §ionOrType) { IO.mapRequired("SectionOrType", sectionOrType.sectionNameOrType); @@ -1161,6 +1167,11 @@ void MappingTraits>::mapping( Section.reset(new ELFYAML::SymtabShndxSection()); sectionMapping(IO, *cast(Section.get())); break; + case ELF::SHT_LLVM_ADDRSIG: + if (!IO.outputting()) + Section.reset(new ELFYAML::AddrsigSection()); + sectionMapping(IO, *cast(Section.get())); + break; default: if (!IO.outputting()) { StringRef Name; @@ -1233,6 +1244,26 @@ StringRef MappingTraits>::validate( return {}; } + if (const auto *Sec = dyn_cast(Section.get())) { + if (!Sec->Symbols && !Sec->Content) + return "one of \"Symbols\" or \"Content\" must be specified"; + + if (Sec->Content) { + if (Sec->Symbols) + return "\"Content\" and \"Symbols\" cannot be used together"; + return {}; + } + + if (!Sec->Symbols) + return {}; + + for (const ELFYAML::AddrsigSymbol &AS : *Sec->Symbols) + if (AS.Index && AS.Name) + return "\"Index\" and \"Name\" cannot be used together when defining a " + "symbol"; + return {}; + } + return {}; } @@ -1340,6 +1371,12 @@ void MappingTraits::mapping(IO &IO, ELFYAML::Object &Object) { IO.setContext(nullptr); } +void MappingTraits::mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym) { + assert(IO.getContext() && "The IO context is not initialized"); + IO.mapOptional("Name", Sym.Name); + IO.mapOptional("Index", Sym.Index); +} + LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_AFL_REG) LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_ABI_FP) LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_EXT) diff --git a/test/tools/llvm-readobj/elf-section-types.test b/test/tools/llvm-readobj/elf-section-types.test index aad9f43c8a30..20b881249c7f 100644 --- a/test/tools/llvm-readobj/elf-section-types.test +++ b/test/tools/llvm-readobj/elf-section-types.test @@ -196,6 +196,7 @@ Sections: Type: SHT_LLVM_CALL_GRAPH_PROFILE - Name: llvm_addrsig Type: SHT_LLVM_ADDRSIG + Symbols: - Name: .deplibs Type: SHT_LLVM_DEPENDENT_LIBRARIES - Name: .llvm_sympart.f diff --git a/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml b/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml new file mode 100644 index 000000000000..6f21c3212bd9 --- /dev/null +++ b/test/tools/obj2yaml/elf-llvm-addrsig-section.yaml @@ -0,0 +1,98 @@ +## Check how obj2yaml dumps the SHT_LLVM_ADDRSIG section. + +## Check that when possible obj2yaml tries to produce the "Name" tag when +## dumping entries of the SHT_LLVM_ADDRSIG section. 
It falls back to producing +## the "Index" tag when it can't match a symbol index with a symbol table entry. + +# RUN: yaml2obj --docnum=1 %s -o %t1 +# RUN: obj2yaml %t1 | FileCheck %s --check-prefix=NAME + +# NAME: - Name: .llvm_addrsig +# NAME-NEXT: Type: SHT_LLVM_ADDRSIG +# NAME-NEXT: Link: .symtab +# NAME-NEXT: Symbols: +# NAME-NEXT: - Name: foo +# NAME-NEXT: - Name: bar +# NAME-NEXT: - Index: 0x00000003 +# NAME-NEXT: - Index: 0xFFFFFFFF +# NAME: - Name: .llvm_addrsig_unlinked +# NAME-NEXT: Type: SHT_LLVM_ADDRSIG +# NAME-NEXT: Symbols: +# NAME-NEXT: - Index: 0x00000001 +# NAME-NEXT: - Index: 0x00000002 +# NAME-NEXT: - Index: 0x00000003 +# NAME-NEXT: - Index: 0xFFFFFFFF + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 1 + - Index: 2 + - Index: 3 + - Index: 0xFFFFFFFF + - Name: .llvm_addrsig_unlinked + Type: SHT_LLVM_ADDRSIG + Link: 0 + Symbols: + - Index: 1 + - Index: 2 + - Index: 3 + - Index: 0xFFFFFFFF +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: bar + Type: STT_FUNC + Binding: STB_GLOBAL + +## Check that obj2yaml dumps the SHT_LLVM_ADDRSIG section +## data using the "Content" tag when at least one of the entries is broken, +## e.g. because the entry contains a malformed uleb128 value. + +# RUN: yaml2obj --docnum=2 %s -o %t2 +# RUN: obj2yaml %t2 | FileCheck %s --check-prefix=INVALID-ENTRY + +# INVALID-ENTRY: - Name: .llvm_addrsig +# INVALID-ENTRY-NEXT: Type: SHT_LLVM_ADDRSIG +# INVALID-ENTRY-NEXT: Link: .symtab +# INVALID-ENTRY-NEXT: Content: FFFFFFFFFF + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "FFFFFFFFFF" + +## obj2yaml produces a "Symbols" tag when describing an empty SHT_LLVM_ADDRSIG section. + +# RUN: yaml2obj --docnum=3 %s -o %t3 +# RUN: obj2yaml %t3 | FileCheck %s --check-prefix=EMPTY + +# EMPTY: - Name: .llvm_addrsig +# EMPTY-NEXT: Type: SHT_LLVM_ADDRSIG +# EMPTY-NEXT: Link: .symtab +# EMPTY-NEXT: Symbols: [] + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "" diff --git a/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml new file mode 100644 index 000000000000..98496d30fa84 --- /dev/null +++ b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml @@ -0,0 +1,213 @@ +## Check how yaml2obj produces SHT_LLVM_ADDRSIG sections. + +## Check we can describe SHT_LLVM_ADDRSIG using the "Symbols" tag. We can define +## symbols either using names or indexes. 
+ +# RUN: yaml2obj --docnum=1 %s -o %t1 +# RUN: llvm-readobj --sections --section-data %t1 | FileCheck %s --check-prefix=SYMBOLS + +# SYMBOLS: Section { +# SYMBOLS: Index: 1 +# SYMBOLS-NEXT: Name: .llvm_addrsig +# SYMBOLS-NEXT: Type: SHT_LLVM_ADDRSIG +# SYMBOLS-NEXT: Flags [ +# SYMBOLS-NEXT: ] +# SYMBOLS-NEXT: Address: 0x0 +# SYMBOLS-NEXT: Offset: 0x40 +# SYMBOLS-NEXT: Size: 4 +# SYMBOLS-NEXT: Link: 2 +# SYMBOLS-NEXT: Info: 0 +# SYMBOLS-NEXT: AddressAlignment: 0 +# SYMBOLS-NEXT: EntrySize: 0 +# SYMBOLS-NEXT: SectionData ( +# SYMBOLS-NEXT: 0000: 01020102 +# SYMBOLS-NEXT: ) +# SYMBOLS-NEXT: } + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar + - Index: 1 + - Index: 2 +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + - Name: bar + Type: STT_FUNC + Binding: STB_GLOBAL + +## We can't specify both "Index" and "Name" when defining a symbol. + +# RUN: not yaml2obj --docnum=2 %s 2>&1 | FileCheck %s --check-prefix=INDEX-NAME + +# INDEX-NAME: error: "Index" and "Name" cannot be used together when defining a symbol + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + Index: 1 +Symbols: + - Name: foo + Type: STT_FUNC + Binding: STB_GLOBAL + +## Check we report an error if an unknown symbol is referenced in the +## SHT_LLVM_ADDRSIG section description. + +# RUN: not yaml2obj --docnum=3 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-UNKNOWN + +# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'foo' by YAML section '.llvm_addrsig' +# SYMBOL-UNKNOWN: error: unknown symbol referenced: 'bar' by YAML section '.llvm_addrsig' + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Name: foo + - Name: bar + +## Check we can specify any arbitrary symbol indices. + +# RUN: yaml2obj --docnum=4 %s -o %t4 +# RUN: llvm-readobj --sections --section-data %t4 | FileCheck %s --check-prefix=SYMBOL-INDEX + +# SYMBOL-INDEX: Type: SHT_LLVM_ADDRSIG +# SYMBOL-INDEX: SectionData ( +# SYMBOL-INDEX-NEXT: 0000: 00FF01C4 E6888901 FFFFFFFF 0F +# SYMBOL-INDEX-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 0 + - Index: 255 + - Index: 0x11223344 +## 0xFFFFFFFF is a maximum allowed index value. + - Index: 0xFFFFFFFF + +## Check that the maximum symbol index size is 32 bits. + +# RUN: not yaml2obj --docnum=5 %s 2>&1 | FileCheck %s --check-prefix=SYMBOL-INDEX-OVERFLOW + +# SYMBOL-INDEX-OVERFLOW: error: out of range hex32 number + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Symbols: + - Index: 0x1122334455 + +## Check we can use the "Content" tag to specify any data for SHT_LLVM_ADDRSIG sections. 
+ +# RUN: yaml2obj --docnum=6 %s -o %t6 +# RUN: llvm-readobj --sections --section-data %t6 | FileCheck %s --check-prefix=CONTENT + +# CONTENT: Type: SHT_LLVM_ADDRSIG +# CONTENT: Size: +# CONTENT-SAME: 5 +# CONTENT: SectionData ( +# CONTENT-NEXT: 0000: 11223344 55 +# CONTENT-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "1122334455" + +## Either "Content" or "Symbols" must be specifed for SHT_LLVM_ADDRSIG sections. + +# RUN: not yaml2obj --docnum=7 %s 2>&1 | FileCheck %s --check-prefix=NO-TAGS + +# NO-TAGS: error: one of "Symbols" or "Content" must be specified + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + +## "Content" and "Symbols" cannot be used together to describe the SHT_LLVM_ADDRSIG section. + +# RUN: not yaml2obj --docnum=8 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS + +# CONTENT-SYMBOLS: error: "Content" and "Symbols" cannot be used together + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Content: "" + Symbols: + +## Check we can set an arbitrary sh_link value for SHT_LLVM_ADDRSIG sections. + +# RUN: yaml2obj --docnum=9 %s -o %t9 +# RUN: llvm-readobj --sections %t9 | FileCheck %s --check-prefix=LINK + +# LINK: Name: .llvm_addrsig +# LINK: Link: +# LINK-SAME: 123{{$}} + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_DYN + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Link: 123 + Content: "" diff --git a/tools/obj2yaml/elf2yaml.cpp b/tools/obj2yaml/elf2yaml.cpp index c4b6eb79d18c..2c17b9570e1b 100644 --- a/tools/obj2yaml/elf2yaml.cpp +++ b/tools/obj2yaml/elf2yaml.cpp @@ -41,6 +41,7 @@ class ELFDumper { Expected getUniquedSymbolName(const Elf_Sym *Sym, StringRef StrTable, const Elf_Shdr *SymTab); + Expected getSymbolName(uint32_t SymtabNdx, uint32_t SymbolNdx); const object::ELFFile &Obj; ArrayRef ShndxTable; @@ -56,6 +57,7 @@ class ELFDumper { Error dumpRelocation(const RelT *Rel, const Elf_Shdr *SymTab, ELFYAML::Relocation &R); + Expected dumpAddrsigSection(const Elf_Shdr *Shdr); Expected dumpDynamicSection(const Elf_Shdr *Shdr); Expected dumpRelocSection(const Elf_Shdr *Shdr); Expected @@ -284,6 +286,13 @@ template Expected ELFDumper::dump() { Y->Sections.emplace_back(*SecOrErr); break; } + case ELF::SHT_LLVM_ADDRSIG: { + Expected SecOrErr = dumpAddrsigSection(&Sec); + if (!SecOrErr) + return SecOrErr.takeError(); + Y->Sections.emplace_back(*SecOrErr); + break; + } case ELF::SHT_NULL: { // We only dump the SHT_NULL section at index 0 when it // has at least one non-null field, because yaml2obj @@ -519,6 +528,46 @@ ELFDumper::dumpStackSizesSection(const Elf_Shdr *Shdr) { return S.release(); } +template +Expected +ELFDumper::dumpAddrsigSection(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); + + auto ContentOrErr = Obj.getSectionContents(Shdr); + if (!ContentOrErr) + return ContentOrErr.takeError(); + + ArrayRef Content = *ContentOrErr; + DataExtractor::Cursor Cur(0); + DataExtractor Data(Content, Obj.isLE(), /*AddressSize=*/0); + std::vector Symbols; + while (Cur && Cur.tell() < Content.size()) { + uint64_t SymNdx = Data.getULEB128(Cur); + if (!Cur) + break; + + Expected 
SymbolName = getSymbolName(Shdr->sh_link, SymNdx); + if (!SymbolName || SymbolName->empty()) { + consumeError(SymbolName.takeError()); + Symbols.emplace_back(SymNdx); + continue; + } + + Symbols.emplace_back(*SymbolName); + } + + if (Cur) { + S->Symbols = std::move(Symbols); + return S.release(); + } + + consumeError(Cur.takeError()); + S->Content = yaml::BinaryRef(Content); + return S.release(); +} + template Expected ELFDumper::dumpDynamicSection(const Elf_Shdr *Shdr) { @@ -791,25 +840,31 @@ ELFDumper::dumpVerneedSection(const Elf_Shdr *Shdr) { } template -Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { - auto S = std::make_unique(); - if (Error E = dumpCommonSection(Shdr, *S)) - return std::move(E); - - auto SymtabOrErr = Obj.getSection(Shdr->sh_link); +Expected ELFDumper::getSymbolName(uint32_t SymtabNdx, + uint32_t SymbolNdx) { + auto SymtabOrErr = Obj.getSection(SymtabNdx); if (!SymtabOrErr) return SymtabOrErr.takeError(); - // Get symbol with index sh_info which name is the signature of the group. + const Elf_Shdr *Symtab = *SymtabOrErr; - auto SymOrErr = Obj.getSymbol(Symtab, Shdr->sh_info); + auto SymOrErr = Obj.getSymbol(Symtab, SymbolNdx); if (!SymOrErr) return SymOrErr.takeError(); + auto StrTabOrErr = Obj.getStringTableForSymtab(*Symtab); if (!StrTabOrErr) return StrTabOrErr.takeError(); + return getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); +} + +template +Expected ELFDumper::dumpGroup(const Elf_Shdr *Shdr) { + auto S = std::make_unique(); + if (Error E = dumpCommonSection(Shdr, *S)) + return std::move(E); - Expected SymbolName = - getUniquedSymbolName(*SymOrErr, *StrTabOrErr, Symtab); + // Get symbol with index sh_info. This symbol's name is the signature of the group. + Expected SymbolName = getSymbolName(Shdr->sh_link, Shdr->sh_info); if (!SymbolName) return SymbolName.takeError(); S->Signature = *SymbolName; From 31aa362969e8322f41fdc4a73c9aa3d66cf2b17b Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 3 Oct 2019 14:54:03 +0000 Subject: [PATCH 62/82] [MSP430] add tests for unwanted shift codegen; NFC (PR43542) git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373607 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/MSP430/selectcc.ll | 61 +++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 test/CodeGen/MSP430/selectcc.ll diff --git a/test/CodeGen/MSP430/selectcc.ll b/test/CodeGen/MSP430/selectcc.ll new file mode 100644 index 000000000000..c72079eac276 --- /dev/null +++ b/test/CodeGen/MSP430/selectcc.ll @@ -0,0 +1,61 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=msp430-- < %s | FileCheck %s + +define i16 @select_to_shifts_i16(i16 %a, i16 %b) { +; CHECK-LABEL: select_to_shifts_i16: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.b r12, r12 +; CHECK-NEXT: swpb r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: add r12, r12 +; CHECK-NEXT: swpb r12 +; CHECK-NEXT: sxt r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: and r13, r12 +; CHECK-NEXT: ret + %and = and i16 %a, 2 + %tobool = icmp eq i16 %and, 0 + %select = select i1 %tobool, i16 0, i16 %b + ret i16 %select +} + +define i32 @select_to_shifts_i32(i32 %a, i32 %b) { +; CHECK-LABEL: select_to_shifts_i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov.b r13, r13 
+; CHECK-NEXT: swpb r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: add r13, r13 +; CHECK-NEXT: swpb r13 +; CHECK-NEXT: sxt r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: and r13, r14 +; CHECK-NEXT: and r15, r13 +; CHECK-NEXT: mov r14, r12 +; CHECK-NEXT: ret + %and = and i32 %a, 2 + %tobool = icmp eq i32 %and, 0 + %select = select i1 %tobool, i32 0, i32 %b + ret i32 %select +} From 9c0ef65f155c65655b12a0bea952f00ab42146eb Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Thu, 3 Oct 2019 14:57:49 +0000 Subject: [PATCH 63/82] Test commit. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373609 91177308-0d34-0410-b5e6-96231b3b80d8 --- README.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/README.txt b/README.txt index b9b71a3b6daf..ebbd50fc0b71 100644 --- a/README.txt +++ b/README.txt @@ -15,3 +15,4 @@ documentation setup. If you are writing a package for LLVM, see docs/Packaging.rst for our suggestions. + From c9518a90403bd7292f79da5afc3eddbb59860a11 Mon Sep 17 00:00:00 2001 From: George Rimar Date: Thu, 3 Oct 2019 15:02:18 +0000 Subject: [PATCH 64/82] [yaml2obj] - Add a Size tag support for SHT_LLVM_ADDRSIG sections. It allows using "Size" with or without "Content" in YAML descriptions of SHT_LLVM_ADDRSIG sections. Differential revision: https://reviews.llvm.org/D68334 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373610 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/ObjectYAML/ELFYAML.h | 1 + lib/ObjectYAML/ELFEmitter.cpp | 4 +- lib/ObjectYAML/ELFYAML.cpp | 14 ++- .../yaml2obj/elf-llvm-addrsig-section.yaml | 98 ++++++++++++++++++- 4 files changed, 109 insertions(+), 8 deletions(-) diff --git a/include/llvm/ObjectYAML/ELFYAML.h b/include/llvm/ObjectYAML/ELFYAML.h index 592b5021bd69..ef2b4fba031d 100644 --- a/include/llvm/ObjectYAML/ELFYAML.h +++ b/include/llvm/ObjectYAML/ELFYAML.h @@ -268,6 +268,7 @@ struct AddrsigSymbol { struct AddrsigSection : Section { Optional Content; + Optional Size; Optional> Symbols; AddrsigSection() : Section(SectionKind::Addrsig) {} diff --git a/lib/ObjectYAML/ELFEmitter.cpp b/lib/ObjectYAML/ELFEmitter.cpp index 3f3b27c5bfad..f9c31f335f11 100644 --- a/lib/ObjectYAML/ELFEmitter.cpp +++ b/lib/ObjectYAML/ELFEmitter.cpp @@ -1007,8 +1007,8 @@ void ELFState::writeSectionContent(Elf_Shdr &SHeader, if (Section.Link.empty() && SN2I.lookup(".symtab", Link)) SHeader.sh_link = Link; - if (Section.Content) { - SHeader.sh_size = writeContent(OS, Section.Content, None); + if (Section.Content || Section.Size) { + SHeader.sh_size = writeContent(OS, Section.Content, Section.Size); return; } diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp index e295a000ef8a..29585abe6e80 100644 --- a/lib/ObjectYAML/ELFYAML.cpp +++ b/lib/ObjectYAML/ELFYAML.cpp @@ -1074,6 +1074,7 @@ static void sectionMapping(IO &IO, ELFYAML::SymtabShndxSection &Section) { static void sectionMapping(IO &IO, ELFYAML::AddrsigSection &Section) { commonSectionMapping(IO, Section); IO.mapOptional("Content", Section.Content); + IO.mapOptional("Size", Section.Size); IO.mapOptional("Symbols", Section.Symbols); } @@ -1245,12 +1246,17 @@ StringRef MappingTraits>::validate( } if (const auto *Sec = dyn_cast(Section.get())) { - if (!Sec->Symbols && !Sec->Content) - return "one of \"Symbols\" or \"Content\" must be specified"; + if 
(!Sec->Symbols && !Sec->Content && !Sec->Size) + return "one of \"Content\", \"Size\" or \"Symbols\" must be specified"; + + if (Sec->Content || Sec->Size) { + if (Sec->Size && Sec->Content && + (uint64_t)*Sec->Size < Sec->Content->binary_size()) + return "\"Size\" must be greater than or equal to the content " + "size"; - if (Sec->Content) { if (Sec->Symbols) - return "\"Content\" and \"Symbols\" cannot be used together"; + return "\"Symbols\" cannot be used with \"Content\" or \"Size\""; return {}; } diff --git a/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml index 98496d30fa84..1433d6dbc13e 100644 --- a/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml +++ b/test/tools/yaml2obj/elf-llvm-addrsig-section.yaml @@ -161,7 +161,7 @@ Sections: # RUN: not yaml2obj --docnum=7 %s 2>&1 | FileCheck %s --check-prefix=NO-TAGS -# NO-TAGS: error: one of "Symbols" or "Content" must be specified +# NO-TAGS: error: one of "Content", "Size" or "Symbols" must be specified --- !ELF FileHeader: @@ -177,7 +177,7 @@ Sections: # RUN: not yaml2obj --docnum=8 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS -# CONTENT-SYMBOLS: error: "Content" and "Symbols" cannot be used together +# CONTENT-SYMBOLS: "Symbols" cannot be used with "Content" or "Size" --- !ELF FileHeader: @@ -211,3 +211,97 @@ Sections: Type: SHT_LLVM_ADDRSIG Link: 123 Content: "" + +## Check we can use only "Size" to create a SHT_LLVM_ADDRSIG section. + +# RUN: yaml2obj --docnum=10 %s -o %t10 +# RUN: llvm-readobj --sections --section-data %t10 | FileCheck %s --check-prefix=SIZE + +# SIZE: Name: .llvm_addrsig +# SIZE: Size: +# SIZE-SAME: 17 +# SIZE: SectionData ( +# SIZE-NEXT: 0000: 00000000 00000000 00000000 00000000 | +# SIZE-NEXT: 0010: 00 | +# SIZE-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x11 + +## Check we can use "Size" and "Content" together to create a SHT_LLVM_ADDRSIG section. + +# RUN: yaml2obj --docnum=11 %s -o %t11 +# RUN: llvm-readobj --sections --section-data %t11 | FileCheck %s --check-prefix=SIZE-CONTENT + +# SIZE-CONTENT: Name: .llvm_addrsig_sizegr +# SIZE-CONTENT: Size: +# SIZE-CONTENT-SAME: 5 +# SIZE-CONTENT: SectionData ( +# SIZE-CONTENT-NEXT: 0000: 11223300 00 | +# SIZE-CONTENT-NEXT: ) + +# SIZE-CONTENT: Name: .llvm_addrsig_sizeeq +# SIZE-CONTENT: Size: +# SIZE-CONTENT-SAME: 3 +# SIZE-CONTENT: SectionData ( +# SIZE-CONTENT-NEXT: 0000: 112233 | +# SIZE-CONTENT-NEXT: ) + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig_sizegr + Type: SHT_LLVM_ADDRSIG + Size: 0x5 + Content: "112233" + - Name: .llvm_addrsig_sizeeq + Type: SHT_LLVM_ADDRSIG + Size: 0x3 + Content: "112233" + +## Check that when "Size" and "Content" are used together, the size +## must be greater than or equal to the content size. + +# RUN: not yaml2obj --docnum=12 %s 2>&1 | FileCheck %s --check-prefix=SIZE-CONTENT-ERR + +# SIZE-CONTENT-ERR: error: "Size" must be greater than or equal to the content size + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x1 + Content: "1122" + +## Check we can't use "Size" and "Symbols" tags together. 
+ +# RUN: not yaml2obj --docnum=13 %s 2>&1 | FileCheck %s --check-prefix=CONTENT-SYMBOLS + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .llvm_addrsig + Type: SHT_LLVM_ADDRSIG + Size: 0x1 + Symbols: [ ] From 36e56a17b6356849468b14eeafc29184143d67b4 Mon Sep 17 00:00:00 2001 From: Edward Jones Date: Thu, 3 Oct 2019 15:47:28 +0000 Subject: [PATCH 65/82] [RISCV] Add obsolete aliases of fscsr, frcsr (fssr, frsr) These old aliases were renamed, but are still used by some projects (eg newlib). Differential Revision: https://reviews.llvm.org/D68392 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373618 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/RISCV/RISCVInstrInfoF.td | 6 ++++++ test/MC/RISCV/rvf-aliases-valid.s | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/lib/Target/RISCV/RISCVInstrInfoF.td b/lib/Target/RISCV/RISCVInstrInfoF.td index 032642942f2b..3b73c865ea17 100644 --- a/lib/Target/RISCV/RISCVInstrInfoF.td +++ b/lib/Target/RISCV/RISCVInstrInfoF.td @@ -227,6 +227,12 @@ def : InstAlias<"frcsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 2>; def : InstAlias<"fscsr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs)>; def : InstAlias<"fscsr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 2>; +// frsr, fssr are obsolete aliases replaced by frcsr, fscsr, so give them +// zero weight. +def : InstAlias<"frsr $rd", (CSRRS GPR:$rd, FCSR.Encoding, X0), 0>; +def : InstAlias<"fssr $rd, $rs", (CSRRW GPR:$rd, FCSR.Encoding, GPR:$rs), 0>; +def : InstAlias<"fssr $rs", (CSRRW X0, FCSR.Encoding, GPR:$rs), 0>; + def : InstAlias<"frrm $rd", (CSRRS GPR:$rd, FRM.Encoding, X0), 2>; def : InstAlias<"fsrm $rd, $rs", (CSRRW GPR:$rd, FRM.Encoding, GPR:$rs)>; def : InstAlias<"fsrm $rs", (CSRRW X0, FRM.Encoding, GPR:$rs), 2>; diff --git a/test/MC/RISCV/rvf-aliases-valid.s b/test/MC/RISCV/rvf-aliases-valid.s index 725dbe6d6a2c..0d8179ff31f9 100644 --- a/test/MC/RISCV/rvf-aliases-valid.s +++ b/test/MC/RISCV/rvf-aliases-valid.s @@ -55,6 +55,18 @@ fscsr x6, x7 # CHECK-ALIAS: fscsr t3 fscsr x28 +# These are obsolete aliases of frcsr/fscsr. They are accepted by the assembler +# but the disassembler should always print them as the equivalent, new aliases. +# CHECK-INST: csrrs t4, fcsr, zero +# CHECK-ALIAS: frcsr t4 +frsr x29 +# CHECK-INST: csrrw t5, fcsr, t6 +# CHECK-ALIAS: fscsr t5, t6 +fssr x30, x31 +# CHECK-INST: csrrw zero, fcsr, s0 +# CHECK-ALIAS: fscsr s0 +fssr x8 + # CHECK-INST: csrrs t4, frm, zero # CHECK-ALIAS: frrm t4 frrm x29 From 4ddc21624e8f4fdad6375a770ac86015bca70a65 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Thu, 3 Oct 2019 15:53:50 +0000 Subject: [PATCH 66/82] Revert "[Alignment][NFC] Allow constexpr Align" This reverts commit b3af236fb5fc6e50fcc1b54d868f0bff557f3fb1. 
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373619 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Support/Alignment.h | 26 ++++------------------ include/llvm/Support/MathExtras.h | 9 -------- lib/Target/AArch64/AArch64StackTagging.cpp | 2 +- unittests/Support/AlignmentTest.cpp | 10 --------- unittests/Support/MathExtrasTest.cpp | 19 ---------------- 5 files changed, 5 insertions(+), 61 deletions(-) diff --git a/include/llvm/Support/Alignment.h b/include/llvm/Support/Alignment.h index f94d7cdc9a14..3d8a4235b0e6 100644 --- a/include/llvm/Support/Alignment.h +++ b/include/llvm/Support/Alignment.h @@ -58,10 +58,10 @@ struct Align { constexpr Align() = default; /// Do not perform checks in case of copy/move construct/assign, because the /// checks have been performed when building `Other`. - constexpr Align(const Align &Other) = default; - constexpr Align &operator=(const Align &Other) = default; - constexpr Align(Align &&Other) = default; - constexpr Align &operator=(Align &&Other) = default; + Align(const Align &Other) = default; + Align &operator=(const Align &Other) = default; + Align(Align &&Other) = default; + Align &operator=(Align &&Other) = default; explicit Align(uint64_t Value) { assert(Value > 0 && "Value must not be 0"); @@ -80,24 +80,6 @@ struct Align { /// would be better than /// `if (A > Align(1))` constexpr static const Align None() { return Align(); } - - /// This function is useful when initializing constexpr Align constants. - /// e.g. static constexpr Align kAlign16 = Align::Constant<16>(); - /// Most compilers (clang, gcc, icc) will be able to compute `ShiftValue` - /// at compile time with `Align::Align(uint64_t Value)` but to be - /// able to use Align as a constexpr constant use this method. - /// FIXME: When LLVM is C++17 ready `Align::Align(uint64_t Value)` - /// can be constexpr and we can dispatch between runtime (Log2_64) vs - /// compile time (CTLog2) versions using constexpr-if. Then this - /// function is no more necessary and we can add user defined literals - /// for convenience. - template constexpr static Align Constant() { - static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), - "Not a valid alignment"); - Align A; - A.ShiftValue = CTLog2(); - return A; - } }; /// Treats the value 0 as a 1, so Align is always at least 1. diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h index 9570ae67a9d1..775d19a698f4 100644 --- a/include/llvm/Support/MathExtras.h +++ b/include/llvm/Support/MathExtras.h @@ -532,15 +532,6 @@ inline double Log2(double Value) { #endif } -/// Return the compile time log base 2 of the specified Value. -/// `kValue` has to be a power of two. -template static constexpr inline uint8_t CTLog2() { - static_assert(kValue > 0 && llvm::isPowerOf2_64(kValue), - "Value is not a valid power of 2"); - return 1 + CTLog2(); -} -template <> constexpr inline uint8_t CTLog2<1>() { return 0; } - /// Return the floor log base 2 of the specified value, -1 if the value is zero. /// (32 bit edition.) /// Ex. 
Log2_32(32) == 5, Log2_32(1) == 0, Log2_32(0) == -1, Log2_32(6) == 2 diff --git a/lib/Target/AArch64/AArch64StackTagging.cpp b/lib/Target/AArch64/AArch64StackTagging.cpp index 55c7afbd69f7..0c52711a8d7e 100644 --- a/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/lib/Target/AArch64/AArch64StackTagging.cpp @@ -62,7 +62,7 @@ static cl::opt ClMergeInit( static cl::opt ClScanLimit("stack-tagging-merge-init-scan-limit", cl::init(40), cl::Hidden); -static constexpr Align kTagGranuleSize = Align::Constant<16>(); +static const Align kTagGranuleSize = Align(16); namespace { diff --git a/unittests/Support/AlignmentTest.cpp b/unittests/Support/AlignmentTest.cpp index 3d35a612b832..0b1435912b93 100644 --- a/unittests/Support/AlignmentTest.cpp +++ b/unittests/Support/AlignmentTest.cpp @@ -44,16 +44,6 @@ TEST(AlignmentTest, ValidCTors) { } } -TEST(AlignmentTest, CompileTimeConstant) { - EXPECT_EQ(Align::Constant<1>(), Align(1)); - EXPECT_EQ(Align::Constant<2>(), Align(2)); - EXPECT_EQ(Align::Constant<4>(), Align(4)); - EXPECT_EQ(Align::Constant<8>(), Align(8)); - EXPECT_EQ(Align::Constant<16>(), Align(16)); - EXPECT_EQ(Align::Constant<32>(), Align(32)); - EXPECT_EQ(Align::Constant<64>(), Align(64)); -} - TEST(AlignmentTest, CheckMaybeAlignHasValue) { EXPECT_TRUE(MaybeAlign(1)); EXPECT_TRUE(MaybeAlign(1).hasValue()); diff --git a/unittests/Support/MathExtrasTest.cpp b/unittests/Support/MathExtrasTest.cpp index 00d037ad110b..01c83c9e14d3 100644 --- a/unittests/Support/MathExtrasTest.cpp +++ b/unittests/Support/MathExtrasTest.cpp @@ -203,25 +203,6 @@ TEST(MathExtras, PowerOf2Floor) { EXPECT_EQ(4U, PowerOf2Floor(7U)); } -TEST(MathExtras, CTLog2) { - EXPECT_EQ(CTLog2<1ULL << 0>(), 0); - EXPECT_EQ(CTLog2<1ULL << 1>(), 1); - EXPECT_EQ(CTLog2<1ULL << 2>(), 2); - EXPECT_EQ(CTLog2<1ULL << 3>(), 3); - EXPECT_EQ(CTLog2<1ULL << 4>(), 4); - EXPECT_EQ(CTLog2<1ULL << 5>(), 5); - EXPECT_EQ(CTLog2<1ULL << 6>(), 6); - EXPECT_EQ(CTLog2<1ULL << 7>(), 7); - EXPECT_EQ(CTLog2<1ULL << 8>(), 8); - EXPECT_EQ(CTLog2<1ULL << 9>(), 9); - EXPECT_EQ(CTLog2<1ULL << 10>(), 10); - EXPECT_EQ(CTLog2<1ULL << 11>(), 11); - EXPECT_EQ(CTLog2<1ULL << 12>(), 12); - EXPECT_EQ(CTLog2<1ULL << 13>(), 13); - EXPECT_EQ(CTLog2<1ULL << 14>(), 14); - EXPECT_EQ(CTLog2<1ULL << 15>(), 15); -} - TEST(MathExtras, ByteSwap_32) { EXPECT_EQ(0x44332211u, ByteSwap_32(0x11223344)); EXPECT_EQ(0xDDCCBBAAu, ByteSwap_32(0xAABBCCDD)); From b45f56658d16662ff0f39a95cc477ddbeba1a0dc Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Thu, 3 Oct 2019 16:30:29 +0000 Subject: [PATCH 67/82] [BPF] Handle offset reloc endpoint ending in the middle of chain properly During studying support for bitfield, I found an issue for an example like the one in test offset-reloc-middle-chain.ll. struct t1 { int c; }; struct s1 { struct t1 b; }; struct r1 { struct s1 a; }; #define _(x) __builtin_preserve_access_index(x) void test1(void *p1, void *p2, void *p3); void test(struct r1 *arg) { struct s1 *ps = _(&arg->a); struct t1 *pt = _(&arg->a.b); int *pi = _(&arg->a.b.c); test1(ps, pt, pi); } The IR looks like: %0 = llvm.preserve.struct.access(base, ...) %1 = llvm.preserve.struct.access(%0, ...) %2 = llvm.preserve.struct.access(%1, ...) using %0, %1 and %2 In this case, we need to generate three relocatiions corresponding to chains: (%0), (%0, %1) and (%0, %1, %2). After collecting all the chains, the current implementation process each chain (in a map) with code generation sequentially. 
For example, after (%0) is processed, the code may look like: %0 = base + special_global_variable // llvm.preserve.struct.access(base, ...) is delisted // from the instruction stream. %1 = llvm.preserve.struct.access(%0, ...) %2 = llvm.preserve.struct.access(%1, ...) using %0, %1 and %2 When processing chain (%0, %1), the current implementation tries to visit intrinsic llvm.preserve.struct.access(base, ...) to get some of its properties and this caused segfault. This patch fixed the issue by remembering all necessary information (kind, metadata, access_index, base) during analysis phase, so in code generation phase there is no need to examine the intrinsic call instructions. This also simplifies the code. Differential Revision: https://reviews.llvm.org/D68389 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373621 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/BPF/BPFAbstractMemberAccess.cpp | 218 ++++++++---------- .../BPF/CORE/offset-reloc-middle-chain.ll | 127 ++++++++++ 2 files changed, 227 insertions(+), 118 deletions(-) create mode 100644 test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll diff --git a/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/lib/Target/BPF/BPFAbstractMemberAccess.cpp index c682c46fe316..870300ab2b25 100644 --- a/lib/Target/BPF/BPFAbstractMemberAccess.cpp +++ b/lib/Target/BPF/BPFAbstractMemberAccess.cpp @@ -90,6 +90,13 @@ class BPFAbstractMemberAccess final : public ModulePass { static char ID; BPFAbstractMemberAccess() : ModulePass(ID) {} + struct CallInfo { + uint32_t Kind; + uint32_t AccessIndex; + MDNode *Metadata; + Value *Base; + }; + private: enum : uint32_t { BPFPreserveArrayAI = 1, @@ -99,34 +106,32 @@ class BPFAbstractMemberAccess final : public ModulePass { std::map GEPGlobals; // A map to link preserve_*_access_index instrinsic calls. - std::map> AIChain; + std::map> AIChain; // A map to hold all the base preserve_*_access_index instrinsic calls. // The base call is not an input of any other preserve_*_access_index // intrinsics. 
- std::map BaseAICalls; + std::map BaseAICalls; bool doTransformation(Module &M); - void traceAICall(CallInst *Call, uint32_t Kind, const MDNode *ParentMeta, - uint32_t ParentAI); - void traceBitCast(BitCastInst *BitCast, CallInst *Parent, uint32_t Kind, - const MDNode *ParentMeta, uint32_t ParentAI); - void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, uint32_t Kind, - const MDNode *ParentMeta, uint32_t ParentAI); + void traceAICall(CallInst *Call, CallInfo &ParentInfo); + void traceBitCast(BitCastInst *BitCast, CallInst *Parent, + CallInfo &ParentInfo); + void traceGEP(GetElementPtrInst *GEP, CallInst *Parent, + CallInfo &ParentInfo); void collectAICallChains(Module &M, Function &F); - bool IsPreserveDIAccessIndexCall(const CallInst *Call, uint32_t &Kind, - const MDNode *&TypeMeta, uint32_t &AccessIndex); + bool IsPreserveDIAccessIndexCall(const CallInst *Call, CallInfo &Cinfo); bool IsValidAIChain(const MDNode *ParentMeta, uint32_t ParentAI, const MDNode *ChildMeta); bool removePreserveAccessIndexIntrinsic(Module &M); void replaceWithGEP(std::vector &CallList, uint32_t NumOfZerosIndex, uint32_t DIIndex); - Value *computeBaseAndAccessKey(CallInst *Call, std::string &AccessKey, - uint32_t Kind, MDNode *&BaseMeta); - bool getAccessIndex(const Value *IndexValue, uint64_t &AccessIndex); - bool transformGEPChain(Module &M, CallInst *Call, uint32_t Kind); + Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo, + std::string &AccessKey, MDNode *&BaseMeta); + uint64_t getConstant(const Value *IndexValue); + bool transformGEPChain(Module &M, CallInst *Call, CallInfo &CInfo); }; } // End anonymous namespace @@ -192,9 +197,7 @@ static uint32_t calcArraySize(const DICompositeType *CTy, uint32_t StartDim) { /// Check whether a call is a preserve_*_access_index intrinsic call or not. 
 bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
-                                                          uint32_t &Kind,
-                                                          const MDNode *&TypeMeta,
-                                                          uint32_t &AccessIndex) {
+                                                          CallInfo &CInfo) {
   if (!Call)
     return false;
 
@@ -202,30 +205,30 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
   if (!GV)
     return false;
 
   if (GV->getName().startswith("llvm.preserve.array.access.index")) {
-    Kind = BPFPreserveArrayAI;
-    TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index);
-    if (!TypeMeta)
+    CInfo.Kind = BPFPreserveArrayAI;
+    CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+    if (!CInfo.Metadata)
       report_fatal_error("Missing metadata for llvm.preserve.array.access.index intrinsic");
-    AccessIndex = cast<ConstantInt>(Call->getArgOperand(2))
-                      ->getZExtValue();
+    CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
+    CInfo.Base = Call->getArgOperand(0);
     return true;
   }
 
   if (GV->getName().startswith("llvm.preserve.union.access.index")) {
-    Kind = BPFPreserveUnionAI;
-    TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index);
-    if (!TypeMeta)
+    CInfo.Kind = BPFPreserveUnionAI;
+    CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+    if (!CInfo.Metadata)
       report_fatal_error("Missing metadata for llvm.preserve.union.access.index intrinsic");
-    AccessIndex = cast<ConstantInt>(Call->getArgOperand(1))
-                      ->getZExtValue();
+    CInfo.AccessIndex = getConstant(Call->getArgOperand(1));
+    CInfo.Base = Call->getArgOperand(0);
     return true;
   }
 
   if (GV->getName().startswith("llvm.preserve.struct.access.index")) {
-    Kind = BPFPreserveStructAI;
-    TypeMeta = Call->getMetadata(LLVMContext::MD_preserve_access_index);
-    if (!TypeMeta)
+    CInfo.Kind = BPFPreserveStructAI;
+    CInfo.Metadata = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+    if (!CInfo.Metadata)
       report_fatal_error("Missing metadata for llvm.preserve.struct.access.index intrinsic");
-    AccessIndex = cast<ConstantInt>(Call->getArgOperand(2))
-                      ->getZExtValue();
+    CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
+    CInfo.Base = Call->getArgOperand(0);
     return true;
   }
 
@@ -238,8 +241,7 @@ void BPFAbstractMemberAccess::replaceWithGEP(std::vector<CallInst *> &CallList,
   for (auto Call : CallList) {
     uint32_t Dimension = 1;
     if (DimensionIndex > 0)
-      Dimension = cast<ConstantInt>(Call->getArgOperand(DimensionIndex))
-                      ->getZExtValue();
+      Dimension = getConstant(Call->getArgOperand(DimensionIndex));
 
     Constant *Zero =
         ConstantInt::get(Type::getInt32Ty(Call->getParent()->getContext()), 0);
@@ -265,16 +267,14 @@ bool BPFAbstractMemberAccess::removePreserveAccessIndexIntrinsic(Module &M) {
     for (auto &BB : F)
       for (auto &I : BB) {
         auto *Call = dyn_cast<CallInst>(&I);
-        uint32_t Kind;
-        const MDNode *TypeMeta;
-        uint32_t AccessIndex;
-        if (!IsPreserveDIAccessIndexCall(Call, Kind, TypeMeta, AccessIndex))
+        CallInfo CInfo;
+        if (!IsPreserveDIAccessIndexCall(Call, CInfo))
           continue;
 
         Found = true;
-        if (Kind == BPFPreserveArrayAI)
+        if (CInfo.Kind == BPFPreserveArrayAI)
           PreserveArrayIndexCalls.push_back(Call);
-        else if (Kind == BPFPreserveUnionAI)
+        else if (CInfo.Kind == BPFPreserveUnionAI)
           PreserveUnionIndexCalls.push_back(Call);
         else
           PreserveStructIndexCalls.push_back(Call);
 
@@ -349,99 +349,94 @@ bool BPFAbstractMemberAccess::IsValidAIChain(const MDNode *ParentType,
   return dyn_cast<DICompositeType>(stripQualifiers(Ty)) == CTy;
 }
 
-void BPFAbstractMemberAccess::traceAICall(CallInst *Call, uint32_t Kind,
-                                          const MDNode *ParentMeta,
-                                          uint32_t ParentAI) {
+void BPFAbstractMemberAccess::traceAICall(CallInst *Call,
+                                          CallInfo &ParentInfo) {
   for (User *U : Call->users()) {
     Instruction *Inst = dyn_cast<Instruction>(U);
     if (!Inst)
       continue;
 
     if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
-      traceBitCast(BI, Call, Kind, ParentMeta, ParentAI);
+      traceBitCast(BI, Call, ParentInfo);
     } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
-      uint32_t CIKind;
-      const MDNode *ChildMeta;
-      uint32_t ChildAI;
-      if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) &&
-          IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) {
-        AIChain[CI] = std::make_pair(Call, Kind);
-        traceAICall(CI, CIKind, ChildMeta, ChildAI);
+      CallInfo ChildInfo;
+
+      if (IsPreserveDIAccessIndexCall(CI, ChildInfo) &&
+          IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex,
+                         ChildInfo.Metadata)) {
+        AIChain[CI] = std::make_pair(Call, ParentInfo);
+        traceAICall(CI, ChildInfo);
       } else {
-        BaseAICalls[Call] = Kind;
+        BaseAICalls[Call] = ParentInfo;
       }
     } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
      if (GI->hasAllZeroIndices())
-        traceGEP(GI, Call, Kind, ParentMeta, ParentAI);
+        traceGEP(GI, Call, ParentInfo);
      else
-        BaseAICalls[Call] = Kind;
+        BaseAICalls[Call] = ParentInfo;
    } else {
-      BaseAICalls[Call] = Kind;
+      BaseAICalls[Call] = ParentInfo;
    }
  }
}

 void BPFAbstractMemberAccess::traceBitCast(BitCastInst *BitCast,
-                                           CallInst *Parent, uint32_t Kind,
-                                           const MDNode *ParentMeta,
-                                           uint32_t ParentAI) {
+                                           CallInst *Parent,
+                                           CallInfo &ParentInfo) {
   for (User *U : BitCast->users()) {
     Instruction *Inst = dyn_cast<Instruction>(U);
     if (!Inst)
       continue;
 
     if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
-      traceBitCast(BI, Parent, Kind, ParentMeta, ParentAI);
+      traceBitCast(BI, Parent, ParentInfo);
     } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
-      uint32_t CIKind;
-      const MDNode *ChildMeta;
-      uint32_t ChildAI;
-      if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) &&
-          IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) {
-        AIChain[CI] = std::make_pair(Parent, Kind);
-        traceAICall(CI, CIKind, ChildMeta, ChildAI);
+      CallInfo ChildInfo;
+      if (IsPreserveDIAccessIndexCall(CI, ChildInfo) &&
+          IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex,
+                         ChildInfo.Metadata)) {
+        AIChain[CI] = std::make_pair(Parent, ParentInfo);
+        traceAICall(CI, ChildInfo);
       } else {
-        BaseAICalls[Parent] = Kind;
+        BaseAICalls[Parent] = ParentInfo;
       }
     } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
       if (GI->hasAllZeroIndices())
-        traceGEP(GI, Parent, Kind, ParentMeta, ParentAI);
+        traceGEP(GI, Parent, ParentInfo);
       else
-        BaseAICalls[Parent] = Kind;
+        BaseAICalls[Parent] = ParentInfo;
     } else {
-      BaseAICalls[Parent] = Kind;
+      BaseAICalls[Parent] = ParentInfo;
     }
   }
 }
 
 void BPFAbstractMemberAccess::traceGEP(GetElementPtrInst *GEP, CallInst *Parent,
-                                       uint32_t Kind, const MDNode *ParentMeta,
-                                       uint32_t ParentAI) {
+                                       CallInfo &ParentInfo) {
   for (User *U : GEP->users()) {
     Instruction *Inst = dyn_cast<Instruction>(U);
     if (!Inst)
       continue;
 
     if (auto *BI = dyn_cast<BitCastInst>(Inst)) {
-      traceBitCast(BI, Parent, Kind, ParentMeta, ParentAI);
+      traceBitCast(BI, Parent, ParentInfo);
     } else if (auto *CI = dyn_cast<CallInst>(Inst)) {
-      uint32_t CIKind;
-      const MDNode *ChildMeta;
-      uint32_t ChildAI;
-      if (IsPreserveDIAccessIndexCall(CI, CIKind, ChildMeta, ChildAI) &&
-          IsValidAIChain(ParentMeta, ParentAI, ChildMeta)) {
-        AIChain[CI] = std::make_pair(Parent, Kind);
-        traceAICall(CI, CIKind, ChildMeta, ChildAI);
+      CallInfo ChildInfo;
+      if (IsPreserveDIAccessIndexCall(CI, ChildInfo) &&
+          IsValidAIChain(ParentInfo.Metadata, ParentInfo.AccessIndex,
+                         ChildInfo.Metadata)) {
+        AIChain[CI] = std::make_pair(Parent, ParentInfo);
+        traceAICall(CI, ChildInfo);
       } else {
-        BaseAICalls[Parent] = Kind;
+        BaseAICalls[Parent] = ParentInfo;
       }
     } else if (auto *GI = dyn_cast<GetElementPtrInst>(Inst)) {
       if (GI->hasAllZeroIndices())
-        traceGEP(GI, Parent, Kind, ParentMeta, ParentAI);
+        traceGEP(GI, Parent, ParentInfo);
       else
-        BaseAICalls[Parent] = Kind;
+        BaseAICalls[Parent] = ParentInfo;
     } else {
-      BaseAICalls[Parent] = Kind;
+      BaseAICalls[Parent] = ParentInfo;
     }
   }
 }
 
@@ -452,44 +447,37 @@ void BPFAbstractMemberAccess::collectAICallChains(Module &M, Function &F) {
 
   for (auto &BB : F)
     for (auto &I : BB) {
-      uint32_t Kind;
-      const MDNode *TypeMeta;
-      uint32_t AccessIndex;
+      CallInfo CInfo;
       auto *Call = dyn_cast<CallInst>(&I);
-      if (!IsPreserveDIAccessIndexCall(Call, Kind, TypeMeta, AccessIndex) ||
+      if (!IsPreserveDIAccessIndexCall(Call, CInfo) ||
           AIChain.find(Call) != AIChain.end())
         continue;
 
-      traceAICall(Call, Kind, TypeMeta, AccessIndex);
+      traceAICall(Call, CInfo);
     }
 }
 
-/// Get access index from the preserve_*_access_index intrinsic calls.
-bool BPFAbstractMemberAccess::getAccessIndex(const Value *IndexValue,
-                                             uint64_t &AccessIndex) {
+uint64_t BPFAbstractMemberAccess::getConstant(const Value *IndexValue) {
   const ConstantInt *CV = dyn_cast<ConstantInt>(IndexValue);
-  if (!CV)
-    return false;
-
-  AccessIndex = CV->getValue().getZExtValue();
-  return true;
+  assert(CV);
+  return CV->getValue().getZExtValue();
 }
 
 /// Compute the base of the whole preserve_*_access_index chains, i.e., the base
 /// pointer of the first preserve_*_access_index call, and construct the access
 /// string, which will be the name of a global variable.
 Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
+                                                        CallInfo &CInfo,
                                                         std::string &AccessKey,
-                                                        uint32_t Kind,
                                                         MDNode *&TypeMeta) {
   Value *Base = nullptr;
   std::string TypeName;
-  std::stack<std::pair<CallInst *, uint32_t>> CallStack;
+  std::stack<std::pair<CallInst *, CallInfo>> CallStack;
 
   // Put the access chain into a stack with the top as the head of the chain.
   while (Call) {
-    CallStack.push(std::make_pair(Call, Kind));
-    Kind = AIChain[Call].second;
+    CallStack.push(std::make_pair(Call, CInfo));
+    CInfo = AIChain[Call].second;
     Call = AIChain[Call].first;
   }
 
@@ -508,14 +496,14 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
   while (CallStack.size()) {
     auto StackElem = CallStack.top();
     Call = StackElem.first;
-    Kind = StackElem.second;
+    CInfo = StackElem.second;
 
     if (!Base)
-      Base = Call->getArgOperand(0);
+      Base = CInfo.Base;
 
-    MDNode *MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index);
-    DIType *Ty = stripQualifiers(cast<DIType>(MDN));
-    if (Kind == BPFPreserveUnionAI || Kind == BPFPreserveStructAI) {
+    DIType *Ty = stripQualifiers(cast<DIType>(CInfo.Metadata));
+    if (CInfo.Kind == BPFPreserveUnionAI ||
+        CInfo.Kind == BPFPreserveStructAI) {
       // struct or union type
       TypeName = Ty->getName();
       TypeMeta = Ty;
@@ -527,9 +515,7 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
     CallStack.pop();
 
     // BPFPreserveArrayAI
-    uint64_t AccessIndex;
-    if (!getAccessIndex(Call->getArgOperand(2), AccessIndex))
-      return nullptr;
+    uint64_t AccessIndex = CInfo.AccessIndex;
 
     DIType *BaseTy = nullptr;
     bool CheckElemType = false;
@@ -580,18 +566,14 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
   // and access key construction.
   while (CallStack.size()) {
     auto StackElem = CallStack.top();
-    Call = StackElem.first;
-    Kind = StackElem.second;
+    CInfo = StackElem.second;
     CallStack.pop();
 
     // Access Index
-    uint64_t AccessIndex;
-    uint32_t ArgIndex = (Kind == BPFPreserveUnionAI) ?
1 : 2; - if (!getAccessIndex(Call->getArgOperand(ArgIndex), AccessIndex)) - return nullptr; + uint64_t AccessIndex = CInfo.AccessIndex; AccessKey += ":" + std::to_string(AccessIndex); - MDNode *MDN = Call->getMetadata(LLVMContext::MD_preserve_access_index); + MDNode *MDN = CInfo.Metadata; // At this stage, it cannot be pointer type. auto *CTy = cast(stripQualifiers(cast(MDN))); uint32_t Tag = CTy->getTag(); @@ -615,11 +597,11 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call, /// Call/Kind is the base preserve_*_access_index() call. Attempts to do /// transformation to a chain of relocable GEPs. bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call, - uint32_t Kind) { + CallInfo &CInfo) { std::string AccessKey; MDNode *TypeMeta; Value *Base = - computeBaseAndAccessKey(Call, AccessKey, Kind, TypeMeta); + computeBaseAndAccessKey(Call, CInfo, AccessKey, TypeMeta); if (!Base) return false; diff --git a/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll b/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll new file mode 100644 index 000000000000..0f75cd812421 --- /dev/null +++ b/test/CodeGen/BPF/CORE/offset-reloc-middle-chain.ll @@ -0,0 +1,127 @@ +; RUN: llc -march=bpfel -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; RUN: llc -march=bpfeb -filetype=asm -o - %s | FileCheck -check-prefixes=CHECK %s +; Source code: +; struct t1 { +; int c; +; }; +; struct s1 { +; struct t1 b; +; }; +; struct r1 { +; struct s1 a; +; }; +; #define _(x) __builtin_preserve_access_index(x) +; void test1(void *p1, void *p2, void *p3); +; void test(struct r1 *arg) { +; struct s1 *ps = _(&arg->a); +; struct t1 *pt = _(&arg->a.b); +; int *pi = _(&arg->a.b.c); +; test1(ps, pt, pi); +; } +; Compilation flag: +; clang -target bpf -O2 -g -S -emit-llvm test.c + +%struct.r1 = type { %struct.s1 } +%struct.s1 = type { %struct.t1 } +%struct.t1 = type { i32 } + +; Function Attrs: nounwind +define dso_local void @test(%struct.r1* %arg) local_unnamed_addr #0 !dbg !7 { +entry: + call void @llvm.dbg.value(metadata %struct.r1* %arg, metadata !22, metadata !DIExpression()), !dbg !29 + %0 = tail call %struct.s1* @llvm.preserve.struct.access.index.p0s_struct.s1s.p0s_struct.r1s(%struct.r1* %arg, i32 0, i32 0), !dbg !30, !llvm.preserve.access.index !11 + call void @llvm.dbg.value(metadata %struct.s1* %0, metadata !23, metadata !DIExpression()), !dbg !29 + %1 = tail call %struct.t1* @llvm.preserve.struct.access.index.p0s_struct.t1s.p0s_struct.s1s(%struct.s1* %0, i32 0, i32 0), !dbg !31, !llvm.preserve.access.index !14 + call void @llvm.dbg.value(metadata %struct.t1* %1, metadata !25, metadata !DIExpression()), !dbg !29 + %2 = tail call i32* @llvm.preserve.struct.access.index.p0i32.p0s_struct.t1s(%struct.t1* %1, i32 0, i32 0), !dbg !32, !llvm.preserve.access.index !17 + call void @llvm.dbg.value(metadata i32* %2, metadata !27, metadata !DIExpression()), !dbg !29 + %3 = bitcast %struct.s1* %0 to i8*, !dbg !33 + %4 = bitcast %struct.t1* %1 to i8*, !dbg !34 + %5 = bitcast i32* %2 to i8*, !dbg !35 + tail call void @test1(i8* %3, i8* %4, i8* %5) #4, !dbg !36 + ret void, !dbg !37 +} + +; CHECK: .long 1 # BTF_KIND_STRUCT(id = 2) + +; CHECK: .ascii "r1" # string offset=1 +; CHECK: .ascii ".text" # string offset=29 +; CHECK: .ascii "0:0" # string offset=72 +; CHECK: .ascii "0:0:0" # string offset=76 +; CHECK: .ascii "0:0:0:0" # string offset=82 + +; CHECK: .long 12 # OffsetReloc +; CHECK-NEXT: .long 29 # Offset reloc section string offset=29 +; CHECK-NEXT: .long 3 +; CHECK_NEXT: .long 
.Ltmp{{[0-9]+}} +; CHECK_NEXT: .long 2 +; CHECK_NEXT: .long 72 +; CHECK_NEXT: .long .Ltmp{{[0-9]+}} +; CHECK_NEXT: .long 2 +; CHECK_NEXT: .long 76 +; CHECK_NEXT: .long .Ltmp{{[0-9]+}} +; CHECK_NEXT: .long 2 +; CHECK_NEXT: .long 82 + +; Function Attrs: nounwind readnone +declare %struct.s1* @llvm.preserve.struct.access.index.p0s_struct.s1s.p0s_struct.r1s(%struct.r1*, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare %struct.t1* @llvm.preserve.struct.access.index.p0s_struct.t1s.p0s_struct.s1s(%struct.s1*, i32, i32) #1 + +; Function Attrs: nounwind readnone +declare i32* @llvm.preserve.struct.access.index.p0i32.p0s_struct.t1s(%struct.t1*, i32, i32) #1 + +declare dso_local void @test1(i8*, i8*, i8*) local_unnamed_addr #2 + +; Function Attrs: nounwind readnone speculatable willreturn +declare void @llvm.dbg.value(metadata, metadata, metadata) #3 + +attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #1 = { nounwind readnone } +attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #3 = { nounwind readnone speculatable willreturn } +attributes #4 = { nounwind } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 10.0.0 (https://github.com/llvm/llvm-project.git 42b3328a2368b38fba6bdb0c616fe6c5520e3bc5)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, nameTableKind: None) +!1 = !DIFile(filename: "test.c", directory: "/tmp/home/yhs/work/tests/core") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 1, !"wchar_size", i32 4} +!6 = !{!"clang version 10.0.0 (https://github.com/llvm/llvm-project.git 42b3328a2368b38fba6bdb0c616fe6c5520e3bc5)"} +!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 12, type: !8, scopeLine: 12, flags: DIFlagPrototyped, isDefinition: true, isOptimized: true, unit: !0, retainedNodes: !21) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "r1", file: !1, line: 7, size: 32, elements: !12) +!12 = !{!13} +!13 = !DIDerivedType(tag: DW_TAG_member, name: "a", scope: !11, file: !1, line: 8, baseType: !14, size: 32) +!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s1", file: !1, line: 4, size: 32, elements: !15) +!15 = !{!16} +!16 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !14, file: !1, line: 5, baseType: !17, size: 32) +!17 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "t1", file: !1, line: 1, size: 32, elements: !18) +!18 = !{!19} +!19 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !17, file: !1, line: 2, baseType: !20, size: 32) +!20 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!21 = 
!{!22, !23, !25, !27} +!22 = !DILocalVariable(name: "arg", arg: 1, scope: !7, file: !1, line: 12, type: !10) +!23 = !DILocalVariable(name: "ps", scope: !7, file: !1, line: 13, type: !24) +!24 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !14, size: 64) +!25 = !DILocalVariable(name: "pt", scope: !7, file: !1, line: 14, type: !26) +!26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !17, size: 64) +!27 = !DILocalVariable(name: "pi", scope: !7, file: !1, line: 15, type: !28) +!28 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !20, size: 64) +!29 = !DILocation(line: 0, scope: !7) +!30 = !DILocation(line: 13, column: 19, scope: !7) +!31 = !DILocation(line: 14, column: 19, scope: !7) +!32 = !DILocation(line: 15, column: 13, scope: !7) +!33 = !DILocation(line: 16, column: 9, scope: !7) +!34 = !DILocation(line: 16, column: 13, scope: !7) +!35 = !DILocation(line: 16, column: 17, scope: !7) +!36 = !DILocation(line: 16, column: 3, scope: !7) +!37 = !DILocation(line: 17, column: 1, scope: !7) From 6bf280d1d27cdb2ed56dde02390de5a64cd5e9bb Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 3 Oct 2019 16:34:41 +0000 Subject: [PATCH 68/82] [dsymutil] Tablegenify option parsing This patch reimplements command line option parsing in dsymutil with Tablegen and libOption. The main motivation for this change is to prevent clashes with other cl::opt options defined in llvm. Although it's a bit more heavyweight, it has some nice advantages such as no global static initializers and better separation between the code and the option definitions. I also used this opportunity to improve how dsymutil deals with incompatible options. Instead of having checks spread across the code, everything is now grouped together in verifyOptions. The fact that the options are no longer global means that we need to pass them around a bit more, but I think it's worth the trade-off. Differential revision: https://reviews.llvm.org/D68361 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373622 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/tools/dsymutil/cmdline.test | 14 +- tools/dsymutil/CMakeLists.txt | 6 + tools/dsymutil/Options.td | 146 ++++++++ tools/dsymutil/dsymutil.cpp | 588 +++++++++++++++---------------- 4 files changed, 449 insertions(+), 305 deletions(-) create mode 100644 tools/dsymutil/Options.td diff --git a/test/tools/dsymutil/cmdline.test b/test/tools/dsymutil/cmdline.test index 60a1a0a2d10f..7e9223c94019 100644 --- a/test/tools/dsymutil/cmdline.test +++ b/test/tools/dsymutil/cmdline.test @@ -1,21 +1,19 @@ RUN: dsymutil -help 2>&1 | FileCheck --check-prefix=HELP %s HELP: OVERVIEW: manipulate archived DWARF debug symbol files. 
-HELP: USAGE: dsymutil{{[^ ]*}} [options] +HELP: USAGE: {{.*}}dsymutil{{[^ ]*}} [options] HELP-NOT: -reverse-iterate -HELP: Color Options -HELP: -color -HELP: Specific Options: +HELP: Dsymutil Options: HELP: -accelerator -HELP: -arch= +HELP: -arch HELP: -dump-debug-map HELP: -flat HELP: -minimize HELP: -no-odr HELP: -no-output HELP: -no-swiftmodule-timestamp -HELP: -num-threads= -HELP: -o= -HELP: -oso-prepend-path= +HELP: -num-threads +HELP: -oso-prepend-path +HELP: -o HELP: -papertrail HELP: -symbol-map HELP: -symtab diff --git a/tools/dsymutil/CMakeLists.txt b/tools/dsymutil/CMakeLists.txt index 19865e3d20e1..f88e6db62c38 100644 --- a/tools/dsymutil/CMakeLists.txt +++ b/tools/dsymutil/CMakeLists.txt @@ -1,3 +1,7 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(DsymutilTableGen) + set(LLVM_LINK_COMPONENTS AllTargetsAsmPrinters AllTargetsCodeGens @@ -7,6 +11,7 @@ set(LLVM_LINK_COMPONENTS DebugInfoDWARF MC Object + Option Support Target ) @@ -27,6 +32,7 @@ add_llvm_tool(dsymutil DEPENDS intrinsics_gen + ${tablegen_deps} ) if(APPLE) diff --git a/tools/dsymutil/Options.td b/tools/dsymutil/Options.td new file mode 100644 index 000000000000..c2114c86a1a3 --- /dev/null +++ b/tools/dsymutil/Options.td @@ -0,0 +1,146 @@ +include "llvm/Option/OptParser.td" + +class F: Flag<["--", "-"], name>; + +def grp_general : OptionGroup<"Dsymutil">, HelpText<"Dsymutil Options">; + +def help: F<"help">, + HelpText<"Prints this help output.">, + Group; +def: Flag<["-"], "h">, + Alias, + HelpText<"Alias for --help">, + Group; + +def version: F<"version">, + HelpText<"Prints the dsymutil version.">, + Group; +def: Flag<["-"], "v">, + Alias, + HelpText<"Alias for --version">, + Group; + +def verbose: F<"verbose">, + HelpText<"Enable verbose mode.">, + Group; + +def verify: F<"verify">, + HelpText<"Run the DWARF verifier on the linked DWARF debug info.">, + Group; + +def no_output: F<"no-output">, + HelpText<"Do the link in memory, but do not emit the result file.">, + Group; + +def no_swiftmodule_timestamp: F<"no-swiftmodule-timestamp">, + HelpText<"Don't check timestamp for swiftmodule files.">, + Group; + +def no_odr: F<"no-odr">, + HelpText<"Do not use ODR (One Definition Rule) for type uniquing.">, + Group; + +def dump_debug_map: F<"dump-debug-map">, + HelpText<"Parse and dump the debug map to standard output. Not DWARF link will take place.">, + Group; + +def yaml_input: F<"y">, + HelpText<"Treat the input file is a YAML debug map rather than a binary.">, + Group; + +def papertrail: F<"papertrail">, + HelpText<"Embed warnings in the linked DWARF debug info.">, + Group; + +def assembly: F<"S">, + HelpText<"Output textual assembly instead of a binary dSYM companion file.">, + Group; + +def symtab: F<"symtab">, + HelpText<"Dumps the symbol table found in executable or object file(s) and exits.">, + Group; +def: Flag<["-"], "s">, + Alias, + HelpText<"Alias for --symtab">, + Group; + +def flat: F<"flat">, + HelpText<"Produce a flat dSYM file (not a bundle).">, + Group; +def: Flag<["-"], "f">, + Alias, + HelpText<"Alias for --flat">, + Group; + +def minimize: F<"minimize">, + HelpText<"When used when creating a dSYM file with Apple accelerator tables, " + "this option will suppress the emission of the .debug_inlines, " + ".debug_pubnames, and .debug_pubtypes sections since dsymutil " + "has better equivalents: .apple_names and .apple_types. 
When used in " + "conjunction with --update option, this option will cause redundant " + "accelerator tables to be removed.">, + Group; +def: Flag<["-"], "z">, + Alias, + HelpText<"Alias for --minimize">, + Group; + +def update: F<"update">, + HelpText<"Updates existing dSYM files to contain the latest accelerator tables and other DWARF optimizations.">, + Group; +def: Flag<["-"], "u">, + Alias, + HelpText<"Alias for --update">, + Group; + +def output: Separate<["--", "-"], "o">, + MetaVarName<"">, + HelpText<"Specify the output file. Defaults to .dwarf">, + Group; +def: Separate<["-"], "out">, + Alias, + HelpText<"Alias for --o">, + Group; +def: Joined<["--", "-"], "out=">, Alias; +def: Joined<["--", "-"], "o=">, Alias; + +def oso_prepend_path: Separate<["--", "-"], "oso-prepend-path">, + MetaVarName<"">, + HelpText<"Specify a directory to prepend to the paths of object files.">, + Group; +def: Joined<["--", "-"], "oso-prepend-path=">, Alias; + +def symbolmap: Separate<["--", "-"], "symbol-map">, + MetaVarName<"">, + HelpText<"Updates the existing dSYMs inplace using symbol map specified.">, + Group; +def: Joined<["--", "-"], "symbol-map=">, Alias; + +def arch: Separate<["--", "-"], "arch">, + MetaVarName<"">, + HelpText<"Link DWARF debug information only for specified CPU architecture" + "types. This option can be specified multiple times, once for each" + "desired architecture. All CPU architectures will be linked by" + "default.">, + Group; +def: Joined<["--", "-"], "arch=">, Alias; + +def accelerator: Separate<["--", "-"], "accelerator">, + MetaVarName<"">, + HelpText<"Specify the desired type of accelerator table. Valid options are 'Apple', 'Dwarf' and 'Default'">, + Group; +def: Joined<["--", "-"], "accelerator=">, Alias; + +def toolchain: Separate<["--", "-"], "toolchain">, + MetaVarName<"">, + HelpText<"Embed toolchain information in dSYM bundle.">, + Group; + +def threads: Separate<["--", "-"], "num-threads">, + MetaVarName<"">, + HelpText<"Specifies the maximum number of simultaneous threads to use when linking multiple architectures.">, + Group; +def: Separate<["-"], "j">, + Alias, + HelpText<"Alias for --num-threads">, + Group; diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp index bf42ec73269c..6d164c449e5e 100644 --- a/tools/dsymutil/dsymutil.cpp +++ b/tools/dsymutil/dsymutil.cpp @@ -20,12 +20,16 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/Triple.h" #include "llvm/DebugInfo/DIContext.h" #include "llvm/DebugInfo/DWARF/DWARFContext.h" #include "llvm/DebugInfo/DWARF/DWARFVerifier.h" #include "llvm/Object/Binary.h" #include "llvm/Object/MachO.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/InitLLVM.h" @@ -43,137 +47,227 @@ #include using namespace llvm; -using namespace llvm::cl; using namespace llvm::dsymutil; using namespace object; -static OptionCategory DsymCategory("Specific Options"); -static opt Help("h", desc("Alias for -help"), Hidden); -static opt Version("v", desc("Alias for -version"), Hidden); - -static list InputFiles(Positional, OneOrMore, - desc(""), cat(DsymCategory)); - -static opt - OutputFileOpt("o", - desc("Specify the output file. 
default: .dwarf"), - value_desc("filename"), cat(DsymCategory)); -static alias OutputFileOptA("out", desc("Alias for -o"), - aliasopt(OutputFileOpt)); - -static opt OsoPrependPath( - "oso-prepend-path", - desc("Specify a directory to prepend to the paths of object files."), - value_desc("path"), cat(DsymCategory)); - -static opt Assembly( - "S", - desc("Output textual assembly instead of a binary dSYM companion file."), - init(false), cat(DsymCategory), cl::Hidden); - -static opt DumpStab( - "symtab", - desc("Dumps the symbol table found in executable or object file(s) and\n" - "exits."), - init(false), cat(DsymCategory)); -static alias DumpStabA("s", desc("Alias for --symtab"), aliasopt(DumpStab)); - -static opt FlatOut("flat", - desc("Produce a flat dSYM file (not a bundle)."), - init(false), cat(DsymCategory)); -static alias FlatOutA("f", desc("Alias for --flat"), aliasopt(FlatOut)); - -static opt Minimize( - "minimize", - desc("When used when creating a dSYM file with Apple accelerator tables,\n" - "this option will suppress the emission of the .debug_inlines, \n" - ".debug_pubnames, and .debug_pubtypes sections since dsymutil \n" - "has better equivalents: .apple_names and .apple_types. When used in\n" - "conjunction with --update option, this option will cause redundant\n" - "accelerator tables to be removed."), - init(false), cat(DsymCategory)); -static alias MinimizeA("z", desc("Alias for --minimize"), aliasopt(Minimize)); - -static opt Update( - "update", - desc("Updates existing dSYM files to contain the latest accelerator\n" - "tables and other DWARF optimizations."), - init(false), cat(DsymCategory)); -static alias UpdateA("u", desc("Alias for --update"), aliasopt(Update)); - -static opt SymbolMap( - "symbol-map", - desc("Updates the existing dSYMs inplace using symbol map specified."), - value_desc("bcsymbolmap"), cat(DsymCategory)); - -static cl::opt AcceleratorTable( - "accelerator", cl::desc("Output accelerator tables."), - cl::values(clEnumValN(AccelTableKind::Default, "Default", - "Default for input."), - clEnumValN(AccelTableKind::Apple, "Apple", "Apple"), - clEnumValN(AccelTableKind::Dwarf, "Dwarf", "DWARF")), - cl::init(AccelTableKind::Default), cat(DsymCategory)); - -static opt NumThreads( - "num-threads", - desc("Specifies the maximum number (n) of simultaneous threads to use\n" - "when linking multiple architectures."), - value_desc("n"), init(0), cat(DsymCategory)); -static alias NumThreadsA("j", desc("Alias for --num-threads"), - aliasopt(NumThreads)); - -static opt Verbose("verbose", desc("Verbosity level"), init(false), - cat(DsymCategory)); - -static opt - NoOutput("no-output", - desc("Do the link in memory, but do not emit the result file."), - init(false), cat(DsymCategory)); - -static opt - NoTimestamp("no-swiftmodule-timestamp", - desc("Don't check timestamp for swiftmodule files."), - init(false), cat(DsymCategory)); - -static list ArchFlags( - "arch", - desc("Link DWARF debug information only for specified CPU architecture\n" - "types. This option can be specified multiple times, once for each\n" - "desired architecture. All CPU architectures will be linked by\n" - "default."), - value_desc("arch"), ZeroOrMore, cat(DsymCategory)); - -static opt - NoODR("no-odr", - desc("Do not use ODR (One Definition Rule) for type uniquing."), - init(false), cat(DsymCategory)); - -static opt DumpDebugMap( - "dump-debug-map", - desc("Parse and dump the debug map to standard output. 
Not DWARF link " - "will take place."), - init(false), cat(DsymCategory)); - -static opt InputIsYAMLDebugMap( - "y", desc("Treat the input file is a YAML debug map rather than a binary."), - init(false), cat(DsymCategory)); - -static opt Verify("verify", desc("Verify the linked DWARF debug info."), - cat(DsymCategory)); - -static opt - Toolchain("toolchain", desc("Embed toolchain information in dSYM bundle."), - cat(DsymCategory)); - -static opt - PaperTrailWarnings("papertrail", - desc("Embed warnings in the linked DWARF debug info."), - cat(DsymCategory)); - -static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) { - if (NoOutput) - return Error::success(); +namespace { +enum ID { + OPT_INVALID = 0, // This is not an option ID. +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + OPT_##ID, +#include "Options.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Options.inc" +#undef PREFIX + +const opt::OptTable::Info InfoTable[] = { +#define OPTION(PREFIX, NAME, ID, KIND, GROUP, ALIAS, ALIASARGS, FLAGS, PARAM, \ + HELPTEXT, METAVAR, VALUES) \ + { \ + PREFIX, NAME, HELPTEXT, \ + METAVAR, OPT_##ID, opt::Option::KIND##Class, \ + PARAM, FLAGS, OPT_##GROUP, \ + OPT_##ALIAS, ALIASARGS, VALUES}, +#include "Options.inc" +#undef OPTION +}; + +class DsymutilOptTable : public opt::OptTable { +public: + DsymutilOptTable() : OptTable(InfoTable) {} +}; +} // namespace + +struct DsymutilOptions { + bool DumpDebugMap = false; + bool DumpStab = false; + bool Flat = false; + bool InputIsYAMLDebugMap = false; + bool PaperTrailWarnings = false; + bool Verify = false; + std::string SymbolMap; + std::string OutputFile; + std::string Toolchain; + std::vector Archs; + std::vector InputFiles; + unsigned NumThreads; + LinkOptions LinkOptions; +}; + +/// Return a list of input files. This function has logic for dealing with the +/// special case where we might have dSYM bundles as input. The function +/// returns an error when the directory structure doesn't match that of a dSYM +/// bundle. +static Expected> getInputs(opt::InputArgList &Args, + bool DsymAsInput) { + std::vector InputFiles; + for (auto *File : Args.filtered(OPT_INPUT)) + InputFiles.push_back(File->getValue()); + + if (!DsymAsInput) + return InputFiles; + + // If we are updating, we might get dSYM bundles as input. + std::vector Inputs; + for (const auto &Input : InputFiles) { + if (!llvm::sys::fs::is_directory(Input)) { + Inputs.push_back(Input); + continue; + } + + // Make sure that we're dealing with a dSYM bundle. + SmallString<256> BundlePath(Input); + sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); + if (!llvm::sys::fs::is_directory(BundlePath)) + return make_error( + Input + " is a directory, but doesn't look like a dSYM bundle.", + inconvertibleErrorCode()); + + // Create a directory iterator to iterate over all the entries in the + // bundle. + std::error_code EC; + llvm::sys::fs::directory_iterator DirIt(BundlePath, EC); + llvm::sys::fs::directory_iterator DirEnd; + if (EC) + return errorCodeToError(EC); + + // Add each entry to the list of inputs. + while (DirIt != DirEnd) { + Inputs.push_back(DirIt->path()); + DirIt.increment(EC); + if (EC) + return errorCodeToError(EC); + } + } + return Inputs; +} + +// Verify that the given combination of options makes sense. 
+static llvm::Error verifyOptions(const DsymutilOptions &Options) { + if (Options.LinkOptions.Update && + std::find(Options.InputFiles.begin(), Options.InputFiles.end(), "-") != + Options.InputFiles.end()) { + // FIXME: We cannot use stdin for an update because stdin will be + // consumed by the BinaryHolder during the debugmap parsing, and + // then we will want to consume it again in DwarfLinker. If we + // used a unique BinaryHolder object that could cache multiple + // binaries this restriction would go away. + return make_error( + "standard input cannot be used as input for a dSYM update.", + errc::invalid_argument); + } + + if (!Options.Flat && Options.OutputFile == "-") + return make_error( + "cannot emit to standard output without --flat.", + errc::invalid_argument); + + if (Options.InputFiles.size() > 1 && Options.Flat && + !Options.OutputFile.empty()) + return make_error( + "cannot use -o with multiple inputs in flat mode.", + errc::invalid_argument); + + if (Options.PaperTrailWarnings && Options.InputIsYAMLDebugMap) + return make_error( + "paper trail warnings are not supported for YAML input.", + errc::invalid_argument); + + return Error::success(); +} + +static Expected getAccelTableKind(opt::InputArgList &Args) { + if (opt::Arg *Accelerator = Args.getLastArg(OPT_accelerator)) { + StringRef S = Accelerator->getValue(); + if (S == "Apple") + return AccelTableKind::Apple; + if (S == "Dwarf") + return AccelTableKind::Dwarf; + if (S == "Default") + return AccelTableKind::Default; + return make_error( + "invalid accelerator type specified: '" + S + + "'. Support values are 'Apple', 'Dwarf' and 'Default'.", + inconvertibleErrorCode()); + } + return AccelTableKind::Default; +} + +/// Parses the command line options into the LinkOptions struct and performs +/// some sanity checking. Returns an error in case the latter fails. 
+static Expected getOptions(opt::InputArgList &Args) { + DsymutilOptions Options; + + Options.DumpDebugMap = Args.hasArg(OPT_dump_debug_map); + Options.DumpStab = Args.hasArg(OPT_symtab); + Options.Flat = Args.hasArg(OPT_flat); + Options.InputIsYAMLDebugMap = Args.hasArg(OPT_yaml_input); + Options.PaperTrailWarnings = Args.hasArg(OPT_papertrail); + Options.Verify = Args.hasArg(OPT_verify); + + Options.LinkOptions.Minimize = Args.hasArg(OPT_minimize); + Options.LinkOptions.NoODR = Args.hasArg(OPT_no_odr); + Options.LinkOptions.NoOutput = Args.hasArg(OPT_no_output); + Options.LinkOptions.NoTimestamp = Args.hasArg(OPT_no_swiftmodule_timestamp); + Options.LinkOptions.Update = Args.hasArg(OPT_update); + Options.LinkOptions.Verbose = Args.hasArg(OPT_verbose); + + if (Expected AccelKind = getAccelTableKind(Args)) { + Options.LinkOptions.TheAccelTableKind = *AccelKind; + } else { + return AccelKind.takeError(); + } + + if (opt::Arg *SymbolMap = Args.getLastArg(OPT_symbolmap)) + Options.SymbolMap = SymbolMap->getValue(); + + if (Args.hasArg(OPT_symbolmap)) + Options.LinkOptions.Update = true; + + if (Expected> InputFiles = + getInputs(Args, Options.LinkOptions.Update)) { + Options.InputFiles = std::move(*InputFiles); + } else { + return InputFiles.takeError(); + } + + for (auto *Arch : Args.filtered(OPT_arch)) + Options.Archs.push_back(Arch->getValue()); + + if (opt::Arg *OsoPrependPath = Args.getLastArg(OPT_oso_prepend_path)) + Options.LinkOptions.PrependPath = OsoPrependPath->getValue(); + + if (opt::Arg *OutputFile = Args.getLastArg(OPT_output)) + Options.OutputFile = OutputFile->getValue(); + + if (opt::Arg *Toolchain = Args.getLastArg(OPT_toolchain)) + Options.Toolchain = Toolchain->getValue(); + + if (Args.hasArg(OPT_assembly)) + Options.LinkOptions.FileType = OutputFileType::Assembly; + + if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads)) + Options.LinkOptions.Threads = atoi(NumThreads->getValue()); + else + Options.LinkOptions.Threads = llvm::thread::hardware_concurrency(); + + if (Options.DumpDebugMap || Options.LinkOptions.Verbose) + Options.LinkOptions.Threads = 1; + if (getenv("RC_DEBUG_OPTIONS")) + Options.PaperTrailWarnings = true; + + if (Error E = verifyOptions(Options)) + return std::move(E); + return Options; +} + +static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot, + llvm::StringRef Toolchain) { // Create plist file to write to. 
llvm::SmallString<128> InfoPlist(BundleRoot); llvm::sys::path::append(InfoPlist, "Contents/Info.plist"); @@ -237,9 +331,6 @@ static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot) { } static Error createBundleDir(llvm::StringRef BundleBase) { - if (NoOutput) - return Error::success(); - llvm::SmallString<128> Bundle(BundleBase); llvm::sys::path::append(Bundle, "Contents", "Resources", "DWARF"); if (std::error_code EC = @@ -250,7 +341,8 @@ static Error createBundleDir(llvm::StringRef BundleBase) { return Error::success(); } -static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch) { +static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch, + bool Verbose) { if (OutputFile == "-") { WithColor::warning() << "verification skipped for " << Arch << "because writing to stdout.\n"; @@ -288,25 +380,27 @@ struct OutputLocation { std::string DWARFFile; llvm::Optional ResourceDir; }; -} +} // namespace -static Expected getOutputFileName(llvm::StringRef InputFile) { - if (OutputFileOpt == "-") - return OutputLocation(OutputFileOpt); +static Expected +getOutputFileName(llvm::StringRef InputFile, const DsymutilOptions &Options) { + if (Options.OutputFile == "-") + return OutputLocation(Options.OutputFile); // When updating, do in place replacement. - if (OutputFileOpt.empty() && (Update || !SymbolMap.empty())) + if (Options.OutputFile.empty() && + (Options.LinkOptions.Update || !Options.SymbolMap.empty())) return OutputLocation(InputFile); // If a flat dSYM has been requested, things are pretty simple. - if (FlatOut) { - if (OutputFileOpt.empty()) { + if (Options.Flat) { + if (Options.OutputFile.empty()) { if (InputFile == "-") return OutputLocation{"a.out.dwarf", {}}; return OutputLocation((InputFile + ".dwarf").str()); } - return OutputLocation(OutputFileOpt); + return OutputLocation(Options.OutputFile); } // We need to create/update a dSYM bundle. @@ -319,13 +413,15 @@ static Expected getOutputFileName(llvm::StringRef InputFile) { // std::string DwarfFile = InputFile == "-" ? llvm::StringRef("a.out") : InputFile; - llvm::SmallString<128> Path(OutputFileOpt); + llvm::SmallString<128> Path(Options.OutputFile); if (Path.empty()) Path = DwarfFile + ".dSYM"; - if (auto E = createBundleDir(Path)) - return std::move(E); - if (auto E = createPlistFile(DwarfFile, Path)) - return std::move(E); + if (!Options.LinkOptions.NoOutput) { + if (auto E = createBundleDir(Path)) + return std::move(E); + if (auto E = createPlistFile(DwarfFile, Path, Options.Toolchain)) + return std::move(E); + } llvm::sys::path::append(Path, "Contents", "Resources"); std::string ResourceDir = Path.str(); @@ -333,177 +429,71 @@ static Expected getOutputFileName(llvm::StringRef InputFile) { return OutputLocation(Path.str(), ResourceDir); } -/// Parses the command line options into the LinkOptions struct and performs -/// some sanity checking. Returns an error in case the latter fails. 
-static Expected getOptions() { - LinkOptions Options; - - Options.Verbose = Verbose; - Options.NoOutput = NoOutput; - Options.NoODR = NoODR; - Options.Minimize = Minimize; - Options.Update = Update; - Options.NoTimestamp = NoTimestamp; - Options.PrependPath = OsoPrependPath; - Options.TheAccelTableKind = AcceleratorTable; - - if (!SymbolMap.empty()) - Options.Update = true; - - if (Assembly) - Options.FileType = OutputFileType::Assembly; - - if (Options.Update && std::find(InputFiles.begin(), InputFiles.end(), "-") != - InputFiles.end()) { - // FIXME: We cannot use stdin for an update because stdin will be - // consumed by the BinaryHolder during the debugmap parsing, and - // then we will want to consume it again in DwarfLinker. If we - // used a unique BinaryHolder object that could cache multiple - // binaries this restriction would go away. - return make_error( - "standard input cannot be used as input for a dSYM update.", - inconvertibleErrorCode()); - } - - if (NumThreads == 0) - Options.Threads = llvm::thread::hardware_concurrency(); - else - Options.Threads = NumThreads; - if (DumpDebugMap || Verbose) - Options.Threads = 1; - - return Options; -} - -/// Return a list of input files. This function has logic for dealing with the -/// special case where we might have dSYM bundles as input. The function -/// returns an error when the directory structure doesn't match that of a dSYM -/// bundle. -static Expected> getInputs(bool DsymAsInput) { - if (!DsymAsInput) - return InputFiles; - - // If we are updating, we might get dSYM bundles as input. - std::vector Inputs; - for (const auto &Input : InputFiles) { - if (!llvm::sys::fs::is_directory(Input)) { - Inputs.push_back(Input); - continue; - } - - // Make sure that we're dealing with a dSYM bundle. - SmallString<256> BundlePath(Input); - sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); - if (!llvm::sys::fs::is_directory(BundlePath)) - return make_error( - Input + " is a directory, but doesn't look like a dSYM bundle.", - inconvertibleErrorCode()); - - // Create a directory iterator to iterate over all the entries in the - // bundle. - std::error_code EC; - llvm::sys::fs::directory_iterator DirIt(BundlePath, EC); - llvm::sys::fs::directory_iterator DirEnd; - if (EC) - return errorCodeToError(EC); - - // Add each entry to the list of inputs. - while (DirIt != DirEnd) { - Inputs.push_back(DirIt->path()); - DirIt.increment(EC); - if (EC) - return errorCodeToError(EC); - } - } - return Inputs; -} - int main(int argc, char **argv) { InitLLVM X(argc, argv); + // Parse arguments. 
+ DsymutilOptTable T; + unsigned MAI; + unsigned MAC; + ArrayRef ArgsArr = makeArrayRef(argv + 1, argc - 1); + opt::InputArgList Args = T.ParseArgs(ArgsArr, MAI, MAC); + void *P = (void *)(intptr_t)getOutputFileName; std::string SDKPath = llvm::sys::fs::getMainExecutable(argv[0], P); SDKPath = llvm::sys::path::parent_path(SDKPath); - HideUnrelatedOptions({&DsymCategory, &ColorCategory}); - llvm::cl::ParseCommandLineOptions( - argc, argv, - "manipulate archived DWARF debug symbol files.\n\n" - "dsymutil links the DWARF debug information found in the object files\n" - "for the executable by using debug symbols information\n" - "contained in its symbol table.\n"); - - if (Help) { - PrintHelpMessage(); + if (Args.hasArg(OPT_help)) { + T.PrintHelp( + llvm::outs(), + (std::string(argv[0]) + " [options] ").c_str(), + "manipulate archived DWARF debug symbol files.\n\n" + "dsymutil links the DWARF debug information found in the object files\n" + "for the executable by using debug symbols information\n" + "contained in its symbol table.\n", + false); return 0; } - if (Version) { + if (Args.hasArg(OPT_version)) { llvm::cl::PrintVersionMessage(); return 0; } - auto OptionsOrErr = getOptions(); + auto OptionsOrErr = getOptions(Args); if (!OptionsOrErr) { WithColor::error() << toString(OptionsOrErr.takeError()); return 1; } + auto &Options = *OptionsOrErr; + llvm::InitializeAllTargetInfos(); llvm::InitializeAllTargetMCs(); llvm::InitializeAllTargets(); llvm::InitializeAllAsmPrinters(); - auto InputsOrErr = getInputs(OptionsOrErr->Update); - if (!InputsOrErr) { - WithColor::error() << toString(InputsOrErr.takeError()) << '\n'; - return 1; - } - - if (!FlatOut && OutputFileOpt == "-") { - WithColor::error() << "cannot emit to standard output without --flat\n"; - return 1; - } - - if (InputsOrErr->size() > 1 && FlatOut && !OutputFileOpt.empty()) { - WithColor::error() << "cannot use -o with multiple inputs in flat mode\n"; - return 1; - } - - if (InputFiles.size() > 1 && !SymbolMap.empty() && - !llvm::sys::fs::is_directory(SymbolMap)) { - WithColor::error() << "when unobfuscating multiple files, --symbol-map " - << "needs to point to a directory.\n"; - return 1; - } - - if (getenv("RC_DEBUG_OPTIONS")) - PaperTrailWarnings = true; - - if (PaperTrailWarnings && InputIsYAMLDebugMap) - WithColor::warning() - << "Paper trail warnings are not supported for YAML input"; - - for (const auto &Arch : ArchFlags) + for (const auto &Arch : Options.Archs) if (Arch != "*" && Arch != "all" && !llvm::object::MachOObjectFile::isValidArch(Arch)) { WithColor::error() << "unsupported cpu architecture: '" << Arch << "'\n"; return 1; } - SymbolMapLoader SymMapLoader(SymbolMap); + SymbolMapLoader SymMapLoader(Options.SymbolMap); - for (auto &InputFile : *InputsOrErr) { + for (auto &InputFile : Options.InputFiles) { // Dump the symbol table for each input file and requested arch - if (DumpStab) { - if (!dumpStab(InputFile, ArchFlags, OsoPrependPath)) + if (Options.DumpStab) { + if (!dumpStab(InputFile, Options.Archs, Options.LinkOptions.PrependPath)) return 1; continue; } auto DebugMapPtrsOrErr = - parseDebugMap(InputFile, ArchFlags, OsoPrependPath, PaperTrailWarnings, - Verbose, InputIsYAMLDebugMap); + parseDebugMap(InputFile, Options.Archs, Options.LinkOptions.PrependPath, + Options.PaperTrailWarnings, Options.LinkOptions.Verbose, + Options.InputIsYAMLDebugMap); if (auto EC = DebugMapPtrsOrErr.getError()) { WithColor::error() << "cannot parse the debug map for '" << InputFile @@ -511,7 +501,7 @@ int main(int argc, char **argv) { 
return 1; } - if (OptionsOrErr->Update) { + if (Options.LinkOptions.Update) { // The debug map should be empty. Add one object file corresponding to // the input file. for (auto &Map : *DebugMapPtrsOrErr) @@ -528,27 +518,27 @@ int main(int argc, char **argv) { // Shared a single binary holder for all the link steps. BinaryHolder BinHolder; - unsigned ThreadCount = - std::min(OptionsOrErr->Threads, DebugMapPtrsOrErr->size()); + unsigned ThreadCount = std::min(Options.LinkOptions.Threads, + DebugMapPtrsOrErr->size()); llvm::ThreadPool Threads(ThreadCount); // If there is more than one link to execute, we need to generate // temporary files. bool NeedsTempFiles = - !DumpDebugMap && (OutputFileOpt != "-") && - (DebugMapPtrsOrErr->size() != 1 || OptionsOrErr->Update); + !Options.DumpDebugMap && (Options.OutputFile != "-") && + (DebugMapPtrsOrErr->size() != 1 || Options.LinkOptions.Update); llvm::SmallVector TempFiles; std::atomic_char AllOK(1); for (auto &Map : *DebugMapPtrsOrErr) { - if (Verbose || DumpDebugMap) + if (Options.LinkOptions.Verbose || Options.DumpDebugMap) Map->print(llvm::outs()); - if (DumpDebugMap) + if (Options.DumpDebugMap) continue; - if (!SymbolMap.empty()) - OptionsOrErr->Translator = SymMapLoader.Load(InputFile, *Map); + if (!Options.SymbolMap.empty()) + Options.LinkOptions.Translator = SymMapLoader.Load(InputFile, *Map); if (Map->begin() == Map->end()) WithColor::warning() @@ -560,12 +550,12 @@ int main(int argc, char **argv) { std::shared_ptr OS; Expected OutputLocationOrErr = - getOutputFileName(InputFile); + getOutputFileName(InputFile, Options); if (!OutputLocationOrErr) { WithColor::error() << toString(OutputLocationOrErr.takeError()); return 1; } - OptionsOrErr->ResourceDir = OutputLocationOrErr->getResourceDir(); + Options.LinkOptions.ResourceDir = OutputLocationOrErr->getResourceDir(); std::string OutputFile = OutputLocationOrErr->DWARFFile; if (NeedsTempFiles) { @@ -583,30 +573,33 @@ int main(int argc, char **argv) { OutputFile = TempFile.TmpName; } else { std::error_code EC; - OS = std::make_shared(NoOutput ? "-" : OutputFile, EC, - sys::fs::OF_None); + OS = std::make_shared( + Options.LinkOptions.NoOutput ? "-" : OutputFile, EC, + sys::fs::OF_None); if (EC) { WithColor::error() << OutputFile << ": " << EC.message(); return 1; } } + const bool Verify = Options.Verify && !Options.LinkOptions.NoOutput; auto LinkLambda = [&, OutputFile](std::shared_ptr Stream, LinkOptions Options) { AllOK.fetch_and( linkDwarf(*Stream, BinHolder, *Map, std::move(Options))); Stream->flush(); - if (Verify && !NoOutput) - AllOK.fetch_and(verify(OutputFile, Map->getTriple().getArchName())); + if (Verify) + AllOK.fetch_and(verify(OutputFile, Map->getTriple().getArchName(), + Options.Verbose)); }; // FIXME: The DwarfLinker can have some very deep recursion that can max // out the (significantly smaller) stack when using threads. We don't // want this limitation when we only have a single thread. 
if (ThreadCount == 1) - LinkLambda(OS, *OptionsOrErr); + LinkLambda(OS, Options.LinkOptions); else - Threads.async(LinkLambda, OS, *OptionsOrErr); + Threads.async(LinkLambda, OS, Options.LinkOptions); } Threads.wait(); @@ -615,14 +608,15 @@ int main(int argc, char **argv) { return 1; if (NeedsTempFiles) { - Expected OutputLocationOrErr = getOutputFileName(InputFile); + Expected OutputLocationOrErr = + getOutputFileName(InputFile, Options); if (!OutputLocationOrErr) { WithColor::error() << toString(OutputLocationOrErr.takeError()); return 1; } if (!MachOUtils::generateUniversalBinary(TempFiles, OutputLocationOrErr->DWARFFile, - *OptionsOrErr, SDKPath)) + Options.LinkOptions, SDKPath)) return 1; } } From 021704a15610570c1a6be8dcc76cdc5d40bfd9ca Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 3 Oct 2019 16:34:44 +0000 Subject: [PATCH 69/82] [dsymutil] Improve consistency by removing redundant namespaces (NFC) The dsymutil implementation file has a using-directive for the llvm namespace. This patch just removes redundant namespace qualifiers. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373623 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/dsymutil/dsymutil.cpp | 84 ++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 44 deletions(-) diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp index 6d164c449e5e..b5259a2f5d65 100644 --- a/tools/dsymutil/dsymutil.cpp +++ b/tools/dsymutil/dsymutil.cpp @@ -114,7 +114,7 @@ static Expected> getInputs(opt::InputArgList &Args, // If we are updating, we might get dSYM bundles as input. std::vector Inputs; for (const auto &Input : InputFiles) { - if (!llvm::sys::fs::is_directory(Input)) { + if (!sys::fs::is_directory(Input)) { Inputs.push_back(Input); continue; } @@ -122,7 +122,7 @@ static Expected> getInputs(opt::InputArgList &Args, // Make sure that we're dealing with a dSYM bundle. SmallString<256> BundlePath(Input); sys::path::append(BundlePath, "Contents", "Resources", "DWARF"); - if (!llvm::sys::fs::is_directory(BundlePath)) + if (!sys::fs::is_directory(BundlePath)) return make_error( Input + " is a directory, but doesn't look like a dSYM bundle.", inconvertibleErrorCode()); @@ -130,8 +130,8 @@ static Expected> getInputs(opt::InputArgList &Args, // Create a directory iterator to iterate over all the entries in the // bundle. std::error_code EC; - llvm::sys::fs::directory_iterator DirIt(BundlePath, EC); - llvm::sys::fs::directory_iterator DirEnd; + sys::fs::directory_iterator DirIt(BundlePath, EC); + sys::fs::directory_iterator DirEnd; if (EC) return errorCodeToError(EC); @@ -147,7 +147,7 @@ static Expected> getInputs(opt::InputArgList &Args, } // Verify that the given combination of options makes sense. 
-static llvm::Error verifyOptions(const DsymutilOptions &Options) { +static Error verifyOptions(const DsymutilOptions &Options) { if (Options.LinkOptions.Update && std::find(Options.InputFiles.begin(), Options.InputFiles.end(), "-") != Options.InputFiles.end()) { @@ -253,7 +253,7 @@ static Expected getOptions(opt::InputArgList &Args) { if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads)) Options.LinkOptions.Threads = atoi(NumThreads->getValue()); else - Options.LinkOptions.Threads = llvm::thread::hardware_concurrency(); + Options.LinkOptions.Threads = thread::hardware_concurrency(); if (Options.DumpDebugMap || Options.LinkOptions.Verbose) Options.LinkOptions.Threads = 1; @@ -266,13 +266,13 @@ static Expected getOptions(opt::InputArgList &Args) { return Options; } -static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot, - llvm::StringRef Toolchain) { +static Error createPlistFile(StringRef Bin, StringRef BundleRoot, + StringRef Toolchain) { // Create plist file to write to. - llvm::SmallString<128> InfoPlist(BundleRoot); - llvm::sys::path::append(InfoPlist, "Contents/Info.plist"); + SmallString<128> InfoPlist(BundleRoot); + sys::path::append(InfoPlist, "Contents/Info.plist"); std::error_code EC; - llvm::raw_fd_ostream PL(InfoPlist, EC, llvm::sys::fs::OF_Text); + raw_fd_ostream PL(InfoPlist, EC, sys::fs::OF_Text); if (EC) return make_error( "cannot create Plist: " + toString(errorCodeToError(EC)), EC); @@ -280,9 +280,9 @@ static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot, CFBundleInfo BI = getBundleInfo(Bin); if (BI.IDStr.empty()) { - llvm::StringRef BundleID = *llvm::sys::path::rbegin(BundleRoot); - if (llvm::sys::path::extension(BundleRoot) == ".dSYM") - BI.IDStr = llvm::sys::path::stem(BundleID); + StringRef BundleID = *sys::path::rbegin(BundleRoot); + if (sys::path::extension(BundleRoot) == ".dSYM") + BI.IDStr = sys::path::stem(BundleID); else BI.IDStr = BundleID; } @@ -330,19 +330,18 @@ static Error createPlistFile(llvm::StringRef Bin, llvm::StringRef BundleRoot, return Error::success(); } -static Error createBundleDir(llvm::StringRef BundleBase) { - llvm::SmallString<128> Bundle(BundleBase); - llvm::sys::path::append(Bundle, "Contents", "Resources", "DWARF"); +static Error createBundleDir(StringRef BundleBase) { + SmallString<128> Bundle(BundleBase); + sys::path::append(Bundle, "Contents", "Resources", "DWARF"); if (std::error_code EC = - create_directories(Bundle.str(), true, llvm::sys::fs::perms::all_all)) + create_directories(Bundle.str(), true, sys::fs::perms::all_all)) return make_error( "cannot create bundle: " + toString(errorCodeToError(EC)), EC); return Error::success(); } -static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch, - bool Verbose) { +static bool verify(StringRef OutputFile, StringRef Arch, bool Verbose) { if (OutputFile == "-") { WithColor::warning() << "verification skipped for " << Arch << "because writing to stdout.\n"; @@ -372,18 +371,17 @@ static bool verify(llvm::StringRef OutputFile, llvm::StringRef Arch, namespace { struct OutputLocation { - OutputLocation(std::string DWARFFile, - llvm::Optional ResourceDir = {}) + OutputLocation(std::string DWARFFile, Optional ResourceDir = {}) : DWARFFile(DWARFFile), ResourceDir(ResourceDir) {} /// This method is a workaround for older compilers. 
- llvm::Optional getResourceDir() const { return ResourceDir; } + Optional getResourceDir() const { return ResourceDir; } std::string DWARFFile; - llvm::Optional ResourceDir; + Optional ResourceDir; }; } // namespace static Expected -getOutputFileName(llvm::StringRef InputFile, const DsymutilOptions &Options) { +getOutputFileName(StringRef InputFile, const DsymutilOptions &Options) { if (Options.OutputFile == "-") return OutputLocation(Options.OutputFile); @@ -411,9 +409,8 @@ getOutputFileName(llvm::StringRef InputFile, const DsymutilOptions &Options) { // Resources/ // DWARF/ // - std::string DwarfFile = - InputFile == "-" ? llvm::StringRef("a.out") : InputFile; - llvm::SmallString<128> Path(Options.OutputFile); + std::string DwarfFile = InputFile == "-" ? StringRef("a.out") : InputFile; + SmallString<128> Path(Options.OutputFile); if (Path.empty()) Path = DwarfFile + ".dSYM"; if (!Options.LinkOptions.NoOutput) { @@ -423,9 +420,9 @@ getOutputFileName(llvm::StringRef InputFile, const DsymutilOptions &Options) { return std::move(E); } - llvm::sys::path::append(Path, "Contents", "Resources"); + sys::path::append(Path, "Contents", "Resources"); std::string ResourceDir = Path.str(); - llvm::sys::path::append(Path, "DWARF", llvm::sys::path::filename(DwarfFile)); + sys::path::append(Path, "DWARF", sys::path::filename(DwarfFile)); return OutputLocation(Path.str(), ResourceDir); } @@ -440,13 +437,12 @@ int main(int argc, char **argv) { opt::InputArgList Args = T.ParseArgs(ArgsArr, MAI, MAC); void *P = (void *)(intptr_t)getOutputFileName; - std::string SDKPath = llvm::sys::fs::getMainExecutable(argv[0], P); - SDKPath = llvm::sys::path::parent_path(SDKPath); + std::string SDKPath = sys::fs::getMainExecutable(argv[0], P); + SDKPath = sys::path::parent_path(SDKPath); if (Args.hasArg(OPT_help)) { T.PrintHelp( - llvm::outs(), - (std::string(argv[0]) + " [options] ").c_str(), + outs(), (std::string(argv[0]) + " [options] ").c_str(), "manipulate archived DWARF debug symbol files.\n\n" "dsymutil links the DWARF debug information found in the object files\n" "for the executable by using debug symbols information\n" @@ -456,7 +452,7 @@ int main(int argc, char **argv) { } if (Args.hasArg(OPT_version)) { - llvm::cl::PrintVersionMessage(); + cl::PrintVersionMessage(); return 0; } @@ -468,14 +464,14 @@ int main(int argc, char **argv) { auto &Options = *OptionsOrErr; - llvm::InitializeAllTargetInfos(); - llvm::InitializeAllTargetMCs(); - llvm::InitializeAllTargets(); - llvm::InitializeAllAsmPrinters(); + InitializeAllTargetInfos(); + InitializeAllTargetMCs(); + InitializeAllTargets(); + InitializeAllAsmPrinters(); for (const auto &Arch : Options.Archs) if (Arch != "*" && Arch != "all" && - !llvm::object::MachOObjectFile::isValidArch(Arch)) { + !object::MachOObjectFile::isValidArch(Arch)) { WithColor::error() << "unsupported cpu architecture: '" << Arch << "'\n"; return 1; } @@ -506,7 +502,7 @@ int main(int argc, char **argv) { // the input file. for (auto &Map : *DebugMapPtrsOrErr) Map->addDebugMapObject(InputFile, - llvm::sys::TimePoint()); + sys::TimePoint()); } // Ensure that the debug map is not empty (anymore). @@ -520,7 +516,7 @@ int main(int argc, char **argv) { unsigned ThreadCount = std::min(Options.LinkOptions.Threads, DebugMapPtrsOrErr->size()); - llvm::ThreadPool Threads(ThreadCount); + ThreadPool Threads(ThreadCount); // If there is more than one link to execute, we need to generate // temporary files. 
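The shape that both dsymutil patches converge on can be sketched in a few lines. The snippet below is only an illustration: ToolOptions, verifyToolOptions, and getToolOptions are hypothetical stand-ins for the DsymutilOptions, verifyOptions, and getOptions seen above, and it reproduces just two of the consistency rules. The point is the structure: parsing fills a plain options struct, one function owns every inter-option rule, and callers consume a single Expected value instead of checks scattered across main().

#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include <string>
#include <vector>

// Hypothetical stand-in for the real DsymutilOptions struct.
struct ToolOptions {
  bool Flat = false;
  std::string OutputFile;
  std::vector<std::string> InputFiles;
};

// All inter-option consistency rules live in one place, mirroring the
// verifyOptions grouping the commit message describes.
static llvm::Error verifyToolOptions(const ToolOptions &Options) {
  if (!Options.Flat && Options.OutputFile == "-")
    return llvm::make_error<llvm::StringError>(
        "cannot emit to standard output without --flat.",
        llvm::errc::invalid_argument);
  if (Options.InputFiles.size() > 1 && Options.Flat &&
      !Options.OutputFile.empty())
    return llvm::make_error<llvm::StringError>(
        "cannot use -o with multiple inputs in flat mode.",
        llvm::errc::invalid_argument);
  return llvm::Error::success();
}

// Parsing funnels through the verifier, so a caller sees one
// Expected<ToolOptions> and a single error path.
static llvm::Expected<ToolOptions> getToolOptions(/* opt::InputArgList & */) {
  ToolOptions Options;
  // ... populate Options from the parsed argument list here ...
  if (llvm::Error E = verifyToolOptions(Options))
    return std::move(E);
  return Options;
}

Compared with the old cl::opt globals, nothing in this arrangement depends on static initializers, which is exactly the clash with other cl::opt definitions that motivated the switch.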
@@ -528,11 +524,11 @@ int main(int argc, char **argv) { !Options.DumpDebugMap && (Options.OutputFile != "-") && (DebugMapPtrsOrErr->size() != 1 || Options.LinkOptions.Update); - llvm::SmallVector TempFiles; + SmallVector TempFiles; std::atomic_char AllOK(1); for (auto &Map : *DebugMapPtrsOrErr) { if (Options.LinkOptions.Verbose || Options.DumpDebugMap) - Map->print(llvm::outs()); + Map->print(outs()); if (Options.DumpDebugMap) continue; From 86f1fe8edf669769a02038f81403880c1511d0c5 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Thu, 3 Oct 2019 16:59:12 +0000 Subject: [PATCH 70/82] gn build: (manually) merge r373622 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373627 91177308-0d34-0410-b5e6-96231b3b80d8 --- utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn b/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn index 721b4c4be115..ee16b0a3a954 100644 --- a/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn +++ b/utils/gn/secondary/llvm/tools/dsymutil/BUILD.gn @@ -1,9 +1,18 @@ +import("//llvm/utils/TableGen/tablegen.gni") + +tablegen("Options") { + visibility = [ ":dsymutil" ] + args = [ "-gen-opt-parser-defs" ] +} + executable("dsymutil") { deps = [ + ":Options", "//llvm/lib/CodeGen/AsmPrinter", "//llvm/lib/DebugInfo/DWARF", "//llvm/lib/MC", "//llvm/lib/Object", + "//llvm/lib/Option", "//llvm/lib/Support", "//llvm/lib/Target", "//llvm/lib/Target:TargetsToBuild", From 9f29eb70dcc07369dde41421c597b31363976263 Mon Sep 17 00:00:00 2001 From: James Molloy Date: Thu, 3 Oct 2019 17:10:32 +0000 Subject: [PATCH 71/82] [ModuloSchedule] removeBranch() *before* creating the trip count condition The Hexagon code assumes there's no existing terminator when inserting its trip count condition check. This causes swp-stages5.ll to break. The generated code looks good to me, it is likely a permutation. I have disabled the new codegen path to keep everything green and will investigate along with the other 3-4 tests that have different codegen. Fixes expensive-checks build. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373629 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/CodeGen/ModuloSchedule.cpp | 3 +-- test/CodeGen/Hexagon/swp-stages5.ll | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/CodeGen/ModuloSchedule.cpp b/lib/CodeGen/ModuloSchedule.cpp index 30aa81487c8b..d891d644664f 100644 --- a/lib/CodeGen/ModuloSchedule.cpp +++ b/lib/CodeGen/ModuloSchedule.cpp @@ -1772,12 +1772,12 @@ void PeelingModuloScheduleExpander::fixupBranches() { MachineBasicBlock *Fallthrough = *Prolog->succ_begin(); MachineBasicBlock *Epilog = *EI; SmallVector Cond; + TII->removeBranch(*Prolog); Optional StaticallyGreater = Info->createTripCountGreaterCondition(TC, *Prolog, Cond); if (!StaticallyGreater.hasValue()) { LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n"); // Dynamically branch based on Cond. 
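// removeBranch() is now called once, up front, because Hexagon's
// createTripCountGreaterCondition() emits the trip-count check under the
// assumption that the prolog block has no existing terminator (see the
// commit message above); the deletions below drop the per-branch calls
// that the hoisted call replaces.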
- TII->removeBranch(*Prolog); TII->insertBranch(*Prolog, Epilog, Fallthrough, Cond, DebugLoc()); } else if (*StaticallyGreater == false) { LLVM_DEBUG(dbgs() << "Static-false: TC > " << TC << "\n"); @@ -1788,7 +1788,6 @@ void PeelingModuloScheduleExpander::fixupBranches() { P.RemoveOperand(2); P.RemoveOperand(1); } - TII->removeBranch(*Prolog); TII->insertUnconditionalBranch(*Prolog, Epilog, DebugLoc()); KernelDisposed = true; } else { diff --git a/test/CodeGen/Hexagon/swp-stages5.ll b/test/CodeGen/Hexagon/swp-stages5.ll index 1f8463fbc30f..fdfb2101cd36 100644 --- a/test/CodeGen/Hexagon/swp-stages5.ll +++ b/test/CodeGen/Hexagon/swp-stages5.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s -pipeliner-experimental-cg=true | FileCheck %s +; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s | FileCheck %s ; Very similar to swp-stages4.ll, but the pipelined schedule is a little ; different. From ab7e61c53fbe7f033dfb6c9261b0e11a50a31f5f Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Thu, 3 Oct 2019 17:11:47 +0000 Subject: [PATCH 72/82] AMDGPU/SILoadStoreOptimizer: Optimize scanning for mergeable instructions Summary: This adds a pre-pass to this optimization that scans through the basic block and generates lists of mergeable instructions with one list per unique address. In the optimization phase instead of scanning through the basic block for mergeable instructions, we now iterate over the lists generated by the pre-pass. The decision to re-optimize a block is now made per list, so if we fail to merge any instructions with the same address, then we do not attempt to optimize them in future passes over the block. This will help to reduce the time this pass spends re-optimizing instructions. In one pathological test case, this change reduces the time spent in the SILoadStoreOptimizer from 0.2s to 0.03s. This restructuring will also make it possible to implement further solutions in this pass, because we can now add less expensive checks to the pre-pass and filter instructions out early which will avoid the need to do the expensive scanning during the optimization pass. For example, checking for adjacent offsets is an inexpensive test we can move to the pre-pass. Reviewers: arsenm, pendingchaos, rampitec, nhaehnle, vpykhtin Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D65961 git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373630 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/SILoadStoreOptimizer.cpp | 267 ++++++++++++++------- 1 file changed, 185 insertions(+), 82 deletions(-) diff --git a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp index a78b62de7151..f9bce4cc9c1c 100644 --- a/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp +++ b/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp @@ -161,6 +161,31 @@ class SILoadStoreOptimizer : public MachineFunctionPass { return true; } + bool hasMergeableAddress(const MachineRegisterInfo &MRI) { + for (unsigned i = 0; i < NumAddresses; ++i) { + const MachineOperand *AddrOp = AddrReg[i]; + // Immediates are always OK. + if (AddrOp->isImm()) + continue; + + // Don't try to merge addresses that aren't either immediates or registers. 
+    // TODO: Should be possible to merge FrameIndexes and maybe some other
+    // non-register operands.
+    if (!AddrOp->isReg())
+      return false;
+
+    // TODO: We should be able to merge physical reg addresses.
+    if (Register::isPhysicalRegister(AddrOp->getReg()))
+      return false;
+
+    // If an address has only one use then there will be no other
+    // instructions with the same address, so we can't merge this one.
+    if (MRI.hasOneNonDBGUse(AddrOp->getReg()))
+      return false;
+  }
+  return true;
+}
+
   void setMI(MachineBasicBlock::iterator MI, const SIInstrInfo &TII,
              const GCNSubtarget &STM);
   void setPaired(MachineBasicBlock::iterator MI, const SIInstrInfo &TII);
@@ -220,6 +245,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
   bool promoteConstantOffsetToImm(MachineInstr &CI,
                                   MemInfoMap &Visited,
                                   SmallPtrSet &Promoted) const;
+  void addInstToMergeableList(const CombineInfo &CI,
+        std::list<std::list<CombineInfo> > &MergeableInsts) const;
+  bool collectMergeableInsts(MachineBasicBlock &MBB,
+        std::list<std::list<CombineInfo> > &MergeableInsts) const;
 
 public:
   static char ID;
@@ -228,7 +257,11 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
     initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
-  bool optimizeBlock(MachineBasicBlock &MBB);
+  void removeCombinedInst(std::list<CombineInfo> &MergeList,
+                          const MachineInstr &MI);
+  bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
+                                     bool &OptimizeListAgain);
+  bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
 
   bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -424,6 +457,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
     AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
     AddrReg[i] = &I->getOperand(AddrIdx[i]);
   }
+
+  InstsToMove.clear();
 }
 
 void SILoadStoreOptimizer::CombineInfo::setPaired(MachineBasicBlock::iterator MI,
@@ -646,15 +681,6 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI) {
   if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
     return false;
 
-  for (unsigned i = 0; i < CI.NumAddresses; i++) {
-    // We only ever merge operations with the same base address register, so
-    // don't bother scanning forward if there are no other uses.
- if (CI.AddrReg[i]->isReg() && - (Register::isPhysicalRegister(CI.AddrReg[i]->getReg()) || - MRI->hasOneNonDBGUse(CI.AddrReg[i]->getReg()))) - return false; - } - ++MBBI; DenseSet RegDefsToMove; @@ -827,12 +853,11 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted read2: " << *Read2 << '\n'); - return Next; + return Read2; } unsigned SILoadStoreOptimizer::write2Opcode(unsigned EltSize) const { @@ -911,12 +936,11 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI) { moveInstsAfter(Write2, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); LLVM_DEBUG(dbgs() << "Inserted write2 inst: " << *Write2 << '\n'); - return Next; + return Write2; } MachineBasicBlock::iterator @@ -938,12 +962,13 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.DLC0) // dlc - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + BuildMI(*MBB, CI.Paired, DL, TII->get(Opcode), DestReg) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::sbase)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.DLC0) // dlc + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -963,10 +988,9 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } MachineBasicBlock::iterator @@ -997,15 +1021,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { const MachineMemOperand *MMOa = *CI.I->memoperands_begin(); const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin(); - MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) - .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) - .addImm(MergedOffset) // offset - .addImm(CI.GLC0) // glc - .addImm(CI.SLC0) // slc - .addImm(0) // tfe - .addImm(CI.DLC0) // dlc - .addImm(0) // swz - .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); + MachineInstr *New = + MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc)) + .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset)) + .addImm(MergedOffset) // offset + .addImm(CI.GLC0) // glc + .addImm(CI.SLC0) // slc + .addImm(0) // tfe + .addImm(CI.DLC0) // dlc + .addImm(0) // swz + .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb)); std::pair SubRegIdx = getSubRegIdxs(CI); const unsigned SubRegIdx0 = std::get<0>(SubRegIdx); @@ -1025,10 +1050,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI) { moveInstsAfter(Copy1, CI.InstsToMove); - MachineBasicBlock::iterator Next = std::next(CI.I); CI.I->eraseFromParent(); CI.Paired->eraseFromParent(); - return Next; + return New; } unsigned SILoadStoreOptimizer::getNewOpcode(const CombineInfo &CI) { @@ -1191,22 +1215,22 @@ 
SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI) {
   const MachineMemOperand *MMOa = *CI.I->memoperands_begin();
   const MachineMemOperand *MMOb = *CI.Paired->memoperands_begin();
 
-  MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
-      .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
-      .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
-      .addImm(CI.GLC0)                          // glc
-      .addImm(CI.SLC0)                          // slc
-      .addImm(0)                                // tfe
-      .addImm(CI.DLC0)                          // dlc
-      .addImm(0)                                // swz
-      .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
+  MachineInstr *New =
+    MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::srsrc))
+        .add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::soffset))
+        .addImm(std::min(CI.Offset0, CI.Offset1)) // offset
+        .addImm(CI.GLC0)                          // glc
+        .addImm(CI.SLC0)                          // slc
+        .addImm(0)                                // tfe
+        .addImm(CI.DLC0)                          // dlc
+        .addImm(0)                                // swz
+        .addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
 
   moveInstsAfter(MIB, CI.InstsToMove);
 
-  MachineBasicBlock::iterator Next = std::next(CI.I);
   CI.I->eraseFromParent();
   CI.Paired->eraseFromParent();
-  return Next;
+  return New;
 }
 
 MachineOperand
@@ -1519,32 +1543,105 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
   return false;
 }
 
-// Scan through looking for adjacent LDS operations with constant offsets from
-// the same base register. We rely on the scheduler to do the hard work of
-// clustering nearby loads, and assume these are all adjacent.
-bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
-  bool Modified = false;
+void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
+        std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  for (std::list<CombineInfo> &AddrList : MergeableInsts) {
+    if (AddrList.front().hasSameBaseAddress(*CI.I) &&
+        AddrList.front().InstClass == CI.InstClass) {
+      AddrList.emplace_back(CI);
+      return;
+    }
+  }
+
+  // Base address not found, so add a new list.
+  MergeableInsts.emplace_back(1, CI);
+}
+
+bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
+        std::list<std::list<CombineInfo> > &MergeableInsts) const {
+  bool Modified = false;
 
   // Contains the list of visited base addresses.
   MemInfoMap Visited;
   // Contains the list of instructions for which constant offsets are being
   // promoted to the IMM.
   SmallPtrSet AnchorList;
 
-  for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
-    MachineInstr &MI = *I;
-
+  // Sort potentially mergeable instructions into lists, one per base address.
+  for (MachineInstr &MI : MBB.instrs()) {
+    // We run this before checking if an address is mergeable, because it can
+    // produce better code even if the instructions aren't mergeable.
     if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
       Modified = true;
 
+    const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
+    if (InstClass == UNKNOWN)
+      continue;
+
     // Don't combine if volatile.
-    if (MI.hasOrderedMemoryRef()) {
-      ++I;
+    if (MI.hasOrderedMemoryRef())
       continue;
-    }
 
     CombineInfo CI;
-    CI.setMI(I, *TII, *STM);
+    CI.setMI(MI, *TII, *STM);
+
+    if (!CI.hasMergeableAddress(*MRI))
+      continue;
+
+    addInstToMergeableList(CI, MergeableInsts);
+  }
+  return Modified;
+}
+
+// Scan through looking for adjacent LDS operations with constant offsets from
+// the same base register. We rely on the scheduler to do the hard work of
+// clustering nearby loads, and assume these are all adjacent.
+bool SILoadStoreOptimizer::optimizeBlock(
+        std::list<std::list<CombineInfo> > &MergeableInsts) {
+  bool Modified = false;
+
+  for (std::list<CombineInfo> &MergeList : MergeableInsts) {
+    if (MergeList.size() < 2)
+      continue;
+
+    bool OptimizeListAgain = false;
+    if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
+      // We weren't able to make any changes, so clear the list so we don't
+      // process the same instructions the next time we try to optimize this
+      // block.
+      MergeList.clear();
+      continue;
+    }
+
+    // We made changes, but also determined that there were no more
+    // optimization opportunities, so we don't need to reprocess the list.
+    if (!OptimizeListAgain)
+      MergeList.clear();
+
+    OptimizeAgain |= OptimizeListAgain;
+    Modified = true;
+  }
+  return Modified;
+}
+
+void
+SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
+                                         const MachineInstr &MI) {
+
+  for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
+    if (&*CI->I == &MI) {
+      MergeList.erase(CI);
+      return;
+    }
+  }
+}
+
+bool
+SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
+                                      std::list<CombineInfo> &MergeList,
+                                      bool &OptimizeListAgain) {
+  bool Modified = false;
+  for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
+    CombineInfo &CI = *I;
 
     switch (CI.InstClass) {
     default:
@@ -1552,55 +1649,57 @@ bool SILoadStoreOptimizer::optimizeBlock(MachineBasicBlock &MBB) {
     case DS_READ:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeRead2Pair(CI);
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI);
+        CI.setMI(NewMI, *TII, *STM);
       }
-      continue;
+      break;
     case DS_WRITE:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeWrite2Pair(CI);
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI);
+        CI.setMI(NewMI, *TII, *STM);
       }
-      continue;
+      break;
     case S_BUFFER_LOAD_IMM:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeSBufferLoadImmPair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 16;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 16;
       }
-      continue;
+      break;
     case BUFFER_LOAD_OFFEN:
     case BUFFER_LOAD_OFFSET:
    case BUFFER_LOAD_OFFEN_exact:
     case BUFFER_LOAD_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeBufferLoadPair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
-      continue;
+      break;
     case BUFFER_STORE_OFFEN:
     case BUFFER_STORE_OFFSET:
     case BUFFER_STORE_OFFEN_exact:
     case BUFFER_STORE_OFFSET_exact:
       if (findMatchingInst(CI)) {
         Modified = true;
-        I = mergeBufferStorePair(CI);
-        OptimizeAgain |= (CI.Width0 + CI.Width1) < 4;
-      } else {
-        ++I;
+        removeCombinedInst(MergeList, *CI.Paired);
+        MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI);
+        CI.setMI(NewMI, *TII, *STM);
+        OptimizeListAgain |= (CI.Width0 + CI.Width1) < 4;
       }
-      continue;
+      break;
     }
-
-    ++I;
+    // Clear the InstsToMove after we have finished searching so we don't have
+    // stale values left over if we search for this CI again in another pass
+    // over the block.
+    CI.InstsToMove.clear();
   }
 
   return Modified;
@@ -1626,10 +1725,14 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
 
   bool Modified = false;
 
+
   for (MachineBasicBlock &MBB : MF) {
+    std::list<std::list<CombineInfo> > MergeableInsts;
+    // First pass: collect lists of all the instructions we know how to merge.
+    Modified |= collectMergeableInsts(MBB, MergeableInsts);
     do {
       OptimizeAgain = false;
-      Modified |= optimizeBlock(MBB);
+      Modified |= optimizeBlock(MergeableInsts);
     } while (OptimizeAgain);
   }

From 666680987c067285db5569d200ce3611215a8cfd Mon Sep 17 00:00:00 2001
From: Michael Liao
Date: Thu, 3 Oct 2019 17:47:46 +0000
Subject: [PATCH 73/82] Fix build failure with GCC on identifier reuse.

- GCC, unlike clang and other compilers, rejects reusing a type name as a
  member name here: https://godbolt.org/z/CeQE1V

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373633 91177308-0d34-0410-b5e6-96231b3b80d8
---
 tools/dsymutil/dsymutil.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp
index b5259a2f5d65..aa23891ab39a 100644
--- a/tools/dsymutil/dsymutil.cpp
+++ b/tools/dsymutil/dsymutil.cpp
@@ -95,7 +95,7 @@ struct DsymutilOptions {
   std::vector Archs;
   std::vector InputFiles;
   unsigned NumThreads;
-  LinkOptions LinkOptions;
+  dsymutil::LinkOptions LinkOptions;
 };
 
 /// Return a list of input files. This function has logic for dealing with the

From 0e53d95ce42e392bda9c5b3962ebc5645d145fee Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 3 Oct 2019 17:50:29 +0000
Subject: [PATCH 74/82] AMDGPU/GlobalISel: Fix mutationIsSane assert for v8s8 G_AND

This would try to do FewerElements to v9s8.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373635 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp     |   5 +-
 .../AMDGPU/GlobalISel/legalize-and.mir        | 166 ++++++++++++++++++
 2 files changed, 169 insertions(+), 2 deletions(-)

diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8cf5a54177da..cf4275c23e8c 100644
--- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -53,7 +53,8 @@ static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
     const LLT Ty = Query.Types[TypeIdx];
     return Ty.isVector() &&
            Ty.getNumElements() % 2 != 0 &&
-           Ty.getElementType().getSizeInBits() < 32;
+           Ty.getElementType().getSizeInBits() < 32 &&
+           Ty.getSizeInBits() % 32 != 0;
   };
 }
 
@@ -268,7 +269,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
     .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
     .clampScalar(0, S32, S64)
     .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
-    .fewerElementsIf(vectorWiderThan(0, 32), fewerEltsToSize64Vector(0))
+    .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
     .widenScalarToNextPow2(0)
     .scalarize(0);
 
diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
index 5ea807166fe5..e4cc48d54dd6 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-and.mir
@@ -522,3 +522,169 @@ body: |
     %3:_(<4 x s32>) = G_ANYEXT %2
     $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3
 ...
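To see both halves of the fix together, here is a reduced sketch of the
affected rule chain. This is illustrative rather than a verbatim excerpt:
the opcode set for this builder and the LLT constants are assumed from the
surrounding AMDGPULegalizerInfo.cpp context.

    getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
        .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
        .clampScalar(0, S32, S64)
        // v8s8 is 64 bits with an even element count, so neither vector
        // rule fires for it any more. Previously vectorWiderThan(0, 32)
        // matched it and, per the commit message, the resulting
        // FewerElements step toward v9s8 tripped mutationIsSane.
        .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
        .fewerElementsIf(vectorWiderThan(0, 64), fewerEltsToSize64Vector(0))
        .widenScalarToNextPow2(0)
        .scalarize(0);

The new tests below pin down the resulting v8s8 and v16s8 legalizations.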
+ +--- +name: test_and_v8s8 +body: | + bb.0: + + ; CHECK-LABEL: name: test_and_v8s8 + ; CHECK: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[DEF]](<8 x s32>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<8 x s8>) = G_TRUNC [[DEF1]](<8 x s32>) + ; CHECK: [[UV:%[0-9]+]]:_(s8), [[UV1:%[0-9]+]]:_(s8), [[UV2:%[0-9]+]]:_(s8), [[UV3:%[0-9]+]]:_(s8), [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[TRUNC]](<8 x s8>) + ; CHECK: [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8), [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[TRUNC1]](<8 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s8) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV1]](s8) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s8) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[ANYEXT3]] + ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV2]](s8) + ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s8) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[ANYEXT5]] + ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV3]](s8) + ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s8) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[ANYEXT7]] + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s8) + ; CHECK: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s8) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT8]], [[ANYEXT9]] + ; CHECK: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s8) + ; CHECK: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s8) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT10]], [[ANYEXT11]] + ; CHECK: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s8) + ; CHECK: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s8) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT12]], [[ANYEXT13]] + ; CHECK: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s8) + ; CHECK: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s8) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT14]], [[ANYEXT15]] + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND4]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[AND5]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND6]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[AND7]](s32) + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<8 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32), [[COPY12]](s32), [[COPY13]](s32), [[COPY14]](s32), [[COPY15]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY 
[[BUILD_VECTOR]](<8 x s32>) + %0:_(<8 x s8>) = G_IMPLICIT_DEF + %1:_(<8 x s8>) = G_IMPLICIT_DEF + %2:_(<8 x s8>) = G_AND %0, %1 + %3:_(<8 x s32>) = G_ANYEXT %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY %3 +... + +--- +name: test_and_v16s8 +body: | + bb.0: + + ; CHECK-LABEL: name: test_and_v16s8 + ; CHECK: [[DEF:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[DEF]](<16 x s32>) + ; CHECK: [[DEF1:%[0-9]+]]:_(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[TRUNC1:%[0-9]+]]:_(<16 x s8>) = G_TRUNC [[DEF1]](<16 x s32>) + ; CHECK: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[TRUNC]](<16 x s8>) + ; CHECK: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[TRUNC1]](<16 x s8>) + ; CHECK: [[UV4:%[0-9]+]]:_(s8), [[UV5:%[0-9]+]]:_(s8), [[UV6:%[0-9]+]]:_(s8), [[UV7:%[0-9]+]]:_(s8), [[UV8:%[0-9]+]]:_(s8), [[UV9:%[0-9]+]]:_(s8), [[UV10:%[0-9]+]]:_(s8), [[UV11:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV]](<8 x s8>) + ; CHECK: [[UV12:%[0-9]+]]:_(s8), [[UV13:%[0-9]+]]:_(s8), [[UV14:%[0-9]+]]:_(s8), [[UV15:%[0-9]+]]:_(s8), [[UV16:%[0-9]+]]:_(s8), [[UV17:%[0-9]+]]:_(s8), [[UV18:%[0-9]+]]:_(s8), [[UV19:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV2]](<8 x s8>) + ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[UV4]](s8) + ; CHECK: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[UV12]](s8) + ; CHECK: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[ANYEXT1]] + ; CHECK: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[UV5]](s8) + ; CHECK: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[UV13]](s8) + ; CHECK: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[ANYEXT3]] + ; CHECK: [[ANYEXT4:%[0-9]+]]:_(s32) = G_ANYEXT [[UV6]](s8) + ; CHECK: [[ANYEXT5:%[0-9]+]]:_(s32) = G_ANYEXT [[UV14]](s8) + ; CHECK: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT4]], [[ANYEXT5]] + ; CHECK: [[ANYEXT6:%[0-9]+]]:_(s32) = G_ANYEXT [[UV7]](s8) + ; CHECK: [[ANYEXT7:%[0-9]+]]:_(s32) = G_ANYEXT [[UV15]](s8) + ; CHECK: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT6]], [[ANYEXT7]] + ; CHECK: [[ANYEXT8:%[0-9]+]]:_(s32) = G_ANYEXT [[UV8]](s8) + ; CHECK: [[ANYEXT9:%[0-9]+]]:_(s32) = G_ANYEXT [[UV16]](s8) + ; CHECK: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT8]], [[ANYEXT9]] + ; CHECK: [[ANYEXT10:%[0-9]+]]:_(s32) = G_ANYEXT [[UV9]](s8) + ; CHECK: [[ANYEXT11:%[0-9]+]]:_(s32) = G_ANYEXT [[UV17]](s8) + ; CHECK: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT10]], [[ANYEXT11]] + ; CHECK: [[ANYEXT12:%[0-9]+]]:_(s32) = G_ANYEXT [[UV10]](s8) + ; CHECK: [[ANYEXT13:%[0-9]+]]:_(s32) = G_ANYEXT [[UV18]](s8) + ; CHECK: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT12]], [[ANYEXT13]] + ; CHECK: [[ANYEXT14:%[0-9]+]]:_(s32) = G_ANYEXT [[UV11]](s8) + ; CHECK: [[ANYEXT15:%[0-9]+]]:_(s32) = G_ANYEXT [[UV19]](s8) + ; CHECK: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT14]], [[ANYEXT15]] + ; CHECK: [[COPY:%[0-9]+]]:_(s32) = COPY [[AND]](s32) + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[AND1]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY [[AND2]](s32) + ; CHECK: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AND3]](s32) + ; CHECK: [[COPY4:%[0-9]+]]:_(s32) = COPY [[AND4]](s32) + ; CHECK: [[COPY5:%[0-9]+]]:_(s32) = COPY [[AND5]](s32) + ; CHECK: [[COPY6:%[0-9]+]]:_(s32) = COPY [[AND6]](s32) + ; CHECK: [[COPY7:%[0-9]+]]:_(s32) = COPY [[AND7]](s32) + ; CHECK: [[UV20:%[0-9]+]]:_(s8), [[UV21:%[0-9]+]]:_(s8), [[UV22:%[0-9]+]]:_(s8), [[UV23:%[0-9]+]]:_(s8), [[UV24:%[0-9]+]]:_(s8), [[UV25:%[0-9]+]]:_(s8), [[UV26:%[0-9]+]]:_(s8), [[UV27:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV1]](<8 x s8>) + ; CHECK: [[UV28:%[0-9]+]]:_(s8), [[UV29:%[0-9]+]]:_(s8), 
[[UV30:%[0-9]+]]:_(s8), [[UV31:%[0-9]+]]:_(s8), [[UV32:%[0-9]+]]:_(s8), [[UV33:%[0-9]+]]:_(s8), [[UV34:%[0-9]+]]:_(s8), [[UV35:%[0-9]+]]:_(s8) = G_UNMERGE_VALUES [[UV3]](<8 x s8>) + ; CHECK: [[ANYEXT16:%[0-9]+]]:_(s32) = G_ANYEXT [[UV20]](s8) + ; CHECK: [[ANYEXT17:%[0-9]+]]:_(s32) = G_ANYEXT [[UV28]](s8) + ; CHECK: [[AND8:%[0-9]+]]:_(s32) = G_AND [[ANYEXT16]], [[ANYEXT17]] + ; CHECK: [[ANYEXT18:%[0-9]+]]:_(s32) = G_ANYEXT [[UV21]](s8) + ; CHECK: [[ANYEXT19:%[0-9]+]]:_(s32) = G_ANYEXT [[UV29]](s8) + ; CHECK: [[AND9:%[0-9]+]]:_(s32) = G_AND [[ANYEXT18]], [[ANYEXT19]] + ; CHECK: [[ANYEXT20:%[0-9]+]]:_(s32) = G_ANYEXT [[UV22]](s8) + ; CHECK: [[ANYEXT21:%[0-9]+]]:_(s32) = G_ANYEXT [[UV30]](s8) + ; CHECK: [[AND10:%[0-9]+]]:_(s32) = G_AND [[ANYEXT20]], [[ANYEXT21]] + ; CHECK: [[ANYEXT22:%[0-9]+]]:_(s32) = G_ANYEXT [[UV23]](s8) + ; CHECK: [[ANYEXT23:%[0-9]+]]:_(s32) = G_ANYEXT [[UV31]](s8) + ; CHECK: [[AND11:%[0-9]+]]:_(s32) = G_AND [[ANYEXT22]], [[ANYEXT23]] + ; CHECK: [[ANYEXT24:%[0-9]+]]:_(s32) = G_ANYEXT [[UV24]](s8) + ; CHECK: [[ANYEXT25:%[0-9]+]]:_(s32) = G_ANYEXT [[UV32]](s8) + ; CHECK: [[AND12:%[0-9]+]]:_(s32) = G_AND [[ANYEXT24]], [[ANYEXT25]] + ; CHECK: [[ANYEXT26:%[0-9]+]]:_(s32) = G_ANYEXT [[UV25]](s8) + ; CHECK: [[ANYEXT27:%[0-9]+]]:_(s32) = G_ANYEXT [[UV33]](s8) + ; CHECK: [[AND13:%[0-9]+]]:_(s32) = G_AND [[ANYEXT26]], [[ANYEXT27]] + ; CHECK: [[ANYEXT28:%[0-9]+]]:_(s32) = G_ANYEXT [[UV26]](s8) + ; CHECK: [[ANYEXT29:%[0-9]+]]:_(s32) = G_ANYEXT [[UV34]](s8) + ; CHECK: [[AND14:%[0-9]+]]:_(s32) = G_AND [[ANYEXT28]], [[ANYEXT29]] + ; CHECK: [[ANYEXT30:%[0-9]+]]:_(s32) = G_ANYEXT [[UV27]](s8) + ; CHECK: [[ANYEXT31:%[0-9]+]]:_(s32) = G_ANYEXT [[UV35]](s8) + ; CHECK: [[AND15:%[0-9]+]]:_(s32) = G_AND [[ANYEXT30]], [[ANYEXT31]] + ; CHECK: [[COPY8:%[0-9]+]]:_(s32) = COPY [[AND8]](s32) + ; CHECK: [[COPY9:%[0-9]+]]:_(s32) = COPY [[AND9]](s32) + ; CHECK: [[COPY10:%[0-9]+]]:_(s32) = COPY [[AND10]](s32) + ; CHECK: [[COPY11:%[0-9]+]]:_(s32) = COPY [[AND11]](s32) + ; CHECK: [[COPY12:%[0-9]+]]:_(s32) = COPY [[AND12]](s32) + ; CHECK: [[COPY13:%[0-9]+]]:_(s32) = COPY [[AND13]](s32) + ; CHECK: [[COPY14:%[0-9]+]]:_(s32) = COPY [[AND14]](s32) + ; CHECK: [[COPY15:%[0-9]+]]:_(s32) = COPY [[AND15]](s32) + ; CHECK: [[COPY16:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK: [[COPY17:%[0-9]+]]:_(s32) = COPY [[COPY1]](s32) + ; CHECK: [[COPY18:%[0-9]+]]:_(s32) = COPY [[COPY2]](s32) + ; CHECK: [[COPY19:%[0-9]+]]:_(s32) = COPY [[COPY3]](s32) + ; CHECK: [[COPY20:%[0-9]+]]:_(s32) = COPY [[COPY4]](s32) + ; CHECK: [[COPY21:%[0-9]+]]:_(s32) = COPY [[COPY5]](s32) + ; CHECK: [[COPY22:%[0-9]+]]:_(s32) = COPY [[COPY6]](s32) + ; CHECK: [[COPY23:%[0-9]+]]:_(s32) = COPY [[COPY7]](s32) + ; CHECK: [[COPY24:%[0-9]+]]:_(s32) = COPY [[COPY8]](s32) + ; CHECK: [[COPY25:%[0-9]+]]:_(s32) = COPY [[COPY9]](s32) + ; CHECK: [[COPY26:%[0-9]+]]:_(s32) = COPY [[COPY10]](s32) + ; CHECK: [[COPY27:%[0-9]+]]:_(s32) = COPY [[COPY11]](s32) + ; CHECK: [[COPY28:%[0-9]+]]:_(s32) = COPY [[COPY12]](s32) + ; CHECK: [[COPY29:%[0-9]+]]:_(s32) = COPY [[COPY13]](s32) + ; CHECK: [[COPY30:%[0-9]+]]:_(s32) = COPY [[COPY14]](s32) + ; CHECK: [[COPY31:%[0-9]+]]:_(s32) = COPY [[COPY15]](s32) + ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:_(<16 x s32>) = G_BUILD_VECTOR [[COPY16]](s32), [[COPY17]](s32), [[COPY18]](s32), [[COPY19]](s32), [[COPY20]](s32), [[COPY21]](s32), [[COPY22]](s32), [[COPY23]](s32), [[COPY24]](s32), [[COPY25]](s32), [[COPY26]](s32), [[COPY27]](s32), [[COPY28]](s32), [[COPY29]](s32), [[COPY30]](s32), [[COPY31]](s32) + ; CHECK: 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BUILD_VECTOR]](<16 x s32>) + %0:_(<16 x s8>) = G_IMPLICIT_DEF + %1:_(<16 x s8>) = G_IMPLICIT_DEF + %2:_(<16 x s8>) = G_AND %0, %1 + %3:_(<16 x s32>) = G_ANYEXT %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... From 5e7be8b1bc82556d94e704606dc52ebba6b176d0 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 3 Oct 2019 17:50:31 +0000 Subject: [PATCH 75/82] AMDGPU/GlobalISel: Add some more tests for G_INSERT legalization git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373636 91177308-0d34-0410-b5e6-96231b3b80d8 --- .../AMDGPU/GlobalISel/legalize-insert.mir | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir index b36737de837d..066932ec4807 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-insert.mir @@ -33,6 +33,24 @@ body: | %2:_(s64) = G_INSERT %0, %1, 32 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_insert_s64_s32_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s32_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[COPY1]](s32), 16 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s64) = G_INSERT %0, %1, 16 + $vgpr0_vgpr1 = COPY %2 +... + --- name: test_insert_s96_s32_offset0 body: | @@ -305,6 +323,83 @@ body: | %2:_(s128) = G_INSERT %0, %1, 64 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %2 ... + +--- +name: test_insert_s128_s16_offset0 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 0 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 0 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: test_insert_s128_s16_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 16 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 16 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... 
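The offset permutations being added all encode the same bit-level operation;
as a standalone C++ illustration (not LLVM code) of what an s16-into-s64
G_INSERT computes, with Off = 0, 16, 32, or 48 (the s128 tests are the same
idea over a wider value):

    #include <cstdint>

    // Bit-level semantics of %out:s64 = G_INSERT %in:s64, %val:s16, Off.
    uint64_t insertS16(uint64_t In, uint16_t Val, unsigned Off) {
      const uint64_t Mask = 0xFFFFull << Off;        // field being replaced
      return (In & ~Mask) | (uint64_t(Val) << Off);  // clear, then OR in
    }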
+ +--- +name: test_insert_s128_s16_offset32 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset32 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 32 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 32 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + +--- +name: test_insert_s128_s16_offset112 +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 + + ; CHECK-LABEL: name: test_insert_s128_s16_offset112 + ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr4 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s128) = G_INSERT [[COPY]], [[TRUNC]](s16), 112 + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[INSERT]](s128) + %0:_(s128) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:_(s32) = COPY $vgpr4 + %2:_(s16) = G_TRUNC %1 + %3:_(s128) = G_INSERT %0, %2, 112 + $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 +... + --- name: test_insert_v2s32_s32_offset0 body: | @@ -1079,3 +1174,76 @@ body: | %2:_(<4 x s16>) = G_INSERT %0, %1, 32 $vgpr0_vgpr1 = COPY %2 ... + +--- +name: test_insert_s64_s16_offset0 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset0 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 0 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 0 + $vgpr0_vgpr1 = COPY %3 +... +--- +name: test_insert_s64_s16_offset16 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset16 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 16 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 16 + $vgpr0_vgpr1 = COPY %3 +... +--- +name: test_insert_s64_s16_offset32 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset32 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 32 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 32 + $vgpr0_vgpr1 = COPY %3 +... 
+--- +name: test_insert_s64_s16_offset48 +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: test_insert_s64_s16_offset48 + ; CHECK: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[INSERT:%[0-9]+]]:_(s64) = G_INSERT [[COPY]], [[TRUNC]](s16), 48 + ; CHECK: $vgpr0_vgpr1 = COPY [[INSERT]](s64) + %0:_(s64) = COPY $vgpr0_vgpr1 + %1:_(s32) = COPY $vgpr2 + %2:_(s16) = G_TRUNC %1 + %3:_(s64) = G_INSERT %0, %2, 48 + $vgpr0_vgpr1 = COPY %3 +... From bde9662fef07a941079249322d97e3417519c68e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 3 Oct 2019 17:50:32 +0000 Subject: [PATCH 76/82] AMDGPU/GlobalISel: Allow VGPR to index SGPR register We can still do a waterfall loop over the index if using a VGPR to index an SGPR. The result will still be a VGPR, but we can avoid the wide copy of the source register to a VGPR. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373637 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 10 ++++++---- .../GlobalISel/regbankselect-extract-vector-elt.mir | 5 ++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index 8aa296b1132d..f8ea17d7f3f3 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -2309,14 +2309,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; } case AMDGPU::G_EXTRACT_VECTOR_ELT: { - unsigned OutputBankID = isSALUMapping(MI) ? - AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID; + // VGPR index can be used for waterfall when indexing a SGPR vector. + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned SrcSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); + unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, SrcSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, SrcSize); + OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, DstSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); // The index can be either if the source vector is VGPR. 
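    // ("Either" means either register bank: with regBankUnion the result is
    // SGPR only when both the vector and the index are SGPR. A sketch of
    // that helper, assumed from context since it is not shown in this patch:
    //
    //   unsigned regBankUnion(unsigned RB0, unsigned RB1) {
    //     return (RB0 == AMDGPU::SGPRRegBankID &&
    //             RB1 == AMDGPU::SGPRRegBankID)
    //                ? AMDGPU::SGPRRegBankID
    //                : AMDGPU::VGPRRegBankID;
    //   }
    //
    // So a VGPR index no longer forces the source vector to be copied to
    // VGPRs; it only makes the result VGPR and triggers a waterfall loop.)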
OpdsMapping[2] = AMDGPU::getValueMapping(IdxBank, IdxSize); diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir index accfaf08624b..5613b1b4c27d 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir @@ -35,17 +35,16 @@ body: | ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<16 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(<16 x s32>) = COPY [[COPY]](<16 x s32>) ; CHECK: [[DEF:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %2(s32), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY2]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<16 x s32>), [[V_READFIRSTLANE_B32_]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec From 793d83708fda43a4c30d8c17e6d02d1f28597027 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 3 Oct 2019 17:55:27 +0000 Subject: [PATCH 77/82] AMDGPU/GlobalISel: Split 64-bit vector extracts during RegBankSelect Register indexing 64-bit elements is possible on the SALU, but not the VALU. Handle splitting this into two 32-bit indexes. Extend waterfall loop handling to allow moving a range of instructions. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373638 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 424 +++++++++++------- lib/Target/AMDGPU/AMDGPURegisterBankInfo.h | 14 + .../regbankselect-extract-vector-elt.mir | 120 ++++- 3 files changed, 385 insertions(+), 173 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index f8ea17d7f3f3..cc7b2fb66fbe 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -17,7 +17,6 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIMachineFunctionInfo.h" #include "SIRegisterInfo.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" @@ -659,43 +658,28 @@ static LLT getHalfSizedType(LLT Ty) { /// unique values used. 
bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineIRBuilder &B, - MachineInstr &MI, - MachineRegisterInfo &MRI, - ArrayRef OpIndices) const { - MachineFunction *MF = &B.getMF(); - MachineBasicBlock::iterator I(MI); - - MachineBasicBlock &MBB = *MI.getParent(); - const DebugLoc &DL = MI.getDebugLoc(); - - // Use a set to avoid extra readfirstlanes in the case where multiple operands - // are the same register. - SmallSet SGPROperandRegs; - for (unsigned Op : OpIndices) { - assert(MI.getOperand(Op).isUse()); - Register Reg = MI.getOperand(Op).getReg(); - const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); - if (OpBank->getID() == AMDGPU::VGPRRegBankID) - SGPROperandRegs.insert(Reg); - } - - // No operands need to be replaced, so no need to loop. - if (SGPROperandRegs.empty()) - return false; - + iterator_range Range, + SmallSet &SGPROperandRegs, + MachineRegisterInfo &MRI) const { SmallVector ResultRegs; SmallVector InitResultRegs; SmallVector PhiRegs; - for (MachineOperand &Def : MI.defs()) { - LLT ResTy = MRI.getType(Def.getReg()); - const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); - ResultRegs.push_back(Def.getReg()); - Register InitReg = B.buildUndef(ResTy).getReg(0); - Register PhiReg = MRI.createGenericVirtualRegister(ResTy); - InitResultRegs.push_back(InitReg); - PhiRegs.push_back(PhiReg); - MRI.setRegBank(PhiReg, *DefBank); - MRI.setRegBank(InitReg, *DefBank); + + MachineBasicBlock &MBB = B.getMBB(); + MachineFunction *MF = &B.getMF(); + + for (MachineInstr &MI : Range) { + for (MachineOperand &Def : MI.defs()) { + LLT ResTy = MRI.getType(Def.getReg()); + const RegisterBank *DefBank = getRegBank(Def.getReg(), MRI, *TRI); + ResultRegs.push_back(Def.getReg()); + Register InitReg = B.buildUndef(ResTy).getReg(0); + Register PhiReg = MRI.createGenericVirtualRegister(ResTy); + InitResultRegs.push_back(InitReg); + PhiRegs.push_back(PhiReg); + MRI.setRegBank(PhiReg, *DefBank); + MRI.setRegBank(InitReg, *DefBank); + } } Register SaveExecReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); @@ -724,7 +708,7 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( // Move the rest of the block into a new block. RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB); - RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end()); + RemainderBB->splice(RemainderBB->begin(), &MBB, Range.end(), MBB.end()); MBB.addSuccessor(LoopBB); RestoreExecBB->addSuccessor(RemainderBB); @@ -747,139 +731,56 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addMBB(LoopBB); } - // Move the instruction into the loop. - LoopBB->splice(LoopBB->end(), &MBB, I); - I = std::prev(LoopBB->end()); - - B.setInstr(*I); - - Register CondReg; - - for (MachineOperand &Op : MI.uses()) { - if (!Op.isReg()) - continue; - - assert(!Op.isDef()); - if (SGPROperandRegs.count(Op.getReg())) { - LLT OpTy = MRI.getType(Op.getReg()); - unsigned OpSize = OpTy.getSizeInBits(); - - // Can only do a readlane of 32-bit pieces. - if (OpSize == 32) { - // Avoid extra copies in the simple case of one 32-bit register. - Register CurrentLaneOpReg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); - MRI.setType(CurrentLaneOpReg, OpTy); - - constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); - // Read the next variant <- also loop target. 
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentLaneOpReg) - .addReg(Op.getReg()); - - Register NewCondReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); - bool First = CondReg == AMDGPU::NoRegister; - if (First) - CondReg = NewCondReg; - - // Compare the just read M0 value to all possible Idx values. - B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) - .addDef(NewCondReg) - .addReg(CurrentLaneOpReg) - .addReg(Op.getReg()); - Op.setReg(CurrentLaneOpReg); - - if (!First) { - Register AndReg = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); - - // If there are multiple operands to consider, and the conditions. - B.buildInstr(AMDGPU::S_AND_B64) - .addDef(AndReg) - .addReg(NewCondReg) - .addReg(CondReg); - CondReg = AndReg; - } - } else { - LLT S32 = LLT::scalar(32); - SmallVector ReadlanePieces; - - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. - - bool Is64 = OpSize % 64 == 0; + const DebugLoc &DL = B.getDL(); - LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); - unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 - : AMDGPU::V_CMP_EQ_U32_e64; + // Figure out the iterator range after splicing the instructions. + auto NewBegin = std::prev(LoopBB->end()); - // The compares can be done as 64-bit, but the extract needs to be done - // in 32-bit pieces. + // Move the instruction into the loop. Note we moved everything after + // Range.end() already into a new block, so Range.end() is no longer valid. + LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end()); - // Insert the unmerge before the loop. + auto NewEnd = LoopBB->end(); - B.setMBB(MBB); - auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); - B.setInstr(*I); + MachineBasicBlock::iterator I = Range.begin(); + B.setInsertPt(*LoopBB, I); - unsigned NumPieces = Unmerge->getNumOperands() - 1; - for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { - Register UnmergePiece = Unmerge.getReg(PieceIdx); - - Register CurrentLaneOpReg; - if (Is64) { - Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); - Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); - - MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); - MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); - MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegLo) - .addReg(UnmergePiece, 0, AMDGPU::sub0); - - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpRegHi) - .addReg(UnmergePiece, 0, AMDGPU::sub1); + Register CondReg; - CurrentLaneOpReg = - B.buildMerge(LLT::scalar(64), - {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) - .getReg(0); + for (MachineInstr &MI : make_range(NewBegin, NewEnd)) { + for (MachineOperand &Op : MI.uses()) { + if (!Op.isReg() || Op.isDef()) + continue; - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + if (SGPROperandRegs.count(Op.getReg())) { + LLT OpTy = MRI.getType(Op.getReg()); + unsigned OpSize = OpTy.getSizeInBits(); - if (OpTy.getScalarSizeInBits() == 64) { - // If we need to produce a 64-bit element vector, so use the - // merged pieces - ReadlanePieces.push_back(CurrentLaneOpReg); - } else { - // 32-bit element type. 
- ReadlanePieces.push_back(CurrentLaneOpRegLo); - ReadlanePieces.push_back(CurrentLaneOpRegHi); - } - } else { - CurrentLaneOpReg = MRI.createGenericVirtualRegister(LLT::scalar(32)); - MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); - MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); + // Can only do a readlane of 32-bit pieces. + if (OpSize == 32) { + // Avoid extra copies in the simple case of one 32-bit register. + Register CurrentLaneOpReg + = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass); + MRI.setType(CurrentLaneOpReg, OpTy); - // Read the next variant <- also loop target. - BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), - CurrentLaneOpReg) - .addReg(UnmergePiece); - ReadlanePieces.push_back(CurrentLaneOpReg); - } + constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI); + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(Op.getReg()); Register NewCondReg - = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); bool First = CondReg == AMDGPU::NoRegister; if (First) CondReg = NewCondReg; - B.buildInstr(CmpOp) + // Compare the just read M0 value to all possible Idx values. + B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64) .addDef(NewCondReg) .addReg(CurrentLaneOpReg) - .addReg(UnmergePiece); + .addReg(Op.getReg()); + Op.setReg(CurrentLaneOpReg); if (!First) { Register AndReg @@ -892,19 +793,115 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( .addReg(CondReg); CondReg = AndReg; } - } - - // FIXME: Build merge seems to switch to CONCAT_VECTORS but not - // BUILD_VECTOR - if (OpTy.isVector()) { - auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); } else { - auto Merge = B.buildMerge(OpTy, ReadlanePieces); - Op.setReg(Merge.getReg(0)); - } + LLT S32 = LLT::scalar(32); + SmallVector ReadlanePieces; + + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. + + bool Is64 = OpSize % 64 == 0; + + LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32); + unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64 + : AMDGPU::V_CMP_EQ_U32_e64; + + // The compares can be done as 64-bit, but the extract needs to be done + // in 32-bit pieces. + + // Insert the unmerge before the loop. + + B.setMBB(MBB); + auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg()); + B.setInstr(*I); + + unsigned NumPieces = Unmerge->getNumOperands() - 1; + for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) { + Register UnmergePiece = Unmerge.getReg(PieceIdx); - MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); + Register CurrentLaneOpReg; + if (Is64) { + Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32); + Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32); + + MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass); + MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass); + MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegLo) + .addReg(UnmergePiece, 0, AMDGPU::sub0); + + // Read the next variant <- also loop target. 
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpRegHi) + .addReg(UnmergePiece, 0, AMDGPU::sub1); + + CurrentLaneOpReg = + B.buildMerge(LLT::scalar(64), + {CurrentLaneOpRegLo, CurrentLaneOpRegHi}) + .getReg(0); + + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass); + + if (OpTy.getScalarSizeInBits() == 64) { + // If we need to produce a 64-bit element vector, so use the + // merged pieces + ReadlanePieces.push_back(CurrentLaneOpReg); + } else { + // 32-bit element type. + ReadlanePieces.push_back(CurrentLaneOpRegLo); + ReadlanePieces.push_back(CurrentLaneOpRegHi); + } + } else { + CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32); + MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass); + MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass); + + // Read the next variant <- also loop target. + BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), + CurrentLaneOpReg) + .addReg(UnmergePiece); + ReadlanePieces.push_back(CurrentLaneOpReg); + } + + Register NewCondReg + = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + bool First = CondReg == AMDGPU::NoRegister; + if (First) + CondReg = NewCondReg; + + B.buildInstr(CmpOp) + .addDef(NewCondReg) + .addReg(CurrentLaneOpReg) + .addReg(UnmergePiece); + + if (!First) { + Register AndReg + = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass); + + // If there are multiple operands to consider, and the conditions. + B.buildInstr(AMDGPU::S_AND_B64) + .addDef(AndReg) + .addReg(NewCondReg) + .addReg(CondReg); + CondReg = AndReg; + } + } + + // FIXME: Build merge seems to switch to CONCAT_VECTORS but not + // BUILD_VECTOR + if (OpTy.isVector()) { + auto Merge = B.buildBuildVector(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } else { + auto Merge = B.buildMerge(OpTy, ReadlanePieces); + Op.setReg(Merge.getReg(0)); + } + + MRI.setRegBank(Op.getReg(), getRegBank(AMDGPU::SGPRRegBankID)); + } } } } @@ -947,6 +944,40 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop( return true; } +// Return any unique registers used by \p MI at \p OpIndices that need to be +// handled in a waterfall loop. Returns these registers in \p +// SGPROperandRegs. Returns true if there are any operansd to handle and a +// waterfall loop is necessary. +bool AMDGPURegisterBankInfo::collectWaterfallOperands( + SmallSet &SGPROperandRegs, MachineInstr &MI, + MachineRegisterInfo &MRI, ArrayRef OpIndices) const { + for (unsigned Op : OpIndices) { + assert(MI.getOperand(Op).isUse()); + Register Reg = MI.getOperand(Op).getReg(); + const RegisterBank *OpBank = getRegBank(Reg, MRI, *TRI); + if (OpBank->getID() == AMDGPU::VGPRRegBankID) + SGPROperandRegs.insert(Reg); + } + + // No operands need to be replaced, so no need to loop. + return !SGPROperandRegs.empty(); +} + +bool AMDGPURegisterBankInfo::executeInWaterfallLoop( + MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, + ArrayRef OpIndices) const { + // Use a set to avoid extra readfirstlanes in the case where multiple operands + // are the same register. 
+ SmallSet SGPROperandRegs; + + if (!collectWaterfallOperands(SGPROperandRegs, MI, MRI, OpIndices)) + return false; + + MachineBasicBlock::iterator I = MI.getIterator(); + return executeInWaterfallLoop(B, make_range(I, std::next(I)), + SGPROperandRegs, MRI); +} + bool AMDGPURegisterBankInfo::executeInWaterfallLoop( MachineInstr &MI, MachineRegisterInfo &MRI, ArrayRef OpIndices) const { @@ -1602,10 +1633,69 @@ void AMDGPURegisterBankInfo::applyMappingImpl( MI.eraseFromParent(); return; } - case AMDGPU::G_EXTRACT_VECTOR_ELT: - applyDefaultMapping(OpdMapper); - executeInWaterfallLoop(MI, MRI, { 2 }); + case AMDGPU::G_EXTRACT_VECTOR_ELT: { + SmallVector DstRegs(OpdMapper.getVRegs(0)); + + assert(empty(OpdMapper.getVRegs(1)) && empty(OpdMapper.getVRegs(2))); + + if (DstRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 2 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register IdxReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + + assert(DstTy.getSizeInBits() == 64); + + LLT SrcTy = MRI.getType(SrcReg); + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). + auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo); + B.buildExtractVectorElement(DstRegs[1], CastSrc, IdxHi); + + const ValueMapping &DstMapping + = OpdMapper.getInstrMapping().getOperandMapping(0); + + // FIXME: Should be getting from mapping or not? + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + MRI.setRegBank(DstReg, *DstMapping.BreakDown[0].RegBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + SmallSet OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 2 })) { + MI.eraseFromParent(); + return; + } + + // Remove the original instruction to avoid potentially confusing the + // waterfall loop logic. + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); return; + } case AMDGPU::G_INTRINSIC: { switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_s_buffer_load: { @@ -2317,7 +2407,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned IdxBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); unsigned OutputBankID = regBankUnion(SrcBankID, IdxBank); - OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, DstSize); + OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(OutputBankID, DstSize); OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, SrcSize); // The index can be either if the source vector is VGPR. 
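The index arithmetic added in applyMappingImpl above is worth seeing in
isolation; a scalar C++ model (illustrative only, little-endian lane order
as in the generated G_MERGE_VALUES):

    #include <cstdint>

    // The <N x s64> source is bitcast to <2N x s32>; element Idx becomes
    // 32-bit elements 2*Idx and 2*Idx+1, then the halves are re-merged.
    uint64_t extractS64(const uint32_t *Vec32, uint32_t Idx) {
      uint32_t IdxLo = Idx << 1;      // G_SHL %idx, 1
      uint32_t IdxHi = IdxLo + 1;     // G_ADD %idxlo, 1
      uint64_t Lo = Vec32[IdxLo];     // G_EXTRACT_VECTOR_ELT, low half
      uint64_t Hi = Vec32[IdxHi];     // G_EXTRACT_VECTOR_ELT, high half
      return (Hi << 32) | Lo;         // G_MERGE_VALUES
    }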
diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h index 584b23c0c220..a14b74961118 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h @@ -13,6 +13,8 @@ #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERBANKINFO_H +#include "llvm/ADT/SmallSet.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/Register.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" @@ -42,6 +44,18 @@ class AMDGPURegisterBankInfo : public AMDGPUGenRegisterBankInfo { const SIRegisterInfo *TRI; const SIInstrInfo *TII; + bool collectWaterfallOperands( + SmallSet &SGPROperandRegs, + MachineInstr &MI, + MachineRegisterInfo &MRI, + ArrayRef OpIndices) const; + + bool executeInWaterfallLoop( + MachineIRBuilder &B, + iterator_range Range, + SmallSet &SGPROperandRegs, + MachineRegisterInfo &MRI) const; + bool executeInWaterfallLoop(MachineIRBuilder &B, MachineInstr &MI, MachineRegisterInfo &MRI, diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir index 5613b1b4c27d..b390e33e4ae6 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-extract-vector-elt.mir @@ -116,6 +116,102 @@ body: | $vgpr0 = COPY %2 ... +--- +name: extract_vector_elt_v8s64_ss +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + ; CHECK-LABEL: name: extract_vector_elt_v8s64_ss + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; CHECK: [[EVEC:%[0-9]+]]:sgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[COPY1]](s32) + ; CHECK: $sgpr0_sgpr1 = COPY [[EVEC]](s64) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s32) = COPY $sgpr16 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $sgpr0_sgpr1 = COPY %2 +... 
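Two of the variants that follow (_sv and _vv) wrap the split extract in a
waterfall loop because the index lives in a VGPR. A host-side model of that
control flow (illustrative C++, not LLVM code; Process stands in for the
moved instruction range):

    #include <cstdint>

    template <typename Fn>
    void waterfall(const uint32_t *LaneVals, uint64_t Exec, Fn Process) {
      while (Exec) {                                // S_CBRANCH_EXECNZ loop
        unsigned First = __builtin_ctzll(Exec);     // V_READFIRSTLANE_B32
        uint32_t Uniform = LaneVals[First];         // now a uniform value
        uint64_t Match = 0;                         // V_CMP_EQ_U32 lane mask
        for (unsigned L = 0; L != 64; ++L)
          if (((Exec >> L) & 1) && LaneVals[L] == Uniform)
            Match |= 1ull << L;
        Process(Uniform, Match);                    // body runs, exec = Match
        Exec &= ~Match;                             // retire matching lanes
      }
    }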
+ +--- +name: extract_vector_elt_v8s64_vs +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + + ; CHECK-LABEL: name: extract_vector_elt_v8s64_vs + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY1]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; CHECK: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s32) = COPY $sgpr0 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + +--- +name: extract_vector_elt_v8s64_sv +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + ; CHECK-LABEL: name: extract_vector_elt_v8s64_sv + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF4]], %bb.0, %20, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; CHECK: 
[[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s32) = COPY $vgpr0 + %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 + $vgpr0_vgpr1 = COPY %2 +... + --- name: extract_vector_elt_v8s64_vv legalized: true @@ -129,16 +225,27 @@ body: | ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; CHECK: [[COPY1:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 - ; CHECK: [[DEF:%[0-9]+]]:vgpr(s64) = G_IMPLICIT_DEF - ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec ; CHECK: .1: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %8, %bb.1 - ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s64) = G_PHI [[DEF]](s64), %bb.0, %2(s64), %bb.1 + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF4]], %bb.0, %20, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %7(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF2]](s32), %bb.0, %3(s32), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF3]](s32), %bb.0, %4(s32), %bb.1 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY1]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY1]](s32), implicit $exec - ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s64) = G_EXTRACT_VECTOR_ELT [[COPY]](<8 x s64>), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[EVEC:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[SHL]](s32) + ; CHECK: [[EVEC1:%[0-9]+]]:vgpr(s32) = G_EXTRACT_VECTOR_ELT [[BITCAST]](<16 x s32>), [[ADD]](s32) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec @@ -146,7 +253,8 @@ body: | ; CHECK: successors: %bb.3(0x80000000) ; CHECK: $exec = 
S_MOV_B64_term [[S_MOV_B64_term]] ; CHECK: .3: - ; CHECK: $vgpr0_vgpr1 = COPY [[EVEC]](s64) + ; CHECK: [[MV:%[0-9]+]]:vgpr(s64) = G_MERGE_VALUES [[EVEC]](s32), [[EVEC1]](s32) + ; CHECK: $vgpr0_vgpr1 = COPY [[MV]](s64) %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 %1:_(s32) = COPY $vgpr16 %2:_(s64) = G_EXTRACT_VECTOR_ELT %0, %1 From 204eb4858d79a0adbf7fa348e9d4443c6fdf0600 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 3 Oct 2019 17:59:03 +0000 Subject: [PATCH 78/82] AMDGPU/GlobalISel: Handle RegBankSelect of G_INSERT_VECTOR_ELT git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373639 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 82 +++- .../regbankselect-insert-vector-elt.mir | 395 +++++++++++++++++- 2 files changed, 460 insertions(+), 17 deletions(-) diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index cc7b2fb66fbe..b868e1d6437a 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1696,6 +1696,75 @@ void AMDGPURegisterBankInfo::applyMappingImpl( OpsToWaterfall, MRI); return; } + case AMDGPU::G_INSERT_VECTOR_ELT: { + SmallVector InsRegs(OpdMapper.getVRegs(2)); + + assert(empty(OpdMapper.getVRegs(0))); + assert(empty(OpdMapper.getVRegs(1))); + assert(empty(OpdMapper.getVRegs(3))); + + if (InsRegs.empty()) { + applyDefaultMapping(OpdMapper); + executeInWaterfallLoop(MI, MRI, { 3 }); + return; + } + + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(1).getReg(); + Register InsReg = MI.getOperand(2).getReg(); + Register IdxReg = MI.getOperand(3).getReg(); + LLT SrcTy = MRI.getType(SrcReg); + LLT InsTy = MRI.getType(InsReg); + + assert(InsTy.getSizeInBits() == 64); + + const LLT S32 = LLT::scalar(32); + LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32); + + MachineIRBuilder B(MI); + auto CastSrc = B.buildBitcast(Vec32, SrcReg); + auto One = B.buildConstant(S32, 1); + + // Split the vector index into 32-bit pieces. Prepare to move all of the + // new instructions into a waterfall loop if necessary. + // + // Don't put the bitcast or constant in the loop. + MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB()); + + // Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1). 
+ auto IdxLo = B.buildShl(S32, IdxReg, One); + auto IdxHi = B.buildAdd(S32, IdxLo, One); + + auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo); + auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi); + B.buildBitcast(DstReg, InsHi); + + const RegisterBank *DstBank = getRegBank(DstReg, MRI, *TRI); + const RegisterBank *SrcBank = getRegBank(SrcReg, MRI, *TRI); + const RegisterBank *InsSrcBank = getRegBank(InsReg, MRI, *TRI); + + MRI.setRegBank(InsReg, *InsSrcBank); + MRI.setRegBank(CastSrc.getReg(0), *SrcBank); + MRI.setRegBank(InsLo.getReg(0), *DstBank); + MRI.setRegBank(InsHi.getReg(0), *DstBank); + MRI.setRegBank(One.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxLo.getReg(0), AMDGPU::SGPRRegBank); + MRI.setRegBank(IdxHi.getReg(0), AMDGPU::SGPRRegBank); + + + SmallSet OpsToWaterfall; + if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) { + MI.eraseFromParent(); + return; + } + + B.setInstr(*Span.begin()); + MI.eraseFromParent(); + + executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()), + OpsToWaterfall, MRI); + return; + } case AMDGPU::G_INTRINSIC: { switch (MI.getIntrinsicID()) { case Intrinsic::amdgcn_s_buffer_load: { @@ -2421,15 +2490,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(); unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits(); - unsigned InsertEltBank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI); - unsigned IdxBank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); + unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI); + unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(), + MRI, *TRI); + unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI); OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize); - OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBank, InsertSize); + OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize); + OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID, + InsertSize); // The index can be either if the source vector is VGPR. 
- OpdsMapping[3] = AMDGPU::getValueMapping(IdxBank, IdxSize); + OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize); break; } case AMDGPU::G_UNMERGE_VALUES: { diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir index a3e7d7423b38..18f97fbeb91b 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect-insert-vector-elt.mir @@ -56,8 +56,7 @@ body: | ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 @@ -69,17 +68,35 @@ body: | --- name: insert_vector_elt_v4i32_s_s_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0 ; CHECK-LABEL: name: insert_vector_elt_v4i32_s_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr4, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $sgpr4 @@ -91,17 +108,35 @@ body: | --- name: insert_vector_elt_v4i32_s_v_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1 ; CHECK-LABEL: name: insert_vector_elt_v4i32_s_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3, $vgpr0, $vgpr1 ; CHECK: 
[[COPY:%[0-9]+]]:sgpr(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 - ; CHECK: [[COPY3:%[0-9]+]]:vgpr(<4 x s32>) = COPY [[COPY]](<4 x s32>) - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY3]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr1 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3 %1:_(s32) = COPY $vgpr0 @@ -113,16 +148,35 @@ body: | --- name: insert_vector_elt_var_v4i32_v_s_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr0 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $sgpr4, $vgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: 
successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $sgpr4 @@ -134,12 +188,14 @@ body: | --- name: insert_vector_elt_var_v4i32_v_v_s legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr0 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_v_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $sgpr0 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 @@ -155,16 +211,35 @@ body: | --- name: insert_vector_elt_var_v4i32_v_v_v legalized: true +tracksRegLiveness: true body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 ; CHECK-LABEL: name: insert_vector_elt_var_v4i32_v_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4, $vgpr5 ; CHECK: [[COPY:%[0-9]+]]:vgpr(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s32) = COPY $vgpr4 - ; CHECK: [[COPY2:%[0-9]+]]:vgpr(s32) = COPY $vgpr5 - ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[COPY2]](s32) + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr5 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<4 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<4 x s32>) = G_PHI [[DEF]](<4 x s32>), %bb.0, %3(<4 x s32>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<4 x s32>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s32), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[IVEC]](<4 x s32>) %0:_(<4 x s32>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 %1:_(s32) = COPY $vgpr4 @@ -172,3 +247,299 @@ body: | %3:_(<4 x s32>) = G_INSERT_VECTOR_ELT %0, %1, %2 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY %3 ... 
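The CHECK lines above spell out the waterfall loop that RegBankSelect emits when the index is divergent. A rough scalar model of the algorithm (illustrative sketch only, not LLVM code; assumes a 64-lane wave, waterfall and Op are invented names, and __builtin_ctzll is a GCC/Clang builtin):

    #include <array>
    #include <cstdint>

    constexpr unsigned WaveSize = 64;

    // Each iteration picks the index held by the first still-active lane, runs
    // the body for every lane holding that same index, then retires those lanes.
    template <typename Fn>
    void waterfall(const std::array<uint32_t, WaveSize> &Idx, uint64_t Exec,
                   Fn Op) {
      const uint64_t SavedExec = Exec;          // S_MOV_B64_term $exec
      while (Exec) {                            // S_CBRANCH_EXECNZ back-edge
        unsigned First = __builtin_ctzll(Exec); // first still-active lane
        uint32_t Uniform = Idx[First];          // V_READFIRSTLANE_B32
        uint64_t Same = 0;                      // V_CMP_EQ_U32 result
        for (unsigned L = 0; L < WaveSize; ++L)
          if (((Exec >> L) & 1) && Idx[L] == Uniform)
            Same |= 1ull << L;
        Op(Uniform, Same); // loop body executes with a now-uniform (SGPR) index
        Exec &= ~Same;     // S_AND_SAVEEXEC / S_XOR_B64_term: retire those lanes
      }
      (void)SavedExec;     // $exec = S_MOV_B64_term: exec is restored afterwards
    }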
+ +--- +name: insert_vector_elt_v8s64_s_s_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_s_s + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[IVEC:%[0-9]+]]:sgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[COPY2]](s32) + ; CHECK: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY %3 +... + +--- +name: insert_vector_elt_v8s64_v_s_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr16_sgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_s_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr16_sgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[COPY2]](s32) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_s_v_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $sgpr16 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_v_s + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $sgpr16 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr16 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY2]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $vgpr0_vgpr1 + %2:_(s32) = COPY $sgpr16 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
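The s_v_s case above shows the complementary split on the insert side with a uniform index: G_UNMERGE_VALUES breaks the s64 value into two s32 halves, which land at 2*Idx and 2*Idx+1 of the <16 x s32> view, with no waterfall loop needed. A matching scalar sketch (illustrative only; insert64 is a made-up helper):

    #include <cstdint>

    static void insert64(uint32_t *Vec32, unsigned Idx, uint64_t Val) {
      unsigned IdxLo = Idx << 1;                       // G_SHL Idx, 1
      unsigned IdxHi = IdxLo + 1;                      // G_ADD IdxLo, 1
      Vec32[IdxLo] = static_cast<uint32_t>(Val);       // insert the low half
      Vec32[IdxHi] = static_cast<uint32_t>(Val >> 32); // insert the high half
    }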
+ +--- +name: insert_vector_elt_v8s64_s_s_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $vgpr0 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $sgpr16_sgpr17, $vgpr0 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr16_sgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr0 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $sgpr16_sgpr17 + %2:_(s32) = COPY $vgpr0 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_s_v_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $vgpr2 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_s_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, $vgpr0_vgpr1, $vgpr2 + ; CHECK: [[COPY:%[0-9]+]]:sgpr(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr0_vgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr2 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:sgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF5]], %bb.0, %25, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 + ; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + %1:_(s64) = COPY $vgpr0_vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... + +--- +name: insert_vector_elt_v8s64_v_v_s +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $sgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_v_s + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $sgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr16_vgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY $sgpr18 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[COPY2]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $vgpr16_vgpr17 + %2:_(s32) = COPY $sgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_v_s_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0_sgpr1, $vgpr16 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_s_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $sgpr0_sgpr1, $vgpr16 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:sgpr(s64) = COPY $sgpr0_sgpr1 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr16 + ; CHECK: [[DEF:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF1]], %bb.0, %9, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<8 x s64>) = G_INSERT_VECTOR_ELT [[COPY]], [[COPY1]](s64), [[V_READFIRSTLANE_B32_]](s32) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[IVEC]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $sgpr0_sgpr1 + %2:_(s32) = COPY $vgpr16 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... 
+ +--- +name: insert_vector_elt_v8s64_v_v_v +legalized: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 + + ; CHECK-LABEL: name: insert_vector_elt_v8s64_v_v_v + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, $vgpr16_vgpr17, $vgpr18 + ; CHECK: [[COPY:%[0-9]+]]:vgpr(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + ; CHECK: [[COPY1:%[0-9]+]]:vgpr(s64) = COPY $vgpr16_vgpr17 + ; CHECK: [[COPY2:%[0-9]+]]:vgpr_32(s32) = COPY $vgpr18 + ; CHECK: [[UV:%[0-9]+]]:vgpr(s32), [[UV1:%[0-9]+]]:vgpr(s32) = G_UNMERGE_VALUES [[COPY1]](s64) + ; CHECK: [[BITCAST:%[0-9]+]]:vgpr(<16 x s32>) = G_BITCAST [[COPY]](<8 x s64>) + ; CHECK: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK: [[DEF:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF1:%[0-9]+]]:sgpr(s32) = G_IMPLICIT_DEF + ; CHECK: [[DEF2:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF3:%[0-9]+]]:vgpr(<16 x s32>) = G_IMPLICIT_DEF + ; CHECK: [[DEF4:%[0-9]+]]:vgpr(<8 x s64>) = G_IMPLICIT_DEF + ; CHECK: [[DEF5:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF + ; CHECK: [[S_MOV_B64_term:%[0-9]+]]:sreg_64_xexec = S_MOV_B64_term $exec + ; CHECK: .1: + ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK: [[PHI:%[0-9]+]]:sreg_64 = PHI [[DEF5]], %bb.0, %25, %bb.1 + ; CHECK: [[PHI1:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF]](s32), %bb.0, %8(s32), %bb.1 + ; CHECK: [[PHI2:%[0-9]+]]:sgpr(s32) = G_PHI [[DEF1]](s32), %bb.0, %9(s32), %bb.1 + ; CHECK: [[PHI3:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF2]](<16 x s32>), %bb.0, %10(<16 x s32>), %bb.1 + ; CHECK: [[PHI4:%[0-9]+]]:vgpr(<16 x s32>) = G_PHI [[DEF3]](<16 x s32>), %bb.0, %11(<16 x s32>), %bb.1 + ; CHECK: [[PHI5:%[0-9]+]]:vgpr(<8 x s64>) = G_PHI [[DEF4]](<8 x s64>), %bb.0, %3(<8 x s64>), %bb.1 + ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY2]](s32), implicit $exec + ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY2]](s32), implicit $exec + ; CHECK: [[SHL:%[0-9]+]]:sgpr(s32) = G_SHL [[V_READFIRSTLANE_B32_]], [[C]](s32) + ; CHECK: [[ADD:%[0-9]+]]:sgpr(s32) = G_ADD [[SHL]], [[C]] + ; CHECK: [[IVEC:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[BITCAST]], [[UV]](s32), [[SHL]](s32) + ; CHECK: [[IVEC1:%[0-9]+]]:vgpr(<16 x s32>) = G_INSERT_VECTOR_ELT [[IVEC]], [[UV1]](s32), [[ADD]](s32) + ; CHECK: [[BITCAST1:%[0-9]+]]:vgpr(<8 x s64>) = G_BITCAST [[IVEC1]](<16 x s32>) + ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64 = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec + ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc + ; CHECK: S_CBRANCH_EXECNZ %bb.1, implicit $exec + ; CHECK: .2: + ; CHECK: successors: %bb.3(0x80000000) + ; CHECK: $exec = S_MOV_B64_term [[S_MOV_B64_term]] + ; CHECK: .3: + ; CHECK: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY [[BITCAST1]](<8 x s64>) + %0:_(<8 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 + %1:_(s64) = COPY $vgpr16_vgpr17 + %2:_(s32) = COPY $vgpr18 + %3:_(<8 x s64>) = G_INSERT_VECTOR_ELT %0, %1, %2 + 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY %3 +... From 78a388a985b468dde9509368b5a9528d4ce4f01c Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 3 Oct 2019 18:02:09 +0000 Subject: [PATCH 79/82] [dsymutil] Don't overload LinkOptions. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should fix the build bots: error: declaration of ‘llvm::dsymutil::LinkOptions DsymutilOptions::LinkOptions’ [-fpermissive] git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373640 91177308-0d34-0410-b5e6-96231b3b80d8 --- tools/dsymutil/dsymutil.cpp | 69 ++++++++++++++++++------------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/tools/dsymutil/dsymutil.cpp b/tools/dsymutil/dsymutil.cpp index aa23891ab39a..fe69abed0a82 100644 --- a/tools/dsymutil/dsymutil.cpp +++ b/tools/dsymutil/dsymutil.cpp @@ -95,7 +95,7 @@ struct DsymutilOptions { std::vector Archs; std::vector InputFiles; unsigned NumThreads; - dsymutil::LinkOptions LinkOptions; + dsymutil::LinkOptions LinkOpts; }; /// Return a list of input files. This function has logic for dealing with the @@ -148,7 +148,7 @@ static Expected> getInputs(opt::InputArgList &Args, // Verify that the given combination of options makes sense. static Error verifyOptions(const DsymutilOptions &Options) { - if (Options.LinkOptions.Update && + if (Options.LinkOpts.Update && std::find(Options.InputFiles.begin(), Options.InputFiles.end(), "-") != Options.InputFiles.end()) { // FIXME: We cannot use stdin for an update because stdin will be @@ -209,15 +209,15 @@ static Expected getOptions(opt::InputArgList &Args) { Options.PaperTrailWarnings = Args.hasArg(OPT_papertrail); Options.Verify = Args.hasArg(OPT_verify); - Options.LinkOptions.Minimize = Args.hasArg(OPT_minimize); - Options.LinkOptions.NoODR = Args.hasArg(OPT_no_odr); - Options.LinkOptions.NoOutput = Args.hasArg(OPT_no_output); - Options.LinkOptions.NoTimestamp = Args.hasArg(OPT_no_swiftmodule_timestamp); - Options.LinkOptions.Update = Args.hasArg(OPT_update); - Options.LinkOptions.Verbose = Args.hasArg(OPT_verbose); + Options.LinkOpts.Minimize = Args.hasArg(OPT_minimize); + Options.LinkOpts.NoODR = Args.hasArg(OPT_no_odr); + Options.LinkOpts.NoOutput = Args.hasArg(OPT_no_output); + Options.LinkOpts.NoTimestamp = Args.hasArg(OPT_no_swiftmodule_timestamp); + Options.LinkOpts.Update = Args.hasArg(OPT_update); + Options.LinkOpts.Verbose = Args.hasArg(OPT_verbose); if (Expected AccelKind = getAccelTableKind(Args)) { - Options.LinkOptions.TheAccelTableKind = *AccelKind; + Options.LinkOpts.TheAccelTableKind = *AccelKind; } else { return AccelKind.takeError(); } @@ -226,10 +226,10 @@ static Expected getOptions(opt::InputArgList &Args) { Options.SymbolMap = SymbolMap->getValue(); if (Args.hasArg(OPT_symbolmap)) - Options.LinkOptions.Update = true; + Options.LinkOpts.Update = true; if (Expected> InputFiles = - getInputs(Args, Options.LinkOptions.Update)) { + getInputs(Args, Options.LinkOpts.Update)) { Options.InputFiles = std::move(*InputFiles); } else { return InputFiles.takeError(); @@ -239,7 +239,7 @@ static Expected getOptions(opt::InputArgList &Args) { Options.Archs.push_back(Arch->getValue()); if (opt::Arg *OsoPrependPath = Args.getLastArg(OPT_oso_prepend_path)) - Options.LinkOptions.PrependPath = OsoPrependPath->getValue(); + Options.LinkOpts.PrependPath = OsoPrependPath->getValue(); if (opt::Arg *OutputFile = Args.getLastArg(OPT_output)) Options.OutputFile = 
OutputFile->getValue(); @@ -248,15 +248,15 @@ static Expected getOptions(opt::InputArgList &Args) { Options.Toolchain = Toolchain->getValue(); if (Args.hasArg(OPT_assembly)) - Options.LinkOptions.FileType = OutputFileType::Assembly; + Options.LinkOpts.FileType = OutputFileType::Assembly; if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads)) - Options.LinkOptions.Threads = atoi(NumThreads->getValue()); + Options.LinkOpts.Threads = atoi(NumThreads->getValue()); else - Options.LinkOptions.Threads = thread::hardware_concurrency(); + Options.LinkOpts.Threads = thread::hardware_concurrency(); - if (Options.DumpDebugMap || Options.LinkOptions.Verbose) - Options.LinkOptions.Threads = 1; + if (Options.DumpDebugMap || Options.LinkOpts.Verbose) + Options.LinkOpts.Threads = 1; if (getenv("RC_DEBUG_OPTIONS")) Options.PaperTrailWarnings = true; @@ -387,7 +387,7 @@ getOutputFileName(StringRef InputFile, const DsymutilOptions &Options) { // When updating, do in place replacement. if (Options.OutputFile.empty() && - (Options.LinkOptions.Update || !Options.SymbolMap.empty())) + (Options.LinkOpts.Update || !Options.SymbolMap.empty())) return OutputLocation(InputFile); // If a flat dSYM has been requested, things are pretty simple. @@ -413,7 +413,7 @@ getOutputFileName(StringRef InputFile, const DsymutilOptions &Options) { SmallString<128> Path(Options.OutputFile); if (Path.empty()) Path = DwarfFile + ".dSYM"; - if (!Options.LinkOptions.NoOutput) { + if (!Options.LinkOpts.NoOutput) { if (auto E = createBundleDir(Path)) return std::move(E); if (auto E = createPlistFile(DwarfFile, Path, Options.Toolchain)) @@ -481,14 +481,14 @@ int main(int argc, char **argv) { for (auto &InputFile : Options.InputFiles) { // Dump the symbol table for each input file and requested arch if (Options.DumpStab) { - if (!dumpStab(InputFile, Options.Archs, Options.LinkOptions.PrependPath)) + if (!dumpStab(InputFile, Options.Archs, Options.LinkOpts.PrependPath)) return 1; continue; } auto DebugMapPtrsOrErr = - parseDebugMap(InputFile, Options.Archs, Options.LinkOptions.PrependPath, - Options.PaperTrailWarnings, Options.LinkOptions.Verbose, + parseDebugMap(InputFile, Options.Archs, Options.LinkOpts.PrependPath, + Options.PaperTrailWarnings, Options.LinkOpts.Verbose, Options.InputIsYAMLDebugMap); if (auto EC = DebugMapPtrsOrErr.getError()) { @@ -497,7 +497,7 @@ int main(int argc, char **argv) { return 1; } - if (Options.LinkOptions.Update) { + if (Options.LinkOpts.Update) { // The debug map should be empty. Add one object file corresponding to // the input file. for (auto &Map : *DebugMapPtrsOrErr) @@ -514,27 +514,27 @@ int main(int argc, char **argv) { // Shared a single binary holder for all the link steps. BinaryHolder BinHolder; - unsigned ThreadCount = std::min(Options.LinkOptions.Threads, - DebugMapPtrsOrErr->size()); + unsigned ThreadCount = + std::min(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size()); ThreadPool Threads(ThreadCount); // If there is more than one link to execute, we need to generate // temporary files. 
   bool NeedsTempFiles =
       !Options.DumpDebugMap && (Options.OutputFile != "-") &&
-      (DebugMapPtrsOrErr->size() != 1 || Options.LinkOptions.Update);
+      (DebugMapPtrsOrErr->size() != 1 || Options.LinkOpts.Update);
 
   SmallVector TempFiles;
   std::atomic_char AllOK(1);
   for (auto &Map : *DebugMapPtrsOrErr) {
-    if (Options.LinkOptions.Verbose || Options.DumpDebugMap)
+    if (Options.LinkOpts.Verbose || Options.DumpDebugMap)
       Map->print(outs());
 
     if (Options.DumpDebugMap)
       continue;
 
     if (!Options.SymbolMap.empty())
-      Options.LinkOptions.Translator = SymMapLoader.Load(InputFile, *Map);
+      Options.LinkOpts.Translator = SymMapLoader.Load(InputFile, *Map);
 
     if (Map->begin() == Map->end())
       WithColor::warning()
@@ -551,7 +551,7 @@ int main(int argc, char **argv) {
         WithColor::error() << toString(OutputLocationOrErr.takeError());
         return 1;
       }
-      Options.LinkOptions.ResourceDir = OutputLocationOrErr->getResourceDir();
+      Options.LinkOpts.ResourceDir = OutputLocationOrErr->getResourceDir();
 
       std::string OutputFile = OutputLocationOrErr->DWARFFile;
       if (NeedsTempFiles) {
@@ -570,15 +570,14 @@ int main(int argc, char **argv) {
     } else {
       std::error_code EC;
       OS = std::make_shared<raw_fd_ostream>(
-          Options.LinkOptions.NoOutput ? "-" : OutputFile, EC,
-          sys::fs::OF_None);
+          Options.LinkOpts.NoOutput ? "-" : OutputFile, EC, sys::fs::OF_None);
       if (EC) {
         WithColor::error() << OutputFile << ": " << EC.message();
         return 1;
       }
     }
 
-    const bool Verify = Options.Verify && !Options.LinkOptions.NoOutput;
+    const bool Verify = Options.Verify && !Options.LinkOpts.NoOutput;
     auto LinkLambda = [&, OutputFile](std::shared_ptr<raw_fd_ostream> Stream,
                                       LinkOptions Options) {
       AllOK.fetch_and(
@@ -593,9 +592,9 @@ int main(int argc, char **argv) {
     // out the (significantly smaller) stack when using threads. We don't
     // want this limitation when we only have a single thread.
     if (ThreadCount == 1)
-      LinkLambda(OS, Options.LinkOptions);
+      LinkLambda(OS, Options.LinkOpts);
     else
-      Threads.async(LinkLambda, OS, Options.LinkOptions);
+      Threads.async(LinkLambda, OS, Options.LinkOpts);
   }
 
   Threads.wait();
@@ -612,7 +611,7 @@ int main(int argc, char **argv) {
       }
       if (!MachOUtils::generateUniversalBinary(TempFiles,
                                                OutputLocationOrErr->DWARFFile,
-                                               Options.LinkOptions, SDKPath))
+                                               Options.LinkOpts, SDKPath))
         return 1;
     }
   }

From 0b50512f06f27242b39cc3290d76bf8f2463f5c3 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 3 Oct 2019 18:13:50 +0000
Subject: [PATCH 80/82] [X86] matchShuffleWithSHUFPD - use Zeroable element
 mask directly. NFCI.

We can make use of the Zeroable mask to indicate which elements we can
safely set to zero instead of creating a target shuffle mask on the fly.

This only leaves one user of createTargetShuffleMask which we can
hopefully get rid of in a similar manner.

This is part of the work to fix PR43024 and allow us to use
SimplifyDemandedElts to simplify shuffle chains - we need to get to a
point where the target shuffle masks aren't adjusted by their source
inputs in setTargetShuffleZeroElements but instead we cache them in a
parallel Zeroable mask.
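For reference, one way to read the ZeroLane computation this patch touches: SHUFPD feeds even result elements from V1 and odd ones from V2, so an operand may be forced to all-zeroes only if every result element it feeds is already known zeroable. A standalone approximation (illustrative only; std::bitset stands in for APInt and computeZeroLanes is an invented name):

    #include <bitset>
    #include <cstddef>
    #include <vector>

    static void computeZeroLanes(const std::vector<int> &Mask,
                                 const std::bitset<8> &Zeroable,
                                 bool &ForceV1Zero, bool &ForceV2Zero) {
      bool ZeroLane[2] = {true, true};
      for (size_t i = 0; i != Mask.size(); ++i)
        ZeroLane[i & 1] = ZeroLane[i & 1] && Zeroable[i]; // mirrors the patch
      ForceV1Zero = ZeroLane[0]; // every even result element is zeroable
      ForceV2Zero = ZeroLane[1]; // every odd result element is zeroable
    }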
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373641 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 088af6c5e3e5..ae2ef76a850d 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -10300,6 +10300,7 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
 // Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
 // mask.
+// TODO: Do we need this? It might be better to use Mask+Zeroable directly.
 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                     const APInt &Zeroable) {
   int NumElts = Mask.size();
@@ -15452,7 +15453,8 @@ static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
 static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
                                    bool &ForceV1Zero, bool &ForceV2Zero,
-                                   unsigned &ShuffleImm, ArrayRef<int> Mask) {
+                                   unsigned &ShuffleImm, ArrayRef<int> Mask,
+                                   const APInt &Zeroable) {
   int NumElts = VT.getVectorNumElements();
   assert(VT.getScalarSizeInBits() == 64 &&
          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
@@ -15462,7 +15464,7 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
   bool ZeroLane[2] = { true, true };
   for (int i = 0; i < NumElts; ++i)
-    ZeroLane[i & 1] &= isUndefOrZero(Mask[i]);
+    ZeroLane[i & 1] &= Zeroable[i];
 
   // Mask for V8F64: 0/1,  8/9,  2/3,  10/11, 4/5, ..
   // Mask for V4F64; 0/1,  4/5,  2/3,  6/7..
@@ -15495,19 +15497,17 @@ static bool matchShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
 }
 
 static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
-                                      SDValue V2, ArrayRef<int> Original,
+                                      SDValue V2, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
   assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
          "Unexpected data type for VSHUFPD");
 
-  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
-
   unsigned Immediate = 0;
   bool ForceV1Zero = false, ForceV2Zero = false;
   if (!matchShuffleWithSHUFPD(VT, V1, V2, ForceV1Zero, ForceV2Zero, Immediate,
-                              Mask))
+                              Mask, Zeroable))
     return SDValue();
 
   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
@@ -32103,7 +32103,7 @@ static bool matchBinaryPermuteShuffle(
       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
     bool ForceV1Zero = false, ForceV2Zero = false;
     if (matchShuffleWithSHUFPD(MaskVT, V1, V2, ForceV1Zero, ForceV2Zero,
-                               PermuteImm, Mask)) {
+                               PermuteImm, Mask, Zeroable)) {
       V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
       V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
       Shuffle = X86ISD::SHUFP;

From b8a1095607972495853657878edcf14873ea2308 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 3 Oct 2019 18:34:42 +0000
Subject: [PATCH 81/82] [X86] Add v32i8 shuffle lowering strategy to recognize
 two v4i64 vectors truncated to v4i8 and concatenated into the lower 8 bytes
 with undef/zero upper bytes.

This patch recognizes the shuffle pattern we get from a v8i64->v8i8
truncate when v8i64 isn't a legal type.

With VLX we can use two VTRUNCs, unpckldq, and an insert_subvector.
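A scalar model of that lowering (illustrative only; the helper name is invented): each vpmovqb-style truncate yields 4 bytes with the rest zeroed, vpunpckldq interleaves the two low dwords, and the insert_subvector widens into a zero v32i8, leaving the 8 truncated bytes at the bottom.

    #include <cstdint>
    #include <cstring>

    static void truncV8I64ToV8I8(const uint64_t In[8], uint8_t Out[32]) {
      uint8_t Lo[4], Hi[4];
      for (int i = 0; i < 4; ++i) {
        Lo[i] = static_cast<uint8_t>(In[i]);     // VTRUNC of the low v4i64 half
        Hi[i] = static_cast<uint8_t>(In[i + 4]); // VTRUNC of the high v4i64 half
      }
      std::memset(Out, 0, 32);     // INSERT_SUBVECTOR into a zero v32i8
      std::memcpy(Out + 0, Lo, 4); // unpckldq: first dword comes from V1
      std::memcpy(Out + 4, Hi, 4); // unpckldq: second dword comes from V2
    }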
Differential Revision: https://reviews.llvm.org/D68374

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373645 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Target/X86/X86ISelLowering.cpp         | 44 ++++++++++++++++++++++
 test/CodeGen/X86/min-legal-vector-width.ll | 19 +++-------
 test/CodeGen/X86/shuffle-vs-trunc-512.ll   | 38 ++++++-------------
 3 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index ae2ef76a850d..407c4c8137d3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -15520,6 +15520,42 @@ static SDValue lowerShuffleWithSHUFPD(const SDLoc &DL, MVT VT, SDValue V1,
                          DAG.getTargetConstant(Immediate, DL, MVT::i8));
 }
 
+// Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
+// by zeroable elements in the remaining 24 elements. Turn this into two
+// vmovqb instructions shuffled together.
+static SDValue lowerShuffleAsVTRUNCAndUnpack(const SDLoc &DL, MVT VT,
+                                             SDValue V1, SDValue V2,
+                                             ArrayRef<int> Mask,
+                                             const APInt &Zeroable,
+                                             SelectionDAG &DAG) {
+  assert(VT == MVT::v32i8 && "Unexpected type!");
+
+  // The first 8 indices should be every 8th element.
+  if (!isSequentialOrUndefInRange(Mask, 0, 8, 0, 8))
+    return SDValue();
+
+  // Remaining elements need to be zeroable.
+  if (Zeroable.countLeadingOnes() < (Mask.size() - 8))
+    return SDValue();
+
+  V1 = DAG.getBitcast(MVT::v4i64, V1);
+  V2 = DAG.getBitcast(MVT::v4i64, V2);
+
+  V1 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V1);
+  V2 = DAG.getNode(X86ISD::VTRUNC, DL, MVT::v16i8, V2);
+
+  // The VTRUNCs will put 0s in the upper 12 bytes. Use them to put zeroes in
+  // the upper bits of the result using an unpckldq.
+  SDValue Unpack = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2,
+                                        { 0, 1, 2, 3, 16, 17, 18, 19,
+                                          4, 5, 6, 7, 20, 21, 22, 23 });
+  // Insert the unpckldq into a zero vector to widen to v32i8.
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v32i8,
+                     DAG.getConstant(0, DL, MVT::v32i8), Unpack,
+                     DAG.getIntPtrConstant(0, DL));
+}
+
+
 /// Handle lowering of 4-lane 64-bit floating point shuffles.
 ///
 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
@@ -16120,6 +16156,14 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
     return V;
 
+  // Look for {0, 8, 16, 24, 32, 40, 48, 56 } in the first 8 elements. Followed
+  // by zeroable elements in the remaining 24 elements. Turn this into two
+  // vmovqb instructions shuffled together.
+  if (Subtarget.hasVLX())
+    if (SDValue V = lowerShuffleAsVTRUNCAndUnpack(DL, MVT::v32i8, V1, V2,
+                                                  Mask, Zeroable, DAG))
+      return V;
+
   // Otherwise fall back on generic lowering.
return lowerShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG); diff --git a/test/CodeGen/X86/min-legal-vector-width.ll b/test/CodeGen/X86/min-legal-vector-width.ll index d0bc67a4485e..46e73c1f8542 100644 --- a/test/CodeGen/X86/min-legal-vector-width.ll +++ b/test/CodeGen/X86/min-legal-vector-width.ll @@ -831,19 +831,12 @@ define <16 x i8> @trunc_v16i32_v16i8(<16 x i32>* %x) nounwind "min-legal-vector- define <8 x i8> @trunc_v8i64_v8i8(<8 x i64>* %x) nounwind "min-legal-vector-width"="256" { ; CHECK-AVX512-LABEL: trunc_v8i64_v8i8: ; CHECK-AVX512: # %bb.0: -; CHECK-AVX512-NEXT: vmovdqa (%rdi), %xmm0 -; CHECK-AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; CHECK-AVX512-NEXT: vmovdqa 48(%rdi), %xmm3 -; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} xmm4 = -; CHECK-AVX512-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; CHECK-AVX512-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; CHECK-AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; CHECK-AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; CHECK-AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; CHECK-AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; CHECK-AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; CHECK-AVX512-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; CHECK-AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; CHECK-AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; CHECK-AVX512-NEXT: vpmovqb %ymm1, %xmm1 +; CHECK-AVX512-NEXT: vpmovqb %ymm0, %xmm0 +; CHECK-AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq ; ; CHECK-VBMI-LABEL: trunc_v8i64_v8i8: diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll index ebb3b623c467..720cabee9122 100644 --- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -549,20 +549,13 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512VL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512VL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512VL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512VL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512VL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512VL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512VL-NEXT: vpmovqb %ymm1, %xmm1 +; AVX512VL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512VL-NEXT: vmovq %xmm0, (%rsi) +; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8: @@ -585,20 +578,13 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind { ; ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vmovdqa (%rdi), %xmm0 -; AVX512BWVL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %xmm2 -; 
AVX512BWVL-NEXT: vmovdqa 48(%rdi), %xmm3 -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm4 = -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BWVL-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512BWVL-NEXT: vmovdqa {{.*#+}} xmm3 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512BWVL-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512BWVL-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512BWVL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] +; AVX512BWVL-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512BWVL-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512BWVL-NEXT: vpmovqb %ymm1, %xmm1 +; AVX512BWVL-NEXT: vpmovqb %ymm0, %xmm0 +; AVX512BWVL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; AVX512BWVL-NEXT: vmovq %xmm0, (%rsi) +; AVX512BWVL-NEXT: vzeroupper ; AVX512BWVL-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_to_v8i8: From 0fe42534131a63e6938c7922dbf93eac506bd70e Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Thu, 3 Oct 2019 18:35:44 +0000 Subject: [PATCH 82/82] [NFC] Fix unused variable in release builds git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@373646 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index b868e1d6437a..35040af98121 100644 --- a/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -1648,6 +1648,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( Register SrcReg = MI.getOperand(1).getReg(); Register IdxReg = MI.getOperand(2).getReg(); LLT DstTy = MRI.getType(DstReg); + (void)DstTy; assert(DstTy.getSizeInBits() == 64); @@ -1715,6 +1716,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl( Register IdxReg = MI.getOperand(3).getReg(); LLT SrcTy = MRI.getType(SrcReg); LLT InsTy = MRI.getType(InsReg); + (void)InsTy; assert(InsTy.getSizeInBits() == 64); @@ -2921,4 +2923,3 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { getOperandsMapping(OpdsMapping), MI.getNumOperands()); } -
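The final patch above is the standard idiom for values that exist only to feed asserts: under NDEBUG the assert disappears, the variable becomes unused, and -Wunused-variable fires, so a (void) cast marks it as deliberately consumed. A minimal illustration (hypothetical function and stand-in LLT type, not from the patch):

    #include <cassert>

    struct LLT { unsigned Bits; unsigned getSizeInBits() const { return Bits; } };

    unsigned lowHalf(LLT Ty, unsigned Value) {
      unsigned Size = Ty.getSizeInBits();
      (void)Size; // used only by the assert below; silences -Wunused-variable
      assert(Size == 64 && "expected a 64-bit type");
      return Value & 0xffffffffu;
    }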