From e28e93550a74752714db6fffe50233aa96e536a5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 23 Jan 2025 20:58:02 +0700 Subject: [PATCH] AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32 (#123684) For VALU shuffles, this saves an instruction in some case. --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 119 ++++++++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 + .../AMDGPU/shufflevector.v2f32.v2f32.ll | 49 ++-- .../AMDGPU/shufflevector.v2f32.v3f32.ll | 40 ++- .../AMDGPU/shufflevector.v2f32.v4f32.ll | 84 +++--- .../AMDGPU/shufflevector.v2f32.v8f32.ll | 272 ++++++++---------- .../AMDGPU/shufflevector.v2i32.v2i32.ll | 49 ++-- .../AMDGPU/shufflevector.v2i32.v3i32.ll | 40 ++- .../AMDGPU/shufflevector.v2i32.v4i32.ll | 84 +++--- .../AMDGPU/shufflevector.v2i32.v8i32.ll | 272 ++++++++---------- .../CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll | 49 ++-- .../CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll | 40 ++- .../CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll | 84 +++--- .../CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll | 272 ++++++++---------- .../CodeGen/AMDGPU/vector_shuffle.packed.ll | 96 +++---- .../InferAddressSpaces/AMDGPU/flat_atomic.ll | 3 +- 17 files changed, 728 insertions(+), 833 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 6d5c3b5e0742..8e90754103ff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -489,6 +489,95 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) { CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs); } +void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) { + EVT VT = N->getValueType(0); + EVT EltVT = VT.getVectorElementType(); + + // TODO: Handle 16-bit element vectors with even aligned masks. 
+ if (!Subtarget->hasPkMovB32() || !EltVT.bitsEq(MVT::i32) || + VT.getVectorNumElements() != 2) { + SelectCode(N); + return; + } + + auto *SVN = cast<ShuffleVectorSDNode>(N); + + SDValue Src0 = SVN->getOperand(0); + SDValue Src1 = SVN->getOperand(1); + ArrayRef<int> Mask = SVN->getMask(); + SDLoc DL(N); + + assert(Src0.getValueType().getVectorNumElements() == 2 && Mask.size() == 2 && + Mask[0] < 4 && Mask[1] < 4); + + SDValue VSrc0 = Mask[0] < 2 ? Src0 : Src1; + SDValue VSrc1 = Mask[1] < 2 ? Src0 : Src1; + unsigned Src0SubReg = Mask[0] & 1 ? AMDGPU::sub1 : AMDGPU::sub0; + unsigned Src1SubReg = Mask[1] & 1 ? AMDGPU::sub1 : AMDGPU::sub0; + + if (Mask[0] < 0) { + Src0SubReg = Src1SubReg; + MachineSDNode *ImpDef = + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT); + VSrc0 = SDValue(ImpDef, 0); + } + + if (Mask[1] < 0) { + Src1SubReg = Src0SubReg; + MachineSDNode *ImpDef = + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT); + VSrc1 = SDValue(ImpDef, 0); + } + + // SGPR case needs to lower to copies. + // + // Also use subregister extract when we can directly blend the registers with + // a simple subregister copy. + // + // TODO: Maybe we should fold this out earlier + if (N->isDivergent() && Src0SubReg == AMDGPU::sub1 && + Src1SubReg == AMDGPU::sub0) { + // The low element of the result always comes from src0. + // The high element of the result always comes from src1. + // op_sel selects the high half of src0. + // op_sel_hi selects the high half of src1. + + unsigned Src0OpSel = + Src0SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + unsigned Src1OpSel = + Src1SubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE; + + // Enable op_sel_hi to avoid printing it. This should have no effect on the + // result.
+ Src0OpSel |= SISrcMods::OP_SEL_1; + Src1OpSel |= SISrcMods::OP_SEL_1; + + SDValue Src0OpSelVal = CurDAG->getTargetConstant(Src0OpSel, DL, MVT::i32); + SDValue Src1OpSelVal = CurDAG->getTargetConstant(Src1OpSel, DL, MVT::i32); + SDValue ZeroMods = CurDAG->getTargetConstant(0, DL, MVT::i32); + + CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(), + {Src0OpSelVal, VSrc0, Src1OpSelVal, VSrc1, + ZeroMods, // clamp + ZeroMods, // op_sel + ZeroMods, // op_sel_hi + ZeroMods, // neg_lo + ZeroMods}); // neg_hi + return; + } + + SDValue ResultElt0 = + CurDAG->getTargetExtractSubreg(Src0SubReg, DL, EltVT, VSrc0); + SDValue ResultElt1 = + CurDAG->getTargetExtractSubreg(Src1SubReg, DL, EltVT, VSrc1); + + const SDValue Ops[] = { + CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, DL, MVT::i32), + ResultElt0, CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32), + ResultElt1, CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32)}; + CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, VT, Ops); +} + void AMDGPUDAGToDAGISel::Select(SDNode *N) { unsigned int Opc = N->getOpcode(); if (N->isMachineOpcode()) { @@ -562,6 +651,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) { SelectBuildVector(N, RegClassID); return; } + case ISD::VECTOR_SHUFFLE: + SelectVectorShuffle(N); + return; case ISD::BUILD_PAIR: { SDValue RC, SubReg0, SubReg1; SDLoc DL(N); @@ -3101,6 +3193,33 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src, } Mods = VecMods; + } else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE && + Src.getNumOperands() == 2) { + + // TODO: We should repeat the build_vector source check above for the + // vector_shuffle for negates and casts of individual elements. + + auto *SVN = cast<ShuffleVectorSDNode>(Src); + ArrayRef<int> Mask = SVN->getMask(); + + if (Mask[0] < 2 && Mask[1] < 2) { + // src1 should be undef.
+ SDValue ShuffleSrc = SVN->getOperand(0); + + if (ShuffleSrc.getOpcode() == ISD::FNEG) { + ShuffleSrc = ShuffleSrc.getOperand(0); + Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI); + } + + if (Mask[0] == 1) + Mods |= SISrcMods::OP_SEL_0; + if (Mask[1] == 1) + Mods |= SISrcMods::OP_SEL_1; + + Src = ShuffleSrc; + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; + } } // Packed instructions do not have abs modifiers. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 7e61eb470622..7dcd208a9cdd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -86,6 +86,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { protected: void SelectBuildVector(SDNode *N, unsigned RegClassID); + void SelectVectorShuffle(SDNode *N); private: std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6cf5774fc53b..1aeca7f370aa 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -422,6 +422,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32}, Expand); + if (Subtarget->hasPkMovB32()) { + // TODO: 16-bit element vectors should be legal with even aligned elements. + // TODO: Can be legal with wider source types than the result with + // subregister extracts.
+ setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal); + } + setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16}, Custom); diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll index 3410b067fb5b..2f6ddc63cb3e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll @@ -171,15 +171,14 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -187,15 +186,15 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 
x float> asm "; def $0", "=v"() @@ -274,27 +273,24 @@ define void @v_shuffle_v2f32_v2f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() @@ -447,27 +443,24 @@ define void @v_shuffle_v2f32_v2f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 
v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_0: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll index 7edb6939f884..3d42e66eb865 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll @@ -632,10 +632,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -645,10 +644,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: 
v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -765,13 +763,12 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -786,9 +783,8 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() @@ -1480,10 +1476,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt 
vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,10 +1488,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x float> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll index ea02b31bff04..a312b40a99a8 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v4f32.ll @@ -335,13 +335,12 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -356,9 +355,8 @@ define void @v_shuffle_v2f32_v4f32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: 
global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -447,8 +445,7 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -463,8 +460,8 @@ define void @v_shuffle_v2f32_v4f32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -637,8 +634,7 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -650,8 +646,7 @@ define void @v_shuffle_v2f32_v4f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -809,9 +804,8 @@ define void 
@v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -822,9 +816,8 @@ define void @v_shuffle_v2f32_v4f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -984,13 +977,12 @@ define void @v_shuffle_v2f32_v4f32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1005,9 +997,8 @@ define void @v_shuffle_v2f32_v4f32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -1607,8 +1598,7 @@ define void @v_shuffle_v2f32_v4f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1620,8 +1610,7 @@ define void @v_shuffle_v2f32_v4f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1698,8 +1687,7 @@ define void @v_shuffle_v2f32_v4f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1714,8 +1702,8 @@ define void @v_shuffle_v2f32_v4f32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: 
s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -2331,9 +2319,8 @@ define void @v_shuffle_v2f32_v4f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2344,9 +2331,8 @@ define void @v_shuffle_v2f32_v4f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x float> asm "; def $0", "=v"() @@ -2898,8 +2884,7 @@ define void @v_shuffle_v2f32_v4f32__1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2914,8 +2899,8 @@ define void @v_shuffle_v2f32_v4f32__1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3005,8 +2990,7 @@ define void @v_shuffle_v2f32_v4f32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3021,8 +3005,8 @@ define void @v_shuffle_v2f32_v4f32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll index 0fc63853f63a..2568390d8d7a 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v8f32.ll @@ -659,13 +659,12 @@ define void @v_shuffle_v2f32_v8f32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v9 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -680,9 +679,8 @@ define void @v_shuffle_v2f32_v8f32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: 
; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v9 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -771,8 +769,7 @@ define void @v_shuffle_v2f32_v8f32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -787,8 +784,8 @@ define void @v_shuffle_v2f32_v8f32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -878,8 +875,7 @@ define void @v_shuffle_v2f32_v8f32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -894,8 +890,8 @@ define void @v_shuffle_v2f32_v8f32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: 
v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -985,8 +981,7 @@ define void @v_shuffle_v2f32_v8f32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v15 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1001,8 +996,8 @@ define void @v_shuffle_v2f32_v8f32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1351,8 +1346,7 @@ define void @v_shuffle_v2f32_v8f32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1364,8 +1358,7 @@ define void @v_shuffle_v2f32_v8f32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: 
global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1523,9 +1516,8 @@ define void @v_shuffle_v2f32_v8f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1536,9 +1528,8 @@ define void @v_shuffle_v2f32_v8f32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -1870,13 +1861,12 @@ define void @v_shuffle_v2f32_v8f32__9_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1891,9 +1881,8 @@ define void @v_shuffle_v2f32_v8f32__9_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; 
GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -1978,13 +1967,12 @@ define void @v_shuffle_v2f32_v8f32__11_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,9 +1987,8 @@ define void @v_shuffle_v2f32_v8f32__11_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -2086,13 +2073,12 @@ define void @v_shuffle_v2f32_v8f32__13_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; 
GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,9 +2093,8 @@ define void @v_shuffle_v2f32_v8f32__13_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -3089,8 +3074,7 @@ define void @v_shuffle_v2f32_v8f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3102,8 +3086,7 @@ define void @v_shuffle_v2f32_v8f32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3352,8 +3335,7 @@ define void @v_shuffle_v2f32_v8f32__9_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; 
GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3368,8 +3350,8 @@ define void @v_shuffle_v2f32_v8f32__9_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3459,8 +3441,7 @@ define void @v_shuffle_v2f32_v8f32__11_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3475,8 +3456,8 @@ define void @v_shuffle_v2f32_v8f32__11_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3566,8 +3547,7 @@ define void @v_shuffle_v2f32_v8f32__13_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] @@ -3582,8 +3562,8 @@ define void @v_shuffle_v2f32_v8f32__13_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4650,8 +4630,7 @@ define void @v_shuffle_v2f32_v8f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4663,8 +4642,7 @@ define void @v_shuffle_v2f32_v8f32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4827,8 +4805,7 @@ define void @v_shuffle_v2f32_v8f32__9_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4843,8 +4820,8 @@ define void @v_shuffle_v2f32_v8f32__9_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: 
;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4934,8 +4911,7 @@ define void @v_shuffle_v2f32_v8f32__11_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4950,8 +4926,8 @@ define void @v_shuffle_v2f32_v8f32__11_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5041,8 +5017,7 @@ define void @v_shuffle_v2f32_v8f32__13_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5057,8 +5032,8 @@ define void @v_shuffle_v2f32_v8f32__13_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] 
op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6211,8 +6186,7 @@ define void @v_shuffle_v2f32_v8f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6224,8 +6198,7 @@ define void @v_shuffle_v2f32_v8f32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6302,8 +6275,7 @@ define void @v_shuffle_v2f32_v8f32__9_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6318,8 +6290,8 @@ define void @v_shuffle_v2f32_v8f32__9_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6409,8 +6381,7 @@ define void 
@v_shuffle_v2f32_v8f32__11_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6425,8 +6396,8 @@ define void @v_shuffle_v2f32_v8f32__11_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6516,8 +6487,7 @@ define void @v_shuffle_v2f32_v8f32__13_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6532,8 +6502,8 @@ define void @v_shuffle_v2f32_v8f32__13_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7689,9 +7659,8 @@ define void @v_shuffle_v2f32_v8f32__9_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; 
GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7702,9 +7671,8 @@ define void @v_shuffle_v2f32_v8f32__9_8(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x float> asm "; def $0", "=v"() @@ -8816,8 +8784,7 @@ define void @v_shuffle_v2f32_v8f32__1_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8832,8 +8799,8 @@ define void @v_shuffle_v2f32_v8f32__1_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8923,8 +8890,7 @@ define void @v_shuffle_v2f32_v8f32__3_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8939,8 +8905,8 @@ define void @v_shuffle_v2f32_v8f32__3_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9030,8 +8996,7 @@ define void @v_shuffle_v2f32_v8f32__5_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9046,8 +9011,8 @@ define void @v_shuffle_v2f32_v8f32__5_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9137,8 +9102,7 @@ define void @v_shuffle_v2f32_v8f32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, 
v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9153,8 +9117,8 @@ define void @v_shuffle_v2f32_v8f32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9315,8 +9279,7 @@ define void @v_shuffle_v2f32_v8f32__11_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9328,8 +9291,7 @@ define void @v_shuffle_v2f32_v8f32__11_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10354,8 +10316,7 @@ define void @v_shuffle_v2f32_v8f32__1_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10370,8 +10331,8 @@ define void @v_shuffle_v2f32_v8f32__1_12(ptr addrspace(1) inreg 
%ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10461,8 +10422,7 @@ define void @v_shuffle_v2f32_v8f32__3_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10477,8 +10437,8 @@ define void @v_shuffle_v2f32_v8f32__3_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10568,8 +10528,7 @@ define void @v_shuffle_v2f32_v8f32__5_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10584,8 +10543,8 @@ define void @v_shuffle_v2f32_v8f32__5_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, 
v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10675,8 +10634,7 @@ define void @v_shuffle_v2f32_v8f32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10691,8 +10649,8 @@ define void @v_shuffle_v2f32_v8f32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10941,8 +10899,7 @@ define void @v_shuffle_v2f32_v8f32__13_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10954,8 +10911,7 @@ define void @v_shuffle_v2f32_v8f32__13_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11892,8 +11848,7 @@ define void @v_shuffle_v2f32_v8f32__1_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11908,8 +11863,8 @@ define void @v_shuffle_v2f32_v8f32__1_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11999,8 +11954,7 @@ define void @v_shuffle_v2f32_v8f32__3_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12015,8 +11969,8 @@ define void @v_shuffle_v2f32_v8f32__3_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12106,8 +12060,7 @@ define void @v_shuffle_v2f32_v8f32__5_14(ptr addrspace(1) inreg 
%ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12122,8 +12075,8 @@ define void @v_shuffle_v2f32_v8f32__5_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12213,8 +12166,7 @@ define void @v_shuffle_v2f32_v8f32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12229,8 +12181,8 @@ define void @v_shuffle_v2f32_v8f32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll index 632e8d2a32ba..2d27d7199ddf 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v2i32.ll @@ -171,15 +171,14 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -187,15 +186,15 @@ define void @v_shuffle_v2i32_v2i32__3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -274,27 +273,24 @@ define void @v_shuffle_v2i32_v2i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 
-; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2i32_v2i32__3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() @@ -447,27 +443,24 @@ define void @v_shuffle_v2i32_v2i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2i32_v2i32__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2i32_v2i32__1_0: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; 
GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll index fb6671ca7870..ea08df2e4f50 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v3i32.ll @@ -632,10 +632,9 @@ define void @v_shuffle_v2i32_v3i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -645,10 +644,9 @@ define void @v_shuffle_v2i32_v3i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -765,13 +763,12 @@ define void @v_shuffle_v2i32_v3i32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; 
def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -786,9 +783,8 @@ define void @v_shuffle_v2i32_v3i32__4_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() @@ -1480,10 +1476,9 @@ define void @v_shuffle_v2i32_v3i32__4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,10 +1488,9 @@ define void @v_shuffle_v2i32_v3i32__4_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; 
GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x i32> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll index b4051228a443..a2431d56ce2f 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v4i32.ll @@ -335,13 +335,12 @@ define void @v_shuffle_v2i32_v4i32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -356,9 +355,8 @@ define void @v_shuffle_v2i32_v4i32__7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -447,8 +445,7 @@ define void @v_shuffle_v2i32_v4i32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: 
v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -463,8 +460,8 @@ define void @v_shuffle_v2i32_v4i32__7_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -637,8 +634,7 @@ define void @v_shuffle_v2i32_v4i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -650,8 +646,7 @@ define void @v_shuffle_v2i32_v4i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -809,9 +804,8 @@ define void @v_shuffle_v2i32_v4i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, 
v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -822,9 +816,8 @@ define void @v_shuffle_v2i32_v4i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -984,13 +977,12 @@ define void @v_shuffle_v2i32_v4i32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1005,9 +997,8 @@ define void @v_shuffle_v2i32_v4i32__5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -1607,8 +1598,7 @@ define void @v_shuffle_v2i32_v4i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND 
; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1620,8 +1610,7 @@ define void @v_shuffle_v2i32_v4i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1698,8 +1687,7 @@ define void @v_shuffle_v2i32_v4i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1714,8 +1702,8 @@ define void @v_shuffle_v2i32_v4i32__5_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -2331,9 +2319,8 @@ define void @v_shuffle_v2i32_v4i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: 
v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2344,9 +2331,8 @@ define void @v_shuffle_v2i32_v4i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x i32> asm "; def $0", "=v"() @@ -2898,8 +2884,7 @@ define void @v_shuffle_v2i32_v4i32__1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2914,8 +2899,8 @@ define void @v_shuffle_v2i32_v4i32__1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3005,8 +2990,7 @@ define void @v_shuffle_v2i32_v4i32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 
v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3021,8 +3005,8 @@ define void @v_shuffle_v2i32_v4i32__3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll index 11d1b88a938f..83a51bc87ecc 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2i32.v8i32.ll @@ -659,13 +659,12 @@ define void @v_shuffle_v2i32_v8i32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v9 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -680,9 +679,8 @@ define void @v_shuffle_v2i32_v8i32__15_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v9 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 
s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -771,8 +769,7 @@ define void @v_shuffle_v2i32_v8i32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -787,8 +784,8 @@ define void @v_shuffle_v2i32_v8i32__15_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -878,8 +875,7 @@ define void @v_shuffle_v2i32_v8i32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -894,8 +890,8 @@ define void @v_shuffle_v2i32_v8i32__15_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -985,8 +981,7 @@ define void @v_shuffle_v2i32_v8i32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: 
;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v15 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1001,8 +996,8 @@ define void @v_shuffle_v2i32_v8i32__15_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1351,8 +1346,7 @@ define void @v_shuffle_v2i32_v8i32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1364,8 +1358,7 @@ define void @v_shuffle_v2i32_v8i32__15_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1523,9 +1516,8 @@ define void @v_shuffle_v2i32_v8i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: 
global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1536,9 +1528,8 @@ define void @v_shuffle_v2i32_v8i32__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -1870,13 +1861,12 @@ define void @v_shuffle_v2i32_v8i32__9_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1891,9 +1881,8 @@ define void @v_shuffle_v2i32_v8i32__9_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; 
def $0", "=v"() @@ -1978,13 +1967,12 @@ define void @v_shuffle_v2i32_v8i32__11_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,9 +1987,8 @@ define void @v_shuffle_v2i32_v8i32__11_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -2086,13 +2073,12 @@ define void @v_shuffle_v2i32_v8i32__13_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,9 +2093,8 @@ define void @v_shuffle_v2i32_v8i32__13_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; 
def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -3089,8 +3074,7 @@ define void @v_shuffle_v2i32_v8i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3102,8 +3086,7 @@ define void @v_shuffle_v2i32_v8i32__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3352,8 +3335,7 @@ define void @v_shuffle_v2i32_v8i32__9_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3368,8 +3350,8 @@ define void @v_shuffle_v2i32_v8i32__9_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; 
GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3459,8 +3441,7 @@ define void @v_shuffle_v2i32_v8i32__11_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3475,8 +3456,8 @@ define void @v_shuffle_v2i32_v8i32__11_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3566,8 +3547,7 @@ define void @v_shuffle_v2i32_v8i32__13_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3582,8 +3562,8 @@ define void @v_shuffle_v2i32_v8i32__13_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], 
s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4650,8 +4630,7 @@ define void @v_shuffle_v2i32_v8i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4663,8 +4642,7 @@ define void @v_shuffle_v2i32_v8i32__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4827,8 +4805,7 @@ define void @v_shuffle_v2i32_v8i32__9_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4843,8 +4820,8 @@ define void @v_shuffle_v2i32_v8i32__9_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4934,8 +4911,7 @@ define void @v_shuffle_v2i32_v8i32__11_4(ptr addrspace(1) inreg %ptr) { ; 
GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4950,8 +4926,8 @@ define void @v_shuffle_v2i32_v8i32__11_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5041,8 +5017,7 @@ define void @v_shuffle_v2i32_v8i32__13_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5057,8 +5032,8 @@ define void @v_shuffle_v2i32_v8i32__13_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6211,8 +6186,7 @@ define void @v_shuffle_v2i32_v8i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6224,8 +6198,7 @@ define void @v_shuffle_v2i32_v8i32__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6302,8 +6275,7 @@ define void @v_shuffle_v2i32_v8i32__9_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6318,8 +6290,8 @@ define void @v_shuffle_v2i32_v8i32__9_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6409,8 +6381,7 @@ define void @v_shuffle_v2i32_v8i32__11_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] 
@@ -6425,8 +6396,8 @@ define void @v_shuffle_v2i32_v8i32__11_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6516,8 +6487,7 @@ define void @v_shuffle_v2i32_v8i32__13_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6532,8 +6502,8 @@ define void @v_shuffle_v2i32_v8i32__13_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7689,9 +7659,8 @@ define void @v_shuffle_v2i32_v8i32__9_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7702,9 +7671,8 @@ define void @v_shuffle_v2i32_v8i32__9_8(ptr addrspace(1) inreg %ptr) { ; 
GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x i32> asm "; def $0", "=v"() @@ -8816,8 +8784,7 @@ define void @v_shuffle_v2i32_v8i32__1_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8832,8 +8799,8 @@ define void @v_shuffle_v2i32_v8i32__1_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8923,8 +8890,7 @@ define void @v_shuffle_v2i32_v8i32__3_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8939,8 +8905,8 @@ define void @v_shuffle_v2i32_v8i32__3_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND 
-; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9030,8 +8996,7 @@ define void @v_shuffle_v2i32_v8i32__5_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9046,8 +9011,8 @@ define void @v_shuffle_v2i32_v8i32__5_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9137,8 +9102,7 @@ define void @v_shuffle_v2i32_v8i32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9153,8 +9117,8 @@ define void @v_shuffle_v2i32_v8i32__7_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; 
GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9315,8 +9279,7 @@ define void @v_shuffle_v2i32_v8i32__11_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9328,8 +9291,7 @@ define void @v_shuffle_v2i32_v8i32__11_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10354,8 +10316,7 @@ define void @v_shuffle_v2i32_v8i32__1_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10370,8 +10331,8 @@ define void @v_shuffle_v2i32_v8i32__1_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10461,8 +10422,7 @@ define void 
@v_shuffle_v2i32_v8i32__3_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10477,8 +10437,8 @@ define void @v_shuffle_v2i32_v8i32__3_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10568,8 +10528,7 @@ define void @v_shuffle_v2i32_v8i32__5_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10584,8 +10543,8 @@ define void @v_shuffle_v2i32_v8i32__5_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10675,8 +10634,7 @@ define void @v_shuffle_v2i32_v8i32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10691,8 +10649,8 @@ define void @v_shuffle_v2i32_v8i32__7_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10941,8 +10899,7 @@ define void @v_shuffle_v2i32_v8i32__13_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10954,8 +10911,7 @@ define void @v_shuffle_v2i32_v8i32__13_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11892,8 +11848,7 @@ define void @v_shuffle_v2i32_v8i32__1_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, 
v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11908,8 +11863,8 @@ define void @v_shuffle_v2i32_v8i32__1_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11999,8 +11954,7 @@ define void @v_shuffle_v2i32_v8i32__3_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12015,8 +11969,8 @@ define void @v_shuffle_v2i32_v8i32__3_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12106,8 +12060,7 @@ define void @v_shuffle_v2i32_v8i32__5_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12122,8 +12075,8 @@ define void @v_shuffle_v2i32_v8i32__5_14(ptr 
addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12213,8 +12166,7 @@ define void @v_shuffle_v2i32_v8i32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12229,8 +12181,8 @@ define void @v_shuffle_v2i32_v8i32__7_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll index 2cb50e0493ae..6d5005a89983 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v2p3.ll @@ -171,15 +171,14 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[2:3] +; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def v[0:1] +; GFX90A-NEXT: ; def 
v[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -187,15 +186,15 @@ define void @v_shuffle_v2p3_v2p3__3_0(ptr addrspace(1) inreg %ptr) { ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[2:3] +; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART -; GFX940-NEXT: ; def v[0:1] +; GFX940-NEXT: ; def v[2:3] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -274,27 +273,24 @@ define void @v_shuffle_v2p3_v2p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__3_2: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2p3_v2p3__3_2: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -447,27 +443,24 @@ define void @v_shuffle_v2p3_v2p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-LABEL: v_shuffle_v2p3_v2p3__1_0: ; GFX90A: ; %bb.0: ; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; ; GFX940-LABEL: v_shuffle_v2p3_v2p3__1_0: ; GFX940: ; %bb.0: ; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX940-NEXT: v_mov_b32_e32 v4, 0 ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:1] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v2, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <2 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git 
a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll index b92fa40a2699..2c8f2952fd10 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v3p3.ll @@ -632,10 +632,9 @@ define void @v_shuffle_v2p3_v3p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -645,10 +644,9 @@ define void @v_shuffle_v2p3_v3p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -765,13 +763,12 @@ define void @v_shuffle_v2p3_v3p3__4_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v5, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:4] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v5, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; 
GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -786,9 +783,8 @@ define void @v_shuffle_v2p3_v3p3__4_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:4] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1480,10 +1476,9 @@ define void @v_shuffle_v2p3_v3p3__4_3(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:2] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_mov_b32_e32 v3, 0 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1493,10 +1488,9 @@ define void @v_shuffle_v2p3_v3p3__4_3(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[0:2] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_mov_b32_e32 v3, 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <3 x ptr addrspace(3)> asm "; def $0", "=v"() diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll 
b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll index 8080c22d7921..20abdd10f949 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v4p3.ll @@ -335,13 +335,12 @@ define void @v_shuffle_v2p3_v4p3__7_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -356,9 +355,8 @@ define void @v_shuffle_v2p3_v4p3__7_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -447,8 +445,7 @@ define void @v_shuffle_v2p3_v4p3__7_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -463,8 +460,8 @@ define void @v_shuffle_v2p3_v4p3__7_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] 
; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -637,8 +634,7 @@ define void @v_shuffle_v2p3_v4p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -650,8 +646,7 @@ define void @v_shuffle_v2p3_v4p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -809,9 +804,8 @@ define void @v_shuffle_v2p3_v4p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -822,9 +816,8 @@ define void @v_shuffle_v2p3_v4p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: 
global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -984,13 +977,12 @@ define void @v_shuffle_v2p3_v4p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v6, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v6, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v6, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1005,9 +997,8 @@ define void @v_shuffle_v2p3_v4p3__5_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v6, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1607,8 +1598,7 @@ define void @v_shuffle_v2p3_v4p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1620,8 +1610,7 @@ define void 
@v_shuffle_v2p3_v4p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1698,8 +1687,7 @@ define void @v_shuffle_v2p3_v4p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1714,8 +1702,8 @@ define void @v_shuffle_v2p3_v4p3__5_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -2331,9 +2319,8 @@ define void @v_shuffle_v2p3_v4p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:3] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v4, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2344,9 +2331,8 @@ define void @v_shuffle_v2p3_v4p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:3] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 
v4, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <4 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2898,8 +2884,7 @@ define void @v_shuffle_v2p3_v4p3__1_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:5] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v6, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -2914,8 +2899,8 @@ define void @v_shuffle_v2p3_v4p3__1_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:5] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3005,8 +2990,7 @@ define void @v_shuffle_v2p3_v4p3__3_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:7] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3021,8 +3005,8 @@ define void @v_shuffle_v2p3_v4p3__3_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:7] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; 
GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll index 02a5800ce189..df7bdbf04d4e 100644 --- a/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll +++ b/llvm/test/CodeGen/AMDGPU/shufflevector.v2p3.v8p3.ll @@ -659,13 +659,12 @@ define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v9 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -680,9 +679,8 @@ define void @v_shuffle_v2p3_v8p3__15_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v9 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -771,8 +769,7 @@ define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 
v[10:11], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -787,8 +784,8 @@ define void @v_shuffle_v2p3_v8p3__15_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -878,8 +875,7 @@ define void @v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -894,8 +890,8 @@ define void @v_shuffle_v2p3_v8p3__15_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -985,8 +981,7 @@ define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v15 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1001,8 +996,8 
@@ define void @v_shuffle_v2p3_v8p3__15_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v15 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[14:15], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1351,8 +1346,7 @@ define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -1364,8 +1358,7 @@ define void @v_shuffle_v2p3_v8p3__15_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -1523,9 +1516,8 @@ define void @v_shuffle_v2p3_v8p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1536,9 +1528,8 @@ define void @v_shuffle_v2p3_v8p3__1_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: 
;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1870,13 +1861,12 @@ define void @v_shuffle_v2p3_v8p3__9_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v3 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1891,9 +1881,8 @@ define void @v_shuffle_v2p3_v8p3__9_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v3 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -1978,13 +1967,12 @@ define void @v_shuffle_v2p3_v8p3__11_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; 
GFX90A-NEXT: v_mov_b32_e32 v2, v5 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -1999,9 +1987,8 @@ define void @v_shuffle_v2p3_v8p3__11_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v5 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -2086,13 +2073,12 @@ define void @v_shuffle_v2p3_v8p3__13_0(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND +; GFX90A-NEXT: v_mov_b32_e32 v10, 0 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v10, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v7 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v10, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -2107,9 +2093,8 @@ define void @v_shuffle_v2p3_v8p3__13_0(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: s_nop 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v7 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v10, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; 
GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -3089,8 +3074,7 @@ define void @v_shuffle_v2p3_v8p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3102,8 +3086,7 @@ define void @v_shuffle_v2p3_v8p3__3_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3352,8 +3335,7 @@ define void @v_shuffle_v2p3_v8p3__9_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3368,8 +3350,8 @@ define void @v_shuffle_v2p3_v8p3__9_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3459,8 +3441,7 @@ define void @v_shuffle_v2p3_v8p3__11_2(ptr 
addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3475,8 +3456,8 @@ define void @v_shuffle_v2p3_v8p3__11_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -3566,8 +3547,7 @@ define void @v_shuffle_v2p3_v8p3__13_2(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -3582,8 +3562,8 @@ define void @v_shuffle_v2p3_v8p3__13_2(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v2 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4650,8 +4630,7 @@ define void @v_shuffle_v2p3_v8p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: 
v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4663,8 +4642,7 @@ define void @v_shuffle_v2p3_v8p3__5_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4827,8 +4805,7 @@ define void @v_shuffle_v2p3_v8p3__9_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -4843,8 +4820,8 @@ define void @v_shuffle_v2p3_v8p3__9_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -4934,8 +4911,7 @@ define void @v_shuffle_v2p3_v8p3__11_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: 
s_setpc_b64 s[30:31] @@ -4950,8 +4926,8 @@ define void @v_shuffle_v2p3_v8p3__11_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -5041,8 +5017,7 @@ define void @v_shuffle_v2p3_v8p3__13_4(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -5057,8 +5032,8 @@ define void @v_shuffle_v2p3_v8p3__13_4(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v4 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6211,8 +6186,7 @@ define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6224,8 +6198,7 @@ define void @v_shuffle_v2p3_v8p3__7_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; 
GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6302,8 +6275,7 @@ define void @v_shuffle_v2p3_v8p3__9_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v9 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6318,8 +6290,8 @@ define void @v_shuffle_v2p3_v8p3__9_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v9 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[8:9], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6409,8 +6381,7 @@ define void @v_shuffle_v2p3_v8p3__11_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v11 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6425,8 +6396,8 @@ define void @v_shuffle_v2p3_v8p3__11_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v11 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[10:11], v[6:7] op_sel:[1,0] ; 
GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -6516,8 +6487,7 @@ define void @v_shuffle_v2p3_v8p3__13_6(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v13 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -6532,8 +6502,8 @@ define void @v_shuffle_v2p3_v8p3__13_6(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v1, v6 -; GFX940-NEXT: v_mov_b32_e32 v0, v13 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[12:13], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -7689,9 +7659,8 @@ define void @v_shuffle_v2p3_v8p3__9_8(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v2, v1 -; GFX90A-NEXT: v_mov_b32_e32 v3, v0 -; GFX90A-NEXT: global_store_dwordx2 v8, v[2:3], s[16:17] +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] ; @@ -7702,9 +7671,8 @@ define void @v_shuffle_v2p3_v8p3__9_8(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v2, v1 -; GFX940-NEXT: v_mov_b32_e32 v3, v0 -; GFX940-NEXT: global_store_dwordx2 v8, v[2:3], s[0:1] sc0 sc1 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0] +; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 
; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] %vec0 = call <8 x ptr addrspace(3)> asm "; def $0", "=v"() @@ -8816,8 +8784,7 @@ define void @v_shuffle_v2p3_v8p3__1_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8832,8 +8799,8 @@ define void @v_shuffle_v2p3_v8p3__1_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -8923,8 +8890,7 @@ define void @v_shuffle_v2p3_v8p3__3_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -8939,8 +8905,8 @@ define void @v_shuffle_v2p3_v8p3__3_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9030,8 +8996,7 @@ define void 
@v_shuffle_v2p3_v8p3__5_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9046,8 +9011,8 @@ define void @v_shuffle_v2p3_v8p3__5_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9137,8 +9102,7 @@ define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9153,8 +9117,8 @@ define void @v_shuffle_v2p3_v8p3__7_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -9315,8 +9279,7 @@ define void @v_shuffle_v2p3_v8p3__11_10(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: 
v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v2 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -9328,8 +9291,7 @@ define void @v_shuffle_v2p3_v8p3__11_10(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v2 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10354,8 +10316,7 @@ define void @v_shuffle_v2p3_v8p3__1_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v6 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10370,8 +10331,8 @@ define void @v_shuffle_v2p3_v8p3__1_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v6 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[6:7] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10461,8 +10422,7 @@ define void @v_shuffle_v2p3_v8p3__3_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; 
GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10477,8 +10437,8 @@ define void @v_shuffle_v2p3_v8p3__3_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10568,8 +10528,7 @@ define void @v_shuffle_v2p3_v8p3__5_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10584,8 +10543,8 @@ define void @v_shuffle_v2p3_v8p3__5_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10675,8 +10634,7 @@ define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10691,8 +10649,8 @@ define void @v_shuffle_v2p3_v8p3__7_12(ptr addrspace(1) inreg %ptr) { 
; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -10941,8 +10899,7 @@ define void @v_shuffle_v2p3_v8p3__13_12(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ; def v[0:7] ; GFX90A-NEXT: ;;#ASMEND ; GFX90A-NEXT: v_mov_b32_e32 v8, 0 -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v4 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v8, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -10954,8 +10911,7 @@ define void @v_shuffle_v2p3_v8p3__13_12(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ; def v[0:7] ; GFX940-NEXT: ;;#ASMEND ; GFX940-NEXT: v_mov_b32_e32 v8, 0 -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v4 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v8, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11892,8 +11848,7 @@ define void @v_shuffle_v2p3_v8p3__1_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[2:9] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v1 -; GFX90A-NEXT: v_mov_b32_e32 v1, v8 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v10, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -11908,8 +11863,8 @@ define void @v_shuffle_v2p3_v8p3__1_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[2:9] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v1 -; GFX940-NEXT: v_mov_b32_e32 v1, v8 +; GFX940-NEXT: s_nop 0 
+; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[8:9] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v10, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -11999,8 +11954,7 @@ define void @v_shuffle_v2p3_v8p3__3_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[4:11] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v3 -; GFX90A-NEXT: v_mov_b32_e32 v1, v10 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v12, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12015,8 +11969,8 @@ define void @v_shuffle_v2p3_v8p3__3_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[4:11] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v3 -; GFX940-NEXT: v_mov_b32_e32 v1, v10 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[10:11] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v12, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12106,8 +12060,7 @@ define void @v_shuffle_v2p3_v8p3__5_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[6:13] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v5 -; GFX90A-NEXT: v_mov_b32_e32 v1, v12 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v14, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12122,8 +12075,8 @@ define void @v_shuffle_v2p3_v8p3__5_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[6:13] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v5 -; GFX940-NEXT: v_mov_b32_e32 v1, v12 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[12:13] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v14, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt 
vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] @@ -12213,8 +12166,7 @@ define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) { ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def v[8:15] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_mov_b32_e32 v0, v7 -; GFX90A-NEXT: v_mov_b32_e32 v1, v14 +; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX90A-NEXT: global_store_dwordx2 v16, v[0:1], s[16:17] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_setpc_b64 s[30:31] @@ -12229,8 +12181,8 @@ define void @v_shuffle_v2p3_v8p3__7_14(ptr addrspace(1) inreg %ptr) { ; GFX940-NEXT: ;;#ASMSTART ; GFX940-NEXT: ; def v[8:15] ; GFX940-NEXT: ;;#ASMEND -; GFX940-NEXT: v_mov_b32_e32 v0, v7 -; GFX940-NEXT: v_mov_b32_e32 v1, v14 +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[6:7], v[14:15] op_sel:[1,0] ; GFX940-NEXT: global_store_dwordx2 v16, v[0:1], s[0:1] sc0 sc1 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll index e7ae9d831424..b85bd4c63466 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -4942,78 +4942,78 @@ define amdgpu_kernel void @fma_shuffle_v2bf16(ptr addrspace(1) nocapture readonl ; ; GFX940-LABEL: fma_shuffle_v2bf16: ; GFX940: ; %bb.0: ; %entry -; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX940-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 +; GFX940-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x10 ; GFX940-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v6, 3, v0 ; GFX940-NEXT: s_movk_i32 s2, 0x7fff ; GFX940-NEXT: s_mov_b32 s3, 0x7060302 ; GFX940-NEXT: s_waitcnt lgkmcnt(0) -; GFX940-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1] -; GFX940-NEXT: global_load_dwordx2 v[2:3], v6, s[8:9] +; GFX940-NEXT: global_load_dwordx2 v[0:1], v6, s[8:9] +; GFX940-NEXT: global_load_dwordx2 v[2:3], v6, s[0:1] ; GFX940-NEXT: 
global_load_dwordx2 v[4:5], v6, s[10:11] ; GFX940-NEXT: s_waitcnt vmcnt(2) -; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v7, 16, v0 ; GFX940-NEXT: s_waitcnt vmcnt(1) -; GFX940-NEXT: v_lshlrev_b32_e32 v8, 16, v2 +; GFX940-NEXT: v_and_b32_e32 v8, 0xffff0000, v2 ; GFX940-NEXT: s_waitcnt vmcnt(0) ; GFX940-NEXT: v_and_b32_e32 v9, 0xffff0000, v4 -; GFX940-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX940-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX940-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX940-NEXT: v_and_b32_e32 v11, 0xffff0000, v1 -; GFX940-NEXT: v_lshlrev_b32_e32 v12, 16, v3 -; GFX940-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX940-NEXT: v_fmac_f32_e32 v7, v8, v9 -; GFX940-NEXT: v_fmac_f32_e32 v0, v8, v4 -; GFX940-NEXT: v_fmac_f32_e32 v1, v12, v4 -; GFX940-NEXT: v_bfe_u32 v4, v7, 16, 1 -; GFX940-NEXT: v_fmac_f32_e32 v11, v12, v9 -; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v7 -; GFX940-NEXT: v_bfe_u32 v9, v0, 16, 1 -; GFX940-NEXT: v_add3_u32 v4, v4, v7, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v7, v7 -; GFX940-NEXT: v_or_b32_e32 v12, 0x400000, v0 -; GFX940-NEXT: v_bfe_u32 v13, v11, 16, 1 -; GFX940-NEXT: v_add3_u32 v9, v9, v0, s2 -; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 -; GFX940-NEXT: v_or_b32_e32 v14, 0x400000, v11 -; GFX940-NEXT: v_bfe_u32 v15, v1, 16, 1 -; GFX940-NEXT: v_add3_u32 v13, v13, v11, s2 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v9, v12, vcc -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v11, v11 -; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; GFX940-NEXT: v_lshlrev_b32_e32 v11, 16, v1 +; GFX940-NEXT: v_and_b32_e32 v12, 0xffff0000, v3 +; GFX940-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX940-NEXT: v_fmac_f32_e32 v8, v7, v9 +; GFX940-NEXT: v_fmac_f32_e32 v2, v7, v4 +; GFX940-NEXT: v_fmac_f32_e32 v3, v11, v4 +; GFX940-NEXT: v_bfe_u32 v4, v8, 16, 1 +; GFX940-NEXT: v_fmac_f32_e32 v12, v11, v9 +; GFX940-NEXT: v_or_b32_e32 v7, 0x400000, v8 +; GFX940-NEXT: v_bfe_u32 v9, v2, 16, 1 +; GFX940-NEXT: 
v_add3_u32 v4, v4, v8, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v8, v8 +; GFX940-NEXT: v_or_b32_e32 v11, 0x400000, v2 +; GFX940-NEXT: v_bfe_u32 v13, v12, 16, 1 +; GFX940-NEXT: v_add3_u32 v9, v9, v2, s2 +; GFX940-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 +; GFX940-NEXT: v_or_b32_e32 v14, 0x400000, v12 +; GFX940-NEXT: v_bfe_u32 v15, v3, 16, 1 +; GFX940-NEXT: v_add3_u32 v13, v13, v12, s2 +; GFX940-NEXT: v_cndmask_b32_e32 v2, v9, v11, vcc +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v12, v12 +; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX940-NEXT: v_lshlrev_b32_e32 v10, 16, v5 -; GFX940-NEXT: v_or_b32_e32 v16, 0x400000, v1 -; GFX940-NEXT: v_add3_u32 v15, v15, v1, s2 +; GFX940-NEXT: v_or_b32_e32 v16, 0x400000, v3 +; GFX940-NEXT: v_add3_u32 v15, v15, v3, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v7, v13, v14, vcc -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 -; GFX940-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 +; GFX940-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX940-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; GFX940-NEXT: v_cndmask_b32_e32 v1, v15, v16, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v3, v15, v16, vcc ; GFX940-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; GFX940-NEXT: v_fmac_f32_e32 v0, v2, v10 -; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; GFX940-NEXT: v_fmac_f32_e32 v2, v0, v10 ; GFX940-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 +; GFX940-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 ; GFX940-NEXT: v_and_b32_e32 v7, 0xffff0000, v7 -; GFX940-NEXT: v_fmac_f32_e32 v4, v2, v5 -; GFX940-NEXT: v_bfe_u32 v2, v0, 16, 1 -; GFX940-NEXT: v_fmac_f32_e32 v1, v3, v10 -; GFX940-NEXT: v_fmac_f32_e32 v7, v3, v5 -; GFX940-NEXT: v_or_b32_e32 v3, 0x400000, v0 +; GFX940-NEXT: v_fmac_f32_e32 v4, v0, v5 +; GFX940-NEXT: v_bfe_u32 v0, v2, 16, 1 +; GFX940-NEXT: v_fmac_f32_e32 v3, v1, v10 +; GFX940-NEXT: v_fmac_f32_e32 v7, v1, v5 +; GFX940-NEXT: v_or_b32_e32 v1, 0x400000, v2 ; GFX940-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX940-NEXT: 
v_add3_u32 v2, v2, v0, s2 -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; GFX940-NEXT: v_add3_u32 v0, v0, v2, s2 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v2, v2 ; GFX940-NEXT: v_or_b32_e32 v8, 0x400000, v4 -; GFX940-NEXT: v_bfe_u32 v9, v1, 16, 1 +; GFX940-NEXT: v_bfe_u32 v9, v3, 16, 1 ; GFX940-NEXT: v_add3_u32 v5, v5, v4, s2 -; GFX940-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GFX940-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v4, v4 -; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v1 +; GFX940-NEXT: v_or_b32_e32 v10, 0x400000, v3 ; GFX940-NEXT: v_bfe_u32 v11, v7, 16, 1 -; GFX940-NEXT: v_add3_u32 v9, v9, v1, s2 +; GFX940-NEXT: v_add3_u32 v9, v9, v3, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v2, v5, v8, vcc -; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v1, v1 +; GFX940-NEXT: v_cmp_u_f32_e32 vcc, v3, v3 ; GFX940-NEXT: v_or_b32_e32 v12, 0x400000, v7 ; GFX940-NEXT: v_add3_u32 v11, v11, v7, s2 ; GFX940-NEXT: v_cndmask_b32_e32 v1, v9, v10, vcc diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll index 6633cec659d8..39af91b81110 100644 --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll @@ -70,8 +70,7 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; CHECK-NEXT: s_mov_b64 s[6:7], exec ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s8 -; CHECK-NEXT: v_mov_b32_e32 v1, s9 +; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1] ; CHECK-NEXT: v_pk_mov_b32 v[2:3], s[2:3], s[2:3] op_sel:[0,1] ; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)