Skip to content

Commit

Permalink
AMDGPU: Make vector_shuffle legal for v2i32 with v_pk_mov_b32 (#123684)
Browse files Browse the repository at this point in the history
For VALU shuffles, this saves an instruction in some cases.
  • Loading branch information
arsenm authored Jan 23, 2025
1 parent 92b839e commit e28e935
Show file tree
Hide file tree
Showing 17 changed files with 728 additions and 833 deletions.
119 changes: 119 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,95 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(), RegSeqArgs);
}

/// Select an ISD::VECTOR_SHUFFLE node.
///
/// Only shuffles that produce a two-element vector of 32-bit elements, on a
/// subtarget reporting hasPkMovB32(), are handled here; every other shuffle is
/// forwarded to SelectCode. A handled shuffle is turned into either a single
/// V_PK_MOV_B32 or a REG_SEQUENCE built from two subregister extracts.
void AMDGPUDAGToDAGISel::SelectVectorShuffle(SDNode *N) {
  EVT ResVT = N->getValueType(0);
  EVT ScalarVT = ResVT.getVectorElementType();

  // TODO: Handle 16-bit element vectors with even aligned masks.
  const bool IsHandledHere = Subtarget->hasPkMovB32() &&
                             ScalarVT.bitsEq(MVT::i32) &&
                             ResVT.getVectorNumElements() == 2;
  if (!IsHandledHere) {
    SelectCode(N);
    return;
  }

  SDLoc Loc(N);
  auto *Shuffle = cast<ShuffleVectorSDNode>(N);
  ArrayRef<int> ShufMask = Shuffle->getMask();
  SDValue FirstVec = Shuffle->getOperand(0);
  SDValue SecondVec = Shuffle->getOperand(1);

  assert(FirstVec.getValueType().getVectorNumElements() == 2 &&
         ShufMask.size() == 2 && ShufMask[0] < 4 && ShufMask[1] < 4);

  const int LoIdx = ShufMask[0];
  const int HiIdx = ShufMask[1];

  // Mask values below 2 pick the first operand, the rest pick the second. The
  // low bit of the mask value picks the 32-bit subregister inside that
  // operand.
  SDValue LoSrc = LoIdx < 2 ? FirstVec : SecondVec;
  SDValue HiSrc = HiIdx < 2 ? FirstVec : SecondVec;
  unsigned LoSubReg = (LoIdx & 1) ? AMDGPU::sub1 : AMDGPU::sub0;
  unsigned HiSubReg = (HiIdx & 1) ? AMDGPU::sub1 : AMDGPU::sub0;

  // A negative mask entry reads from an IMPLICIT_DEF instead of a real
  // operand, and borrows the subregister index chosen for the other lane.
  auto MakeImplicitDef = [&]() {
    MachineSDNode *Undef =
        CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, ResVT);
    return SDValue(Undef, 0);
  };

  if (LoIdx < 0) {
    LoSubReg = HiSubReg;
    LoSrc = MakeImplicitDef();
  }

  if (HiIdx < 0) {
    HiSubReg = LoSubReg;
    HiSrc = MakeImplicitDef();
  }

  // The SGPR (uniform) case has to be lowered to copies.
  //
  // Subregister extracts are also used whenever the registers can be blended
  // directly with a simple subregister copy, i.e. for every combination other
  // than "sub1 into the low lane, sub0 into the high lane".
  //
  // TODO: Maybe we should fold this out earlier
  const bool UsePkMov = N->isDivergent() && LoSubReg == AMDGPU::sub1 &&
                        HiSubReg == AMDGPU::sub0;
  if (!UsePkMov) {
    SDValue LoElt =
        CurDAG->getTargetExtractSubreg(LoSubReg, Loc, ScalarVT, LoSrc);
    SDValue HiElt =
        CurDAG->getTargetExtractSubreg(HiSubReg, Loc, ScalarVT, HiSrc);

    SDValue RegClass =
        CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, Loc, MVT::i32);
    SDValue Sub0Idx = CurDAG->getTargetConstant(AMDGPU::sub0, Loc, MVT::i32);
    SDValue Sub1Idx = CurDAG->getTargetConstant(AMDGPU::sub1, Loc, MVT::i32);

    const SDValue RegSeqOps[] = {RegClass, LoElt, Sub0Idx, HiElt, Sub1Idx};
    CurDAG->SelectNodeTo(N, TargetOpcode::REG_SEQUENCE, ResVT, RegSeqOps);
    return;
  }

  // v_pk_mov_b32 operand roles:
  //   - the low result element always comes from src0,
  //   - the high result element always comes from src1,
  //   - op_sel selects the high half of src0,
  //   - op_sel_hi selects the high half of src1.
  //
  // OP_SEL_1 (op_sel_hi) is set on both sources purely so that it is not
  // printed; this should have no effect on the result.
  const unsigned LoSrcMods =
      (LoSubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE) |
      SISrcMods::OP_SEL_1;
  const unsigned HiSrcMods =
      (HiSubReg == AMDGPU::sub1 ? SISrcMods::OP_SEL_0 : SISrcMods::NONE) |
      SISrcMods::OP_SEL_1;

  SDValue LoSrcModsVal = CurDAG->getTargetConstant(LoSrcMods, Loc, MVT::i32);
  SDValue HiSrcModsVal = CurDAG->getTargetConstant(HiSrcMods, Loc, MVT::i32);
  SDValue Zero = CurDAG->getTargetConstant(0, Loc, MVT::i32);

  const SDValue PkMovOps[] = {LoSrcModsVal, LoSrc, HiSrcModsVal, HiSrc,
                              Zero,  // clamp
                              Zero,  // op_sel
                              Zero,  // op_sel_hi
                              Zero,  // neg_lo
                              Zero}; // neg_hi
  CurDAG->SelectNodeTo(N, AMDGPU::V_PK_MOV_B32, N->getVTList(), PkMovOps);
}

void AMDGPUDAGToDAGISel::Select(SDNode *N) {
unsigned int Opc = N->getOpcode();
if (N->isMachineOpcode()) {
Expand Down Expand Up @@ -562,6 +651,9 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectBuildVector(N, RegClassID);
return;
}
case ISD::VECTOR_SHUFFLE:
SelectVectorShuffle(N);
return;
case ISD::BUILD_PAIR: {
SDValue RC, SubReg0, SubReg1;
SDLoc DL(N);
Expand Down Expand Up @@ -3101,6 +3193,33 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
}

Mods = VecMods;
} else if (Src.getOpcode() == ISD::VECTOR_SHUFFLE &&
Src.getNumOperands() == 2) {

// TODO: We should repeat the build_vector source check above for the
// vector_shuffle for negates and casts of individual elements.

auto *SVN = cast<ShuffleVectorSDNode>(Src);
ArrayRef<int> Mask = SVN->getMask();

if (Mask[0] < 2 && Mask[1] < 2) {
// src1 should be undef.
SDValue ShuffleSrc = SVN->getOperand(0);

if (ShuffleSrc.getOpcode() == ISD::FNEG) {
ShuffleSrc = ShuffleSrc.getOperand(0);
Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
}

if (Mask[0] == 1)
Mods |= SISrcMods::OP_SEL_0;
if (Mask[1] == 1)
Mods |= SISrcMods::OP_SEL_1;

Src = ShuffleSrc;
SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
return true;
}
}

// Packed instructions do not have abs modifiers.
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {

protected:
void SelectBuildVector(SDNode *N, unsigned RegClassID);
void SelectVectorShuffle(SDNode *N);

private:
std::pair<SDValue, SDValue> foldFrameIndex(SDValue N) const;
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Target/AMDGPU/SIISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
{MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32},
Expand);

if (Subtarget->hasPkMovB32()) {
// TODO: 16-bit element vectors should be legal with even aligned elements.
// TODO: Can be legal with wider source types than the result with
// subregister extracts.
setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
}

setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
Custom);

Expand Down
49 changes: 21 additions & 28 deletions llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v2f32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -171,31 +171,30 @@ define void @v_shuffle_v2f32_v2f32__3_0(ptr addrspace(1) inreg %ptr) {
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ; def v[2:3]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v4, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ; def v[2:3]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
Expand Down Expand Up @@ -274,27 +273,24 @@ define void @v_shuffle_v2f32_v2f32__3_2(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__3_2:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__3_2:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
Expand Down Expand Up @@ -447,27 +443,24 @@ define void @v_shuffle_v2f32_v2f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-LABEL: v_shuffle_v2f32_v2f32__1_0:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:1]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v2, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
; GFX940-LABEL: v_shuffle_v2f32_v2f32__1_0:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:1]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <2 x float> asm "; def $0", "=v"()
Expand Down
40 changes: 17 additions & 23 deletions llvm/test/CodeGen/AMDGPU/shufflevector.v2f32.v3f32.ll
Original file line number Diff line number Diff line change
Expand Up @@ -632,10 +632,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -645,10 +644,9 @@ define void @v_shuffle_v2f32_v3f32__1_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
Expand Down Expand Up @@ -765,13 +763,12 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[2:4]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v5, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v3
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v5, v[2:3], s[16:17]
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v5, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -786,9 +783,8 @@ define void @v_shuffle_v2f32_v3f32__4_0(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ; def v[2:4]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: s_nop 0
; GFX940-NEXT: v_mov_b32_e32 v2, v3
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v5, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v5, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
Expand Down Expand Up @@ -1480,10 +1476,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def v[0:2]
; GFX90A-NEXT: ;;#ASMEND
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, v1
; GFX90A-NEXT: v_mov_b32_e32 v3, v0
; GFX90A-NEXT: global_store_dwordx2 v4, v[2:3], s[16:17]
; GFX90A-NEXT: v_mov_b32_e32 v3, 0
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX90A-NEXT: global_store_dwordx2 v3, v[0:1], s[16:17]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: s_setpc_b64 s[30:31]
;
Expand All @@ -1493,10 +1488,9 @@ define void @v_shuffle_v2f32_v3f32__4_3(ptr addrspace(1) inreg %ptr) {
; GFX940-NEXT: ;;#ASMSTART
; GFX940-NEXT: ; def v[0:2]
; GFX940-NEXT: ;;#ASMEND
; GFX940-NEXT: v_mov_b32_e32 v4, 0
; GFX940-NEXT: v_mov_b32_e32 v2, v1
; GFX940-NEXT: v_mov_b32_e32 v3, v0
; GFX940-NEXT: global_store_dwordx2 v4, v[2:3], s[0:1] sc0 sc1
; GFX940-NEXT: v_mov_b32_e32 v3, 0
; GFX940-NEXT: v_pk_mov_b32 v[0:1], v[0:1], v[0:1] op_sel:[1,0]
; GFX940-NEXT: global_store_dwordx2 v3, v[0:1], s[0:1] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: s_setpc_b64 s[30:31]
%vec0 = call <3 x float> asm "; def $0", "=v"()
Expand Down
Loading

0 comments on commit e28e935

Please sign in to comment.