-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] selecting v_sat_pk instruction, version 2 #123297
base: main
Are you sure you want to change the base?
Changes from 9 commits
5844fd4
8505185
506ccd3
7fce2b2
424938f
88e52c1
1dc8b9c
c5e3e65
97755d2
c99c42a
3cbf7aa
7b166f9
f0e5101
bb6edd1
b3147a8
28d2560
17b2a49
64e2125
e7f3a17
6e16e60
c78b5d8
81ae12d
9a7d148
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -461,6 +461,7 @@ enum NodeType : unsigned { | |
FMED3, | ||
SMED3, | ||
UMED3, | ||
SAT_PK_CAST, | ||
FMAXIMUM3, | ||
FMINIMUM3, | ||
FDOT2, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -332,6 +332,9 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp, | |
[] | ||
>; | ||
|
||
// Node for v_sat_pk: yields the packed result as a legal i16 (from a v2i16 source) instead of the illegal v2i8 | ||
def AMDGPUsat_pk_cast : SDNode<"AMDGPUISD::SAT_PK_CAST", SDTUnaryOp, []>; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to document what this is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Explain what the node is, not just to avoid v2i8. It's to pack a v2i18 into i16 |
||
|
||
def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>; | ||
|
||
def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2", | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -816,6 +816,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, | |
{MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32}, | ||
Custom); | ||
} | ||
|
||
// Only take this path when the subtarget is not using real True16 instructions | ||
if (!Subtarget->hasTrue16BitInsts() || !Subtarget->useRealTrue16Insts()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Needs predicate for has the instruction |
||
// MVT::v2i16 for src type check in foldToSaturated | ||
// MVT::v2i8 for dst type check in CustomLowerNode | ||
setOperationAction(ISD::TRUNCATE_SSAT_U, {MVT::v2i16, MVT::v2i8}, Custom); | ||
} | ||
} | ||
|
||
setOperationAction({ISD::FNEG, ISD::FABS}, MVT::v4f16, Custom); | ||
|
@@ -1975,6 +1982,10 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const { | |
if (VT == MVT::i1 && Op == ISD::SETCC) | ||
return false; | ||
|
||
// v2i8 is illegal and only allowed in specific cases | ||
if (VT == MVT::v2i8 && Op == ISD::TRUNCATE_SSAT_U) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I wouldn't expect to need this, but this also should check the operation is actually available if it is actually needed. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @arsenm , I think it is needed, because the dst type has to be v2i8, otherwise we do not have instructions to select (since this is specially made for v_sat_pk). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The combiner checks isOperationLegalOrCustom though. Plus this wouldn't be a check specific to this vector size. We would want wider vectors split into pieces. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @arsenm , could you be more specific? Do you mean that we should remove the check here, and try to handle different dst types of truncate_ssat_u later in ReplaceNodeResults? Or should I accept any vNi8, and:
while N is any even integer? |
||
return true; | ||
|
||
return TargetLowering::isTypeDesirableForOp(Op, VT); | ||
} | ||
|
||
|
@@ -6605,6 +6616,14 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, | |
Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG)); | ||
break; | ||
} | ||
case ISD::TRUNCATE_SSAT_U: { | ||
SDLoc SL(N); | ||
SDValue Op = | ||
DAG.getNode(AMDGPUISD::SAT_PK_CAST, SL, MVT::i16, N->getOperand(0)); | ||
Op = DAG.getNode(ISD::BITCAST, SL, MVT::v2i8, Op); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't hardcode the v2i8, use the type from N |
||
Results.push_back(Op); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You need to insert a cast to the original type, does this not assert as-is? |
||
break; | ||
} | ||
default: | ||
AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG); | ||
break; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3324,6 +3324,21 @@ def : GCNPat < | |
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0)))) | ||
>; | ||
|
||
multiclass V_SAT_PK_Pat<Instruction inst> { | ||
def : GCNPat< | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This pattern isn't doing much, you should be able to pass the node to the SDNodeOperator argument to the instruction definition There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @arsenm , could you be more specific? Should I use other type of pattern? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The node is basically the same as the instruction definition, so you should be able to use the built-in pattern attached to the instruction def. in VOP1Instructions.td: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @arsenm , by adding the node I got the following:
I think there are 2 problems:
If the instruction cannot cover every type of (i16 (AMDGPUsat_pk_cast v2i8)), we risk failing instruction selection. I also tried to create a new VOP_I16_V2I16 type, but it makes V_SAT_PK_U8_I16_e64 and V_SAT_PK_U8_I16_fake16_e64 4-operand instructions (with modifier, clamp and opsel). I think in order to make passing the node work, I need to modify the related complex pattern functions and replace (v2i8 (truncssat_u v2i16)) with some patterns that can fit the complex pattern functions |
||
(i16 (AMDGPUsat_pk_cast v2i16:$src)), | ||
(inst VRegSrc_32:$src) | ||
>; | ||
} | ||
|
||
let OtherPredicates = [NotHasTrue16BitInsts] in { | ||
defm : V_SAT_PK_Pat<V_SAT_PK_U8_I16_e64>; | ||
} // End OtherPredicates = [NotHasTrue16BitInsts] | ||
|
||
let True16Predicate = UseFakeTrue16Insts in { | ||
defm : V_SAT_PK_Pat<V_SAT_PK_U8_I16_fake16_e64>; | ||
} // End True16Predicate = UseFakeTrue16Insts | ||
|
||
// With multiple uses of the shift, this will duplicate the shift and | ||
// increase register pressure. | ||
def : GCNPat < | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX11 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12 %s | ||
|
||
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX11 %s | ||
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12 %s | ||
|
||
; <GFX9 has no V_SAT_PK, GFX9+ has V_SAT_PK, GFX11 has V_SAT_PK with t16 | ||
|
@@ -815,15 +815,15 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { | |
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GFX11-LABEL: basic_smax_smin_bit_or: | ||
; GFX11: ; %bb.0: | ||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff | ||
; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; SDAG-GFX11-LABEL: basic_smax_smin_bit_or: | ||
; SDAG-GFX11: ; %bb.0: | ||
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; SDAG-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff | ||
; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff | ||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; SDAG-GFX12-LABEL: basic_smax_smin_bit_or: | ||
; SDAG-GFX12: ; %bb.0: | ||
|
@@ -860,6 +860,16 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { | |
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-GFX11-LABEL: basic_smax_smin_bit_or: | ||
; GISEL-GFX11: ; %bb.0: | ||
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GISEL-GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff | ||
; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff | ||
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-GFX12-LABEL: basic_smax_smin_bit_or: | ||
; GISEL-GFX12: ; %bb.0: | ||
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 | ||
|
@@ -873,6 +883,15 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { | |
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] | ||
; GFX11-LABEL: basic_smax_smin_bit_or: | ||
; GFX11: ; %bb.0: | ||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GFX11-NEXT: v_med3_i16 v1, v1, 0, 0xff | ||
; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GFX11-NEXT: s_setpc_b64 s[30:31] | ||
|
||
%src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) | ||
%src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) | ||
|
@@ -902,15 +921,15 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { | |
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GFX11-LABEL: basic_umax_umin_bit_or: | ||
; GFX11: ; %bb.0: | ||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GFX11-NEXT: v_min_u16 v1, 0xff, v1 | ||
; GFX11-NEXT: v_min_u16 v0, 0xff, v0 | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; SDAG-GFX11-LABEL: basic_umax_umin_bit_or: | ||
; SDAG-GFX11: ; %bb.0: | ||
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; SDAG-GFX11-NEXT: v_min_u16 v1, 0xff, v1 | ||
; SDAG-GFX11-NEXT: v_min_u16 v0, 0xff, v0 | ||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; SDAG-GFX12-LABEL: basic_umax_umin_bit_or: | ||
; SDAG-GFX12: ; %bb.0: | ||
|
@@ -944,6 +963,16 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { | |
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-GFX11-LABEL: basic_umax_umin_bit_or: | ||
; GISEL-GFX11: ; %bb.0: | ||
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GISEL-GFX11-NEXT: v_min_u16 v1, 0xff, v1 | ||
; GISEL-GFX11-NEXT: v_min_u16 v0, 0xff, v0 | ||
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-GFX12-LABEL: basic_umax_umin_bit_or: | ||
; GISEL-GFX12: ; %bb.0: | ||
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 | ||
|
@@ -957,6 +986,15 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { | |
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] | ||
; GFX11-LABEL: basic_umax_umin_bit_or: | ||
; GFX11: ; %bb.0: | ||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GFX11-NEXT: v_min_u16 v1, 0xff, v1 | ||
; GFX11-NEXT: v_min_u16 v0, 0xff, v0 | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GFX11-NEXT: s_setpc_b64 s[30:31] | ||
|
||
%src0.max = call i16 @llvm.umax.i16(i16 %src0, i16 0) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should test these patterns in more vector types, at least 2 x, 3 x and 4 x cases. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @arsenm , currently the pattern only works for a v2i8 result type. Or maybe for the N x i8 case we could split it into N/2 v2i8 pieces? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should still have baseline tests with the multiples. Yes, ideally we would take multiples and split them |
||
%src0.clamp = call i16 @llvm.umin.i16(i16 %src0.max, i16 255) | ||
|
@@ -1093,15 +1131,15 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { | |
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GFX11-LABEL: basic_smax_smin_bit_shl: | ||
; GFX11: ; %bb.0: | ||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GFX11-NEXT: v_max_i16 v1, v1, 0 | ||
; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; SDAG-GFX11-LABEL: basic_smax_smin_bit_shl: | ||
; SDAG-GFX11: ; %bb.0: | ||
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; SDAG-GFX11-NEXT: v_max_i16 v1, v1, 0 | ||
; SDAG-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff | ||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; SDAG-GFX12-LABEL: basic_smax_smin_bit_shl: | ||
; SDAG-GFX12: ; %bb.0: | ||
|
@@ -1137,6 +1175,16 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { | |
; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-GFX11-LABEL: basic_smax_smin_bit_shl: | ||
; GISEL-GFX11: ; %bb.0: | ||
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GISEL-GFX11-NEXT: v_max_i16 v1, v1, 0 | ||
; GISEL-GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff | ||
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GISEL-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-GFX12-LABEL: basic_smax_smin_bit_shl: | ||
; GISEL-GFX12: ; %bb.0: | ||
; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 | ||
|
@@ -1150,6 +1198,15 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { | |
; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] | ||
; GFX11-LABEL: basic_smax_smin_bit_shl: | ||
; GFX11: ; %bb.0: | ||
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; GFX11-NEXT: v_max_i16 v1, v1, 0 | ||
; GFX11-NEXT: v_med3_i16 v0, v0, 0, 0xff | ||
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; GFX11-NEXT: s_setpc_b64 s[30:31] | ||
|
||
%src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) | ||
%src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) | ||
|
@@ -1174,24 +1231,13 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { | |
; SDAG-GFX9-LABEL: basic_smax_smin_vec_input: | ||
; SDAG-GFX9: ; %bb.0: | ||
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff | ||
; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] | ||
; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 | ||
; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 | ||
; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 | ||
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 | ||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; SDAG-GFX11-LABEL: basic_smax_smin_vec_input: | ||
; SDAG-GFX11: ; %bb.0: | ||
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] | ||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 | ||
; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 | ||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 | ||
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; SDAG-GFX12-LABEL: basic_smax_smin_vec_input: | ||
|
@@ -1201,13 +1247,7 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { | |
; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 | ||
; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 | ||
; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 | ||
; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] | ||
; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 | ||
; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 | ||
; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 | ||
; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-VI-LABEL: basic_smax_smin_vec_input: | ||
|
@@ -1290,24 +1330,13 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { | |
; SDAG-GFX9-LABEL: basic_smax_smin_vec_input_rev: | ||
; SDAG-GFX9: ; %bb.0: | ||
; SDAG-GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; SDAG-GFX9-NEXT: v_pk_max_i16 v0, v0, 0 | ||
; SDAG-GFX9-NEXT: s_movk_i32 s4, 0xff | ||
; SDAG-GFX9-NEXT: v_pk_min_i16 v0, v0, s4 op_sel_hi:[1,0] | ||
; SDAG-GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0 | ||
; SDAG-GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 | ||
; SDAG-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX9-NEXT: v_sat_pk_u8_i16_e32 v0, v0 | ||
; SDAG-GFX9-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; SDAG-GFX11-LABEL: basic_smax_smin_vec_input_rev: | ||
; SDAG-GFX11: ; %bb.0: | ||
; SDAG-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) | ||
; SDAG-GFX11-NEXT: v_pk_max_i16 v0, v0, 0 | ||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX11-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] | ||
; SDAG-GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 | ||
; SDAG-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX11-NEXT: v_sat_pk_u8_i16_e32 v0, v0 | ||
; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; SDAG-GFX12-LABEL: basic_smax_smin_vec_input_rev: | ||
|
@@ -1317,13 +1346,7 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { | |
; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 | ||
; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 | ||
; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 | ||
; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 | ||
; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] | ||
; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 | ||
; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) | ||
; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 | ||
; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 | ||
; SDAG-GFX12-NEXT: v_sat_pk_u8_i16_e32 v0, v0 | ||
; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-VI-LABEL: basic_smax_smin_vec_input_rev: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we need this extra node, can't we just select from
TRUNC_SSAT_U
directly? Is it because it gets transformed/lost otherwise?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is for printing and dumping, without this the debug dump will show unknown node
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because the type signature is different. This is forcing the pack to use a legal integer type instead of v2i8