[AMDGPU] select v_sat_pk from two i16 or v2i16 #121124

Open · wants to merge 12 commits into base: main
49 changes: 49 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -315,6 +315,55 @@ def srl_16 : PatFrag<
(ops node:$src0), (srl_oneuse node:$src0, (i32 16))
>;

def clamp_s16_u8 : PatFrag<
(ops node:$src),
(i16 (AMDGPUsmed3 $src, (i16 0), (i16 255)))
>;

def conc_lo_u8_i16 : PatFrags<
Contributor: This is just one Pat; you can use PatFrag instead of PatFrags and avoid the list.

Contributor (author): I'll add another pattern of addition... sorry for the trouble, it's my first PR.

(ops node:$src0, node:$src1),
[
(or
(i16 $src0),
(shl (i16 $src1), (i16 8))
),
(or
(and (i16 $src0), (i16 255)),
(shl (i16 $src1), (i16 8))
)
]
>;
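
For reference on the PatFrag-vs-PatFrags exchange above: the single-Pat form the reviewer describes would look roughly like the sketch below, assuming only the unmasked or/shl variant needed to be matched (the PR keeps PatFrags since it matches more than one alternative). The name here is hypothetical and not part of the PR:

// Sketch only (not from the PR): single-alternative PatFrag form that
// matches just (or $src0, (shl $src1, 8)), without the masked variant.
def conc_lo_u8_i16_single : PatFrag<
  (ops node:$src0, node:$src1),
  (or (i16 $src0), (shl (i16 $src1), (i16 8)))
>;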

def clamp_v2i16_u8 : PatFrags<
(ops node:$src),
[
(v2i16 (smax (smin $src, (build_vector (i16 255), (i16 255))), (build_vector (i16 0), (i16 0)))),
(v2i16 (smin (smax $src, (build_vector (i16 0), (i16 0))), (build_vector (i16 255), (i16 255))))
]
>;

def conc_lo_v2i16_i16 : PatFrags<
Contributor: @arsenm Isn't matching bitwise ops fragile?

Wouldn't it be better to make v2i8 legal, address codegen regressions (maybe by handling it in TargetLowering/CC stuff as well), and then come back to this?

I'm afraid the patterns would constantly break whenever a new combine is added for bitwise ops like shift/and/or/etc. if we try to match v2i8 ops that are lowered as bitwise ops.

Contributor: Depends what you mean by "fragile", but for an optimization it doesn't require robustness. -100 for making v2i8 legal; that's a huge amount of effort for one operation.

(ops node:$src),
Contributor: These cases are stretching what should be done in patterns, and there are too many of them in one patch. Can you keep this to one pattern per patch? It's much harder to review the test coverage otherwise.

These are all implementing the same thing, so we should be canonicalizing to this form so you don't have as many variants to deal with. This is also implementing the same patterns as are matched for the truncating stores, which we should be trying to reuse.

[
(or
(i16 (trunc (i32 (bitconvert $src)))),
(shl
(i16 (trunc (srl (i32 (bitconvert $src)), (i32 16)))),
(i16 8)
)
),
(or
(and (i16 (trunc (i32 (bitconvert $src)))), (i16 255)),
(shl
(and
(i16 (trunc (srl (i32 (bitconvert $src)), (i32 16)))),
(i16 255)
),
(i16 8)
)
)
]
>;

def hi_i16_elt : PatFrag<
(ops node:$src0), (i16 (trunc (i32 (srl_16 node:$src0))))
33 changes: 32 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -10,7 +10,10 @@
// that are not yet supported remain commented out.
//===----------------------------------------------------------------------===//

class GCNPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl;
class GCNPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl, GISelFlags;

let GIIgnoreCopies = 1 in
class GCNPatIgnoreCopies<dag pattern, dag result> : GCNPat<pattern, result>;

class UniformSextInreg<ValueType VT> : PatFrag<
(ops node:$src),
@@ -3298,6 +3301,34 @@ def : GCNPat <
(v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
>;

multiclass V_SAT_PK_Pat<Instruction inst> {
def: GCNPatIgnoreCopies<
Contributor: Don't know why you specifically need to ignore copies here.

Contributor (author): Hi, for GlobalISel there would be some COPY MIR instructions within the pattern; this ignores the COPYs so that the pattern can still be matched.

Contributor: There are more hazards here and I'd rather leave that for a separate patch.

Contributor (author): Hi @arsenm, so should I do the following:

  1. not ignore copies here
  2. change it back for the GlobalISel cases

Or should I:

  1. not ignore copies here
  2. add the COPY into the pattern

Thanks a lot, and sorry for the late reply.

Contributor: Do not use GCNPatIgnoreCopies. Anything related to the GlobalISel handling should be done separately.

(i16 (conc_lo_u8_i16 (clamp_s16_u8 i16:$lo), (clamp_s16_u8 i16:$hi))),
(inst
(V_LSHL_OR_B32_e64 VRegSrc_32:$hi, (S_MOV_B32 (i32 16)),
Contributor: You don't need to materialize this constant; you can just use the inline immediate directly.

VRegSrc_32:$lo))
>;

def: GCNPatIgnoreCopies<
(i16 (conc_lo_u8_i16 (clamp_s16_u8 i16:$lo), (smax i16:$hi, (i16 0)))),
Contributor: I think there are missing hasOneUse checks throughout this.

Contributor (author): Hi @arsenm, may I ask why? If there are other uses of the (clamp_s16_u8 i16:$lo), the DAG is just not going to fold...

(inst
(V_LSHL_OR_B32_e64 VRegSrc_32:$hi, (S_MOV_B32 (i32 16)),
Contributor: Same here, you can directly use the inline immediate in the output.

Contributor: Suggested change:

-        (V_LSHL_OR_B32_e64 VRegSrc_32:$hi, (S_MOV_B32 (i32 16)),
+        (V_LSHL_OR_B32_e64 VRegSrc_32:$hi, (i32 16),

Same here, you can directly use the inline immediate in the output (and maybe drop the type annotation too).

VRegSrc_32:$lo))
>;

def: GCNPatIgnoreCopies<
(i16 (conc_lo_v2i16_i16 (clamp_v2i16_u8 v2i16:$src))),
(inst VRegSrc_32:$src)
>;
}
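
On the missing hasOneUse checks called out above, one possible way to express the restriction is a predicated fragment, modeled on the one-use fragments already in AMDGPUInstructions.td (for example the srl_oneuse used by srl_16 at the top of this diff). The sketch below is not part of the PR, the name is hypothetical, and where exactly the check belongs is still an open question in the review (GlobalISel would likely need its own predicate hook):

// Sketch only (not from the PR): a one-use variant of the clamp fragment,
// so the pattern only fires when the clamped value has no other users and
// folding it away cannot duplicate work.
def clamp_s16_u8_oneuse : PatFrag<
  (ops node:$src),
  (i16 (AMDGPUsmed3 $src, (i16 0), (i16 255))),
  [{ return N->hasOneUse(); }]
>;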

let OtherPredicates = [NotHasTrue16BitInsts] in {
defm : V_SAT_PK_Pat<V_SAT_PK_U8_I16_e64>;
} // End OtherPredicates = [NotHasTrue16BitInsts]
Contributor: Add a blank line between the two for readability.

Also, the // End comment is not really needed if the whole thing is just 3 lines IMO, so I'd remove it, but that's really a small nit.

let True16Predicate = UseFakeTrue16Insts in {
defm : V_SAT_PK_Pat<V_SAT_PK_U8_I16_fake16_e64>;
}

// With multiple uses of the shift, this will duplicate the shift and
// increase register pressure.
def : GCNPat <