MetalPerformancePrimitives iOS xcode26.1 b2
# MetalPerformancePrimitives.framework
diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h 2025-09-08 05:44:05
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h 2025-09-26 16:15:07
@@ -8,22 +8,36 @@
// C = A*B + C;
// A and B can be tensor_handle, tensor_offset, and tensor_inline.
// C can be tensor_handle, tensor_offset, tensor_inline or cooperative_tensor.
-// Data type combinations supported by this operation are as follows
-// A B C
+// Data type combinations supported by this operation are as follows:
+//
+// A B C
+// ---------------------------
+// half half half
+// half int8_t half
+// int8_t half half
+// half half float
+// half float float
+// half int8_t float
+// float half float
+// float float float
+// float int8_t float
+// int8_t half float
+// int8_t float float
// int8_t int8_t int32_t
-// int8_t int8_t float
-// int8_t int8_t half
-// uint8_t int8_t int32_t
-// uint8_t int8_t float
-// uint8_t int8_t half
-// int8_t uint8_t int32_t
-// int8_t uint8_t float
-// int8_t uint8_t half
-// uint8_t uint8_t int32_t
-// uint8_t uint8_t float
-// uint8_t uint8_t half
-// half half float
-// half half half
+// bfloat bfloat bfloat
+// bfloat bfloat float
+// bfloat float float
+// bfloat int8_t bfloat
+// bfloat int8_t float
+// float bfloat float
+// int8_t bfloat bfloat
+// int8_t bfloat float
+// bfloat half bfloat
+// bfloat half half
+// bfloat half float
+// half bfloat bfloat
+// half bfloat half
+// half bfloat float
//
// Basic usage is in the following example which takes M x K matrix A of type
// half, K x N matrix B of type half, both in device memory and produces M x N
@@ -43,72 +57,43 @@
// [encoder dispatchThreadgroups:threadgroups
//                              threadsPerThreadgroup:MTLSizeMake(simdgroupWidth*4, 1, 1)];
//
-// kernel void simpleMatMul(tensor<device half, dextents<int32_t, 2>,
-// tensor_handle> A,
-// tensor<device half, dextents<int32_t, 2>,
-// tensor_handle> B, tensor<device float,
-// dextents<int32_t, 2>, tensor_handle> C, constant
-// uint& M, constant uint& N, constant uint& K, uint2
-// tgid [[threadgroup_position_in_grid]])
+// kernel void simpleMatMul(tensor<device half, dextents<int32_t, 2>> A,
+// tensor<device half, dextents<int32_t, 2>> B,
+// tensor<device float, dextents<int32_t, 2>> C,
+// constant uint& M, constant uint& N, constant uint& K,
+// uint2 tgid [[threadgroup_position_in_grid]])
// {
// // descriptor to create matmul operation that does 64x32 times 32x32
-// producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
-// //m outer dim of local tile
-// 32, //n outer dim
-// of local tile
-// 0, //k inner
-// dimension. 0 means
-// //operation
-// will read K
-// from
-// //input tensor
-// //K =
-// A.extents().extent(0)
-// or
-// B.extents().extent(1)
-// for NN
-// //K =
-// A.extents().extent(0)
-// or
-// B.extents().extent(0)
-// for NT
-// //and so on..
-// false, //transpse_left
-// = false for NN and NT
-// and true for TN and TT
-// false,
-// //transpse_right =
-// false for NN and TN
-// and true for NT and TT
-// false,
-// //relaxed_precision =
-// false, set it to true
-// to allow
-// implementation
-// //sacrifice
-// accurancy for
-// performance.
-// );
+// // producing 64x32
+// constexpr auto matmulDescriptor =
+// matmul2d_descriptor(64, // m outer dim of local tile
+// 32, // n outer dim of local tile
+// static_cast<int>(dynamic_extent), // k inner dimension. dynamic_extent means operation will read K from input tensor
+// // K = A.extents().extent(0) or B.extents().extent(1) for NN
+// // K = A.extents().extent(0) or B.extents().extent(0) for NT
+// // and so on..
+//                         false, // transpose_left = false for NN and NT and true for TN and TT
+//                         false, // transpose_right = false for NN and TN and true for NT and TT
+//                         false); // relaxed_precision = false, set it to true to allow the implementation
+//                                 // to sacrifice accuracy for performance.
//
// // create matmul op from above descriptor with 4 SIMD-Groups. All 4
-// SIMD-Groups in this threadgroup will execute this
+// // SIMD-Groups in this threadgroup will execute this
// // matmul cooperatively. More on this scope below.
-// matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+// matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
//
// // Following three lines of code create appropriate slice for this thread
-// group to work on.
-// // E.g. A.offset below creates a tensor<device half, dextents<int32_t, 2>,
-// tensor_offset>
+// // group to work on. E.g. A.slice below creates a
+// // tensor<device half, dextents<int32_t, 2>, tensor_offset>
// // which has same extents as original tensor A but origin shifted to
-// (0,tgid.y*64) i.e.
-// // mA[x,y] == A[x,tgid.y*64+y]
+// // (0,tgid.y*64) i.e. mA[x,y] == A[x,tgid.y*64+y]
//
-// auto mA = A.offset(0, tgid.y*64);
-// auto mB = B.offset(tgid.x*32, 0);
-// auto mC = C.offset(tgid.x*32, tgid.y*64);
+// auto mA = A.slice(0, tgid.y*64);
+// auto mB = B.slice(tgid.x*32, 0);
+// auto mC = C.slice(tgid.x*32, tgid.y*64);
//
-// // execute the operation. Assumes C is is initialized to zero.
-// op.run(mA, mB, mC);
+//     // execute the operation. Assumes C is initialized to zero.
+//     matmulOp.run(mA, mB, mC);
// }
//
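For orientation, here is the example above assembled into a single compilable sketch. The umbrella header name and the `mpp::tensor_ops` namespace are assumptions (neither is shown in this hunk); the descriptor, `execution_simdgroups<4>`, `slice`, and `run` come straight from the comment.

```c++
// A minimal sketch of the basic kernel, assuming the framework umbrella
// header and an mpp::tensor_ops namespace (neither is shown in this hunk).
#include <metal_tensor>
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
using namespace metal;
using namespace mpp::tensor_ops;

kernel void simpleMatMul(tensor<device half, dextents<int32_t, 2>> A,
                         tensor<device half, dextents<int32_t, 2>> B,
                         tensor<device float, dextents<int32_t, 2>> C,
                         uint2 tgid [[threadgroup_position_in_grid]])
{
    // 64x32 output tile; dynamic_extent defers K to the tensor extents.
    constexpr auto desc =
        matmul2d_descriptor(64, 32, static_cast<int>(dynamic_extent));
    matmul2d<desc, execution_simdgroups<4>> matmulOp;

    auto mA = A.slice(0, tgid.y * 64);            // 64 rows of A
    auto mB = B.slice(tgid.x * 32, 0);            // 32 columns of B
    auto mC = C.slice(tgid.x * 32, tgid.y * 64);  // 64x32 tile of C

    matmulOp.run(mA, mB, mC); // C = A*B + C, so C must start zeroed
}
```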
// Above matrix multiplication implementation will do edge checking for all
@@ -117,23 +102,19 @@
// bounds check. In high performance code we can avoid edge checking for inside
// thread groups and get better performance
//
-// kernel void matMul(tensor<device half, dextents<int32_t, 2>, tensor_handle>
-// A,
-// tensor<device half, dextents<int32_t, 2>, tensor_handle>
-// B, tensor<device float, dextents<int32_t, 2>,
-// tensor_handle> C, constant uint& M, constant uint& N,
-// constant uint& K, uint2 tgid
-// [[threadgroup_position_in_grid]])
+// kernel void matMul(tensor<device half, dextents<int32_t, 2>> A,
+// tensor<device half, dextents<int32_t, 2>> B,
+// tensor<device float, dextents<int32_t, 2>> C,
+// constant uint& M, constant uint& N,
+// constant uint& K, uint2 tgid [[threadgroup_position_in_grid]])
// {
// // descriptor to create matmul operation that does 64x32 times 32x32
-// producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
+// // producing 64x32
+// constexpr auto matmulDescriptor = matmul2d_descriptor(64,
// 32,
-// 0,
-// false,
-// false,
-// false);
+// static_cast<int>(dynamic_extent));
//
-// matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+// matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
//
//   // Inside threadgroup in both outer dimensions M and N.
// if ( tgid.x*64 + 63 < M && tgid.y*32 + 31 < N)
@@ -146,41 +127,32 @@
// }
// else
// {
-// auto tA = A.offset(0, tgid.y*64);
-// auto tB = B.offset(tgid.x*32, 0);
-// auto tC = C.offset(tgid.x*32, tgid.y*64);
+// auto tA = A.slice(0, tgid.y*64);
+// auto tB = B.slice(tgid.x*32, 0);
+// auto tC = C.slice(tgid.x*32, tgid.y*64);
//
// op.run(tA, tB, tC);
// }
// }
//
-// User can also take ownership of looping over in reduction or k-dimension by
+// User can also take ownership of looping over reduction or k-dimension by
// choosing appropriate chunk size in k (called k-tile or tilek). For the following
-// example, we choose 16. kernel void matMulKLoop(tensor<device half,
-// dextents<int32_t, 2>, tensor_handle> A,
-// tensor<device half, dextents<int32_t, 2>,
-// tensor_handle> B, tensor<device float,
-// dextents<int32_t, 2>, tensor_handle> C, constant
-// uint& M, constant uint& N, constant uint& K, uint2
-// tgid [[threadgroup_position_in_grid]])
+// example, we choose 16.
+// kernel void matMulKLoop(tensor<device half, dextents<int32_t, 2>> A,
+// tensor<device half, dextents<int32_t, 2>> B,
+// tensor<device float, dextents<int32_t, 2>> C,
+// constant uint& M, constant uint& N, constant uint& K,
+// uint2 tgid [[threadgroup_position_in_grid]])
// {
// // descriptor to create matmul operation that does 64x32 times 32x32
-// producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
+// // producing 64x32
+// constexpr auto matmulDescriptor = matmul2d_descriptor(64,
// 32,
-// 16, // tilek = 16,
-// we loop over K in
-// chucks of 16
-// // rather than
-// letting matmul
-// op run method
-// looping over K
-// // internally
-// choosing tileK
-// false,
-// false,
-// false);
+//                                                        16); // tilek = 16, we loop over K in chunks of 16 rather than
+//                                                             // letting the matmul op's run method loop over K
+//                                                             // internally, choosing tileK
//
-// matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+// matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
//
// constexpr int tilek = 16;
//
@@ -203,9 +175,9 @@
// }
// else
// {
-// auto tA = A.offset(0, tgid.y*64);
-// auto tB = B.offset(tgid.x*32, 0);
-// auto tC = C.offset(tgid.x*32, tgid.y*64);
+// auto tA = A.slice(0, tgid.y*64);
+// auto tB = B.slice(tgid.x*32, 0);
+// auto tC = C.slice(tgid.x*32, tgid.y*64);
//
// op.run(tA, tB, tC);
// }
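The body of the K loop itself is elided between the hunks above. Because `run` accumulates (C = A*B + C), the loop plausibly amounts to repeated `run` calls over K-slices; a hedged sketch, using only the `slice`/`run` calls shown in this header:

```c++
// Sketch of the elided K loop (tilek == 16, matching the descriptor above).
// Coordinates follow the examples: for A, x indexes K and y indexes M;
// for B, x indexes N and y indexes K.
auto tC = C.slice(tgid.x * 32, tgid.y * 64);
for (uint k = 0; k < K; k += tilek)
{
    auto tA = A.slice(k, tgid.y * 64);   // 64 x 16 chunk of A
    auto tB = B.slice(tgid.x * 32, k);   // 16 x 32 chunk of B
    matmulOp.run(tA, tB, tC);            // tC += tA * tB
}
```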
@@ -219,90 +191,80 @@
// performance and power. User can apply post processing in-register where GEMM
// output is computed using cooperative_tensor. Unlike tensor_handle,
// tensor_offset and tensor_inline which are non-owning meaning these are
-// wrappers around resource in device, threadgroup or thread addressspce,
+// wrappers around resource in device, threadgroup or thread address space,
// cooperative_tensor owns thread private data and divides the data for entire
-// tensor among threads (participating the scope of operation) in implementation
+// tensor among threads (participating in the scope of operation) in implementation
// defined manner. This thread private memory is allocated at construction of
// cooperative_tensor and deallocated when this cooperative_tensor goes out of
// scope. The layout of cooperative_tensor depends on operation, data type,
// number of threads in opscope with which op was created. Note that
// cooperative_tensor created from an op is only valid for threads that are part
-// of opscope on which op was created. Though the layout of cooperative_tensor
+// of execution scope on which op was created. Though the layout of cooperative_tensor
// is implementation defined, we provide accessor functions as shown in the
// example below
//
-// kernel void simpleMatMulCooperative(tensor<device half, dextents<int32_t,
-// 2>, tensor_handle> A,
-// tensor<device half, dextents<int32_t, 2>,
-// tensor_handle> B, tensor<device float,
-// dextents<int32_t, 2>, tensor_handle> C,
-// tensor<device half, dextents<int32_t, 2>,
-// tensor_handle> bias, constant uint& M, constant
-// uint& N, constant uint& K, uint2 tgid
-// [[threadgroup_position_in_grid]])
+// kernel void simpleMatMulCooperative(tensor<device half, dextents<int32_t, 2>> A,
+// tensor<device half, dextents<int32_t, 2>> B,
+// tensor<device float, dextents<int32_t, 2>> C,
+// tensor<device half, dextents<int32_t, 2>> bias,
+// constant uint& M, constant uint& N, constant uint& K,
+// uint2 tgid [[threadgroup_position_in_grid]])
// {
// constexpr auto matmulDescriptor = matmul2d_descriptor(64,
// 32,
-// 0,
-// false,
-// false,
-// false);
+// static_cast<int>(dynamic_extent));
//
-// matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+// matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
//
-// auto mA = A.offset(0, tgid.y*64);
-// auto mB = B.offset(tgid.x*32, 0);
-// auto mC = C.offset(tgid.x*32, tgid.y*64);
+// auto mA = A.slice(0, tgid.y*64);
+// auto mB = B.slice(tgid.x*32, 0);
+// auto mC = C.slice(tgid.x*32, tgid.y*64);
//
// // This creates cooperative destination tensor of float element type.
// // Since matmul op above descriptor is created with 4 SIMD-Groups,
-// coopeartive tensor will divide data among the threads on these
+// // cooperative tensor will divide data among the threads in these
// // 4 SIMD-Groups. The layout of data among lanes is implementation defined
-// and not all threads and even all elements within a thread need
-// // be valid. We provide valid element check shown below which developer
-// should use to guard their access to elements of cooperative_tensor
+//   // and not all threads, nor even all elements within a thread, need
+// // be valid. Use the valid element check shown below to guard
+// // access to elements of cooperative_tensor
//
-// auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA),
-// decltype(mB), float>();
+// auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
//
// // Loop over all the elements of cooperative_tensor thread elements owned
-// by "this" thread and initialize to zero.
-// // Its imperative for performance to include "unroll pragma" so compiler
-// fully unrolls the loop.
+// // by "this" thread and initialize to zero.
+//   // It is imperative for performance to include the "unroll pragma" so the compiler
+// // fully unrolls the loop.
//
// #pragma unroll full
-// for (uint16_t i = 0, i < cT.capacity(); ++i) {
-//
-// if(cT.mask(i))
+//   for (uint16_t i = 0; i < cT.get_capacity(); ++i) {
+// if(cT.get_mask(i))
// cT[i] = 0;
// }
//
-// // execute the operation. All threads computes the matmul cooperatively
-// and results are written to cooperative_tensor. op.run(mA, mB, cT);
+// // execute the operation. All threads compute the matmul cooperatively
+// // and results are written to cooperative_tensor.
+//   matmulOp.run(mA, mB, cT);
//
-// // create cooperative bias tensor with same layout as destination
-// cooperative_tensor of matmul auto biasT =
-// matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB),
-// float>();
+// // create cooperative bias tensor with same layout as destination
+// // cooperative_tensor of matmul
+// auto biasT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
//
-// // load data from bias tensor_handle into biasT cooperative_tensor using
-// layout and distribution of element among threads of scope
-// // on which matmul was created.
-// biasT.load(bias);
+// // load data from bias tensor_handle into biasT cooperative_tensor using
+//   // layout and distribution of elements among threads of the scope on which matmul was created.
+// biasT.load(bias);
//
// #pragma unroll full
-// for (uint16_t i = 0, i < cT.capacity(); ++i) {
+//   for (uint16_t i = 0; i < cT.get_capacity(); ++i) {
//
-// if(cT.mask(i)) {
+// if(cT.get_mask(i)) {
// //add bias
// cT[i] += biasT[i];
//
// // get the 2-dimensional local coordinate of this thread's i-th
-// element in destination local coordinate system (in this example
+// // element in destination local coordinate system (in this example
// // 32 x 64 tile).
-// auto ids = cT.multidimensional_indices(i);
-// cT[i] = foo(cT[i], idx); // do some operation based on coordinate
-// values
+// auto ids = cT.get_multidimensional_index(i);
+//       cT[i] = foo(cT[i], ids); // do some operation based on coordinate values
// }
// }
//
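The tail of this example (writing the post-processed tile back to C) is elided above. The `*_cooperative_destination_tensor_store_*` entry points added in the impl header below suggest a store counterpart to the `load` shown here; a hedged sketch:

```c++
// Assumption: cooperative_tensor exposes a store() mirroring load(), as the
// *_cooperative_destination_tensor_store_dv/tg_* symbols below imply.
cT.store(mC); // write this thread's share of the 64x32 tile back to C
```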
@@ -313,29 +275,28 @@
// Note on scope of operation
// ==========================
// A tensor operation may be executed on a single thread entirely or
-// cooperatively among a set of SIMD groups. We call these set of threads
-// "execution scop" of the tensor operation. A tensor ops must be created with
-// execution scope provided as template argument. All the threads in this
+// cooperatively among a set of SIMD groups. The set of threads is called the
+// "execution scope" of the tensor operation. A tensor op must be created with
+// an execution scope provided as template argument. All the threads in this
// execution scope must enter the run method i.e. call to run methods must be
// "execution scope" uniform. Use the following types to configure the execution
-// modes of each operation: metal::execution_thread: the operation will be run on a
-// single thread.
-// Fragment shaders only support this execution scope.
-// metal::execution_simdgroup - the operation will be run cooperatively by all threads in
-// this SIMD group.
-// May be used for finer control over tiling by slicing
-// tensors with SIMD IDs.
-// opscope_SIMD-Groups<N> - the operation will be executed cooperatively by N
-// SIMD groups.
-// Must be used when all threads in a threadgroup are
-// cooperatively performing the operation.
+// modes of each operation:
+// metal::execution_thread: The operation will be run on a single thread.
+// Fragment shaders only support this execution scope.
+// metal::execution_simdgroup: The operation will be run cooperatively by all
+// threads in the SIMD group. May be used for finer
+// control over tiling by slicing tensors with SIMD IDs.
+// metal::execution_simdgroups<N>: The operation will be executed cooperatively by N
+// SIMD groups. Must be used when all threads in a
+// threadgroup are cooperatively performing the operation.
+//
// It is undefined behavior if the number of SIMD groups dispatched does not
// match the number of SIMD groups that the operation was configured with.
//
-// Even though each thread in execution scope can potentially independently
-// enter and exit run method, developer cannot assume that threads in execution
-// scope are working completely independently i.e. tensor operation run
-// implementation may need for (for correctness or performance) synchronize
+// Even though each thread in the execution scope can potentially independently
+// enter and exit run method, the threads in the execution scope are working
+// do not necessarily operate completely independently. For example, the tensor
+// operation may need for (for correctness or performance) to synchronize
// among the threads in execution scope it was created with.
//
//
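To make the scope choice concrete, a short sketch contrasting the two ends of the spectrum described above (tile sizes are illustrative):

```c++
// Single-thread scope: the whole matmul runs on one thread, the only scope
// fragment shaders support.
constexpr auto smallDesc = matmul2d_descriptor(8, 8, 8);
matmul2d<smallDesc, execution_thread> perThreadOp;

// Threadgroup-wide scope: all four SIMD groups must reach run(), and the
// dispatch must launch exactly four SIMD groups per threadgroup.
constexpr auto tileDesc =
    matmul2d_descriptor(64, 32, static_cast<int>(dynamic_extent));
matmul2d<tileDesc, execution_simdgroups<4>> cooperativeOp;
```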
@@ -383,7 +344,7 @@
mode matmul_mode;
public:
- constexpr matmul2d_descriptor(int __m, int __n, int __k = dynamic_length_v<int>,
+ constexpr matmul2d_descriptor(int __m, int __n, int __k = static_cast<int>(metal::dynamic_extent),
bool __transpose_left = false,
bool __transpose_right = false,
bool __relaxed_precision = false,
@@ -404,9 +365,9 @@
{
static const constant ElementType sum_identity = (ElementType)0;
static const constant ElementType max_identity =
- metal::numeric_limits<ElementType>::lowest;
+ metal::numeric_limits<ElementType>::lowest();
static const constant ElementType min_identity =
- metal::numeric_limits<ElementType>::max;
+ metal::numeric_limits<ElementType>::max();
};
#include "__impl/MPPTensorOpsMatMul2dImpl.h"
diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h 2025-09-17 08:16:34
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h 2025-09-26 16:27:14
@@ -156,6 +156,26 @@
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype,
int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ int,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ int,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
@@ -211,6 +231,24 @@
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype,
int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ int threads);
extern "C" EXTERNALLY_DEFINED_ATTR size_t
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_data_size(
@@ -359,6 +397,26 @@
int,
__tensor_ops_detail::__tensor_ops_datatype leftDataType,
__tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ int,
+ int,
+ __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+ __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ int,
+ int,
+ __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+ __tensor_ops_detail::__tensor_ops_datatype rightDataType);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_f16(
@@ -420,6 +478,26 @@
int,
__tensor_ops_detail::__tensor_ops_datatype leftDataType,
__tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ int,
+ int,
+ __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+ __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+ int,
+ int,
+ __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+ __tensor_ops_detail::__tensor_ops_datatype rightDataType);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f16(
@@ -448,6 +526,15 @@
__reduction_operation,
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__thread_void_t,
+ bfloat,
+ __reduction_operation,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f16(
@@ -476,6 +563,15 @@
__reduction_operation,
__tensor_ops_detail::__tensor_ops_datatype,
__tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_b16(
+ __matmul2d_descriptor,
+ __tensor_ops_detail::__const_thread_void_t,
+ __tensor_ops_detail::__thread_void_t,
+ bfloat,
+ __reduction_operation,
+ __tensor_ops_detail::__tensor_ops_datatype,
+ __tensor_ops_detail::__tensor_ops_datatype);
extern "C" EXTERNALLY_DEFINED_ATTR void
__tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
@@ -2640,6 +2736,287 @@
__tensor_ops_detail::__tensor_ops_tensor_descriptor_type
destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+
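
The suffix triples in these symbol names encode, for the left, right, and destination operands in order, an address space and an element type: `dv`, `th`, and `tg` correspond to the `device`, `thread`, and `threadgroup` address spaces, while `f16`, `b16`, `f32`, `i8`, and `i32` correspond to `half`, `bfloat`, `float`, `int8_t`, and `int32_t`. So `..._th_b16_dv_i8_th_f32` is the entry point for a thread-space `bfloat` left operand, a device-space `int8_t` right operand, and a thread-space `float` destination. The template code below selects among these symbols with `if constexpr` over address-space traits; the following is a minimal, self-contained C++17 model of that dispatch pattern, with hypothetical stand-in names rather than the real `__tensor_ops_detail` traits:

```c++
#include <cstdio>
#include <type_traits>

// Stand-ins for the Metal address spaces (illustration only).
struct device_space {};
struct thread_space {};

// Stand-ins for two of the externally defined entry points above.
void run_dv_b16_dv_b16() { std::puts("device bfloat x device bfloat"); }
void run_th_b16_dv_b16() { std::puts("thread bfloat x device bfloat"); }

template <typename leftSpace, typename rightSpace>
void dispatch()
{
  if constexpr (std::is_same_v<leftSpace, device_space> &&
                std::is_same_v<rightSpace, device_space>)
    run_dv_b16_dv_b16();
  else if constexpr (std::is_same_v<leftSpace, thread_space> &&
                     std::is_same_v<rightSpace, device_space>)
    run_th_b16_dv_b16();
  else
    // Dependent-false static_assert, as in the header's fallback branches.
    static_assert(!sizeof(leftSpace), "Unsupported address space");
}

int main()
{
  dispatch<device_space, device_space>();
  dispatch<thread_space, device_space>();
}
```
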
template <__matmul2d_descriptor descriptor,
__matmul2d_cooperative_operand_index operand_index, typename scope,
typename left_operand, typename right_operand, typename element_type,
@@ -2652,9 +3029,10 @@
"only destination can be cooperative tensor");
static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
__tensor_ops_detail::__is_same_v<element_type, half> ||
+ __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
__tensor_ops_detail::__is_same_v<element_type, int32_t>,
"cooperative tensor data type can only be one of "
- "float/half/int32_t");
+ "float/half/bfloat/int32_t");
static constant constexpr __tensor_ops_detail::__rank_t rank = 2;
using element_t = element_type;
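
This rank-2 gate, like the rank-1 gate further down, now admits `bfloat` alongside `float`, `half`, and `int32_t` as a cooperative tensor element type. A compile-time model of the gate in plain C++17 follows; `half` and `bfloat` are stand-in structs, since host C++ has no such types, and the trait name is invented for illustration:

```c++
#include <cstdint>
#include <type_traits>

struct half {};   // stand-in for Metal's half
struct bfloat {}; // stand-in for Metal's bfloat

// Hypothetical name; mirrors the static_assert condition above.
template <typename element_type>
inline constexpr bool is_valid_cooperative_element =
    std::is_same_v<element_type, float> ||
    std::is_same_v<element_type, half> ||
    std::is_same_v<element_type, bfloat> ||
    std::is_same_v<element_type, int32_t>;

static_assert(is_valid_cooperative_element<bfloat>,
              "bfloat is accepted as of this change");
static_assert(!is_valid_cooperative_element<double>,
              "double is still rejected");

int main() {}
```
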
@@ -2827,6 +3205,21 @@
static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_b16(
+ desc, storage, source, sourceDescType, sourceRank, leftDataType,
+ rightDataType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_b16(
+ desc, storage, source, sourceDescType, sourceRank, leftDataType,
+ rightDataType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
else
static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
"Unsupported type");
@@ -2913,6 +3306,22 @@
static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_b16(
+ desc, storage, destination, destinationDescType, leftDataType,
+ rightDataType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_b16(
+ desc, storage, destination, destinationDescType, leftDataType,
+ rightDataType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
else
static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
"Unsupported type");
@@ -3089,9 +3498,10 @@
{
static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
__tensor_ops_detail::__is_same_v<element_type, half> ||
+ __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
__tensor_ops_detail::__is_same_v<element_type, int32_t>,
"cooperative tensor data type can only be one of "
- "float/half/int32_t");
+ "float/half/bfloat/int32_t");
static constant constexpr __tensor_ops_detail::__rank_t rank = 1;
using element_t = element_type;
@@ -3259,6 +3669,19 @@
static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_b16(
+ desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ sourcePtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_b16(
+ desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+ "Unsupported address space");
+ }
else
static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
"Unsupported type");
@@ -3339,6 +3762,20 @@
static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_b16(
+ desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+ destinationPtrType>)
+ __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_b16(
+ desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+ "Unsupported address space");
+ }
else
static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
"Unsupported type");
@@ -4460,6 +4897,552 @@
__tensor_ops_detail::__assert_false_v<destinationPtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
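
Each new type combination in this hunk expands into the same eight branches: left, right, and destination can each live in the `device` or `thread` address space, giving 2^3 entry points per combination, which is what grows this hunk from 6 context lines to 552. A throwaway generator for the all-`bfloat` case prints the same eight symbols dispatched in the block above, modulo ordering (illustrative only):

```c++
#include <cstdio>

int main()
{
  // "dv" = device, "th" = thread, as in the symbol suffixes above.
  const char *spaces[] = {"dv", "th"};
  for (int l = 0; l < 2; ++l)
    for (int r = 0; r < 2; ++r)
      for (int d = 0; d < 2; ++d)
        std::printf("__tensorops_impl_matmul2d_op_run_single_thread_"
                    "%s_b16_%s_b16_%s_b16\n",
                    spaces[l], spaces[r], spaces[d]);
}
```
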
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
else
static_assert(
__tensor_ops_detail::__assert_false_v<destinationValueType>,
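Every block in these hunks follows the same compile-time dispatch idiom: an outer `if constexpr` chain matches the left/right/destination element types via `__is_same_v`, an inner chain matches the operands' address spaces (`__is_device_addrspace_v`, `__is_thread_addrspace_v`, `__is_threadgroup_addrspace_v`), the matching specialized entry point is called, and the final `else` arm uses a dependent-false `static_assert` so that compilation fails only for combinations that are actually instantiated. The following is a minimal, standalone C++17 sketch of that idiom; the type and function names are hypothetical stand-ins for illustration, not the SDK's real symbols.

```cpp
#include <cstdint>
#include <cstdio>
#include <type_traits>

// Dependent-false helper: evaluates to false only when the template is
// instantiated, so the fallback static_assert below fires exclusively for
// combinations that are actually reached (this mirrors __assert_false_v).
template <typename>
inline constexpr bool dependent_false_v = false;

// Host-side placeholder standing in for Metal's bfloat element type.
struct bf16 {};

// Hypothetical stand-ins for the specialized entry points in the header.
void run_b16_b16_f32() { std::puts("bfloat x bfloat -> float"); }
void run_i8_i8_i32()   { std::puts("int8   x int8   -> int32"); }

// One `if constexpr` chain per (Left, Right, Dest) triple; the header nests
// a second chain on the operands' address spaces inside each of these arms.
template <typename Left, typename Right, typename Dest>
void dispatch_matmul()
{
  if constexpr (std::is_same_v<Left, bf16> &&
                std::is_same_v<Right, bf16> &&
                std::is_same_v<Dest, float>)
    run_b16_b16_f32();
  else if constexpr (std::is_same_v<Left, int8_t> &&
                     std::is_same_v<Right, int8_t> &&
                     std::is_same_v<Dest, int32_t>)
    run_i8_i8_i32();
  else
    static_assert(dependent_false_v<Dest>, "Unsupported type combination");
}

int main()
{
  dispatch_matmul<bf16, bf16, float>();       // resolved at compile time
  dispatch_matmul<int8_t, int8_t, int32_t>();
  // dispatch_matmul<float, float, bf16>();   // would trip the static_assert
  return 0;
}
```

The hunk that follows applies the same idiom to the multi-threaded `run` overload, recognizable by the trailing `threads` argument, and dispatches over `device`/`threadgroup` operand address spaces rather than the `device`/`thread` pairs handled by the single-thread overload above.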
@@ -5394,6 +6377,552 @@
__tensor_ops_detail::__assert_false_v<destinationPtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+ __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
else
static_assert(
__tensor_ops_detail::__assert_false_v<destinationValueType>,
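
The hunk above extends the non-cooperative `run` dispatch to the new bfloat combinations. The pattern is uniform: an `if constexpr` chain first matches the left, right, and destination element types, then the three operand address spaces, and forwards to a mangled implementation symbol in which `dv`/`tg` encode device or threadgroup memory and `i8`/`f16`/`b16`/`f32` encode the element type (so `dv_b16_tg_f16_dv_f32` is device bfloat times threadgroup half accumulated into device float). The following is a minimal, self-contained C++17 sketch of the same compile-time dispatch technique; it is hypothetical host-side code, not the framework's implementation:

```cpp
#include <cstdint>
#include <cstdio>
#include <type_traits>

struct bfloat16 {}; // hypothetical host-side stand-in for Metal's bfloat

// Dependent false, mirroring __tensor_ops_detail::__assert_false_v: the
// static_assert only fires for combinations no branch accepts.
template <class T> inline constexpr bool always_false = false;

template <class A, class B, class C>
void run() {
    // Discarded branches are never instantiated, so each instantiation of
    // run<> references exactly one implementation "symbol".
    if constexpr (std::is_same_v<A, bfloat16> &&
                  std::is_same_v<B, bfloat16> &&
                  std::is_same_v<C, float>)
        std::puts("b16 * b16 -> f32 path");
    else if constexpr (std::is_same_v<A, bfloat16> &&
                       std::is_same_v<B, std::int8_t> &&
                       std::is_same_v<C, float>)
        std::puts("b16 * i8 -> f32 path");
    else
        static_assert(always_false<C>, "Unsupported data type combination");
}

int main() {
    run<bfloat16, bfloat16, float>();    // selects the first branch
    run<bfloat16, std::int8_t, float>(); // selects the second branch
}
```
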
@@ -5881,6 +7410,272 @@
__tensor_ops_detail::__assert_false_v<destinationPtrType>,
"Unsupported address space");
}
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
+ else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+ __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+ __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+ {
+ if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+ __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+ __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+ else
+ static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+ }
else
static_assert(
__tensor_ops_detail::__assert_false_v<destinationValueType>,
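
The cooperative-destination hunk above follows the same scheme but branches only on the left and right operand address spaces, four cases instead of eight, because the destination is a `cooperative_tensor` held in thread-scope storage rather than in device or threadgroup memory (note the impl calls also drop the `destinationDescType` argument). A short C++17 sketch of the reduced branch count, again hypothetical host-side code rather than the framework's:

```cpp
#include <cstdio>

enum class addrspace { device, threadgroup };

// Only the two operand address spaces vary; the cooperative destination
// has no device/threadgroup flavor, so four branches cover everything.
template <addrspace Left, addrspace Right>
void run_cooperative() {
    if constexpr (Left == addrspace::device && Right == addrspace::device)
        std::puts("dv_*_dv_* path");
    else if constexpr (Left == addrspace::threadgroup &&
                       Right == addrspace::device)
        std::puts("tg_*_dv_* path");
    else if constexpr (Left == addrspace::device &&
                       Right == addrspace::threadgroup)
        std::puts("dv_*_tg_* path");
    else
        std::puts("tg_*_tg_* path");
}

int main() {
    run_cooperative<addrspace::device, addrspace::threadgroup>();
}
```
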
@@ -5938,6 +7733,9 @@
else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, float>)
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
desc, src, dst, identity, op, leftDataType, rightDataType);
+ else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, bfloat>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_b16(
+ desc, src, dst, identity, op, leftDataType, rightDataType);
else
static_assert(__tensor_ops_detail::__assert_false_v<ElementType>,
"Unsupported type");
@@ -5992,6 +7790,9 @@
else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, float>)
__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f32(
desc, src, dst, identity, op, leftDataType, rightDataType);
+ else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, bfloat>)
+ __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_b16(
+ desc, src, dst, identity, op, leftDataType, rightDataType);
else
static_assert(__tensor_ops_detail::__assert_false_v<ElementType>,
"Unsupported type");
@@ -5999,8 +7800,8 @@
template <class SrcElementType, class DstElementType, class SrcExtents, class DstExtents, class SrcLayout, class DstLayout>
inline bool __is_iterator_compatible(
- thread metal::cooperative_tensor<SrcElementType, SrcExtents, SrcLayout> &sourceT,
- thread metal::cooperative_tensor<DstElementType, DstExtents, DstLayout> &destT)
+ const thread metal::cooperative_tensor<SrcElementType, SrcExtents, SrcLayout> &sourceT,
+ const thread metal::cooperative_tensor<DstElementType, DstExtents, DstLayout> &destT)
{
if (!SrcLayout::is_matmul2d_cooperative_destination_layout ||
!DstLayout::is_matmul2d_reduction_cooperative_destination_layout ||
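
Besides the bfloat additions, beta 2 const-qualifies the `thread` references taken by `__is_iterator_compatible`, so the compatibility query can now be applied to read-only cooperative tensors, consistent with a check that only inspects layout properties. A trivial C++ illustration of what the const-qualification buys callers (hypothetical `Tensor` type, not the framework's):

```cpp
// A query taking const references binds to read-only objects; the previous
// non-const signature would reject them.
struct Tensor { int rank = 2; };

bool compatible(const Tensor& a, const Tensor& b) { return a.rank == b.rank; }

int main() {
    const Tensor src{}, dst{}; // read-only: would not bind to Tensor&
    return compatible(src, dst) ? 0 : 1;
}
```
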
diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h 2025-09-08 05:52:50
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h 2025-09-26 17:51:19
@@ -130,7 +130,7 @@
template <typename T, typename U = __tensor_ops_detail::__enable_if_t<__tensor_ops_detail::__is_integral_v<T>>>
struct dynamic_length
{
- static constexpr constant T value = metal::numeric_limits<T>::max();
+ static constexpr constant T value = metal::numeric_limits<T>::max();
};
template <typename T, typename U = __tensor_ops_detail::__enable_if_t<__tensor_ops_detail::__is_integral_v<T>>>