
MetalPerformancePrimitives iOS xcode26.1 b2


# MetalPerformancePrimitives.framework

diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	2025-09-08 05:44:05
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/MPPTensorOpsMatMul2d.h	2025-09-26 16:15:07
@@ -8,22 +8,36 @@
 //             C = A*B + C;
 // A and B can be tensor_handle, tensor_offset, and tensor_inline.
 // C can be tensor_handle, tensor_offset, tensor_inline or cooperative_tensor.
-// Data type combinations supported by this operation are as follows
-//   A           B         C
+// Data type combinations supported by this operation are as follows:
+//
+//  A          B         C
+//  ---------------------------
+//  half       half      half
+//  half       int8_t    half
+//  int8_t     half      half
+//  half       half      float
+//  half       float     float
+//  half       int8_t    float
+//  float      half      float
+//  float      float     float
+//  float      int8_t    float
+//  int8_t     half      float
+//  int8_t     float     float
 //  int8_t     int8_t    int32_t
-//  int8_t     int8_t    float
-//  int8_t     int8_t    half
-//  uint8_t    int8_t    int32_t
-//  uint8_t    int8_t    float
-//  uint8_t    int8_t    half
-//  int8_t     uint8_t   int32_t
-//  int8_t     uint8_t   float
-//  int8_t     uint8_t   half
-//  uint8_t    uint8_t   int32_t
-//  uint8_t    uint8_t   float
-//  uint8_t    uint8_t   half
-//   half       half     float
-//   half       half     half
+//  bfloat     bfloat    bfloat
+//  bfloat     bfloat    float
+//  bfloat     float     float
+//  bfloat     int8_t    bfloat
+//  bfloat     int8_t    float
+//  float      bfloat    float
+//  int8_t     bfloat    bfloat
+//  int8_t     bfloat    float
+//  bfloat     half      bfloat
+//  bfloat     half      half
+//  bfloat     half      float
+//  half       bfloat    bfloat
+//  half       bfloat    half
+//  half       bfloat    float
 //
 // Basic usage is in the following example which takes M x K matrix A of type
 // half, K x N matrix B of type half, both in device memory and produces M x N
@@ -43,72 +57,43 @@
 //    [encoder dispatchThreadgroups:threadgroups
 //    threadsPerThreadgroup:MTLSizeMake(simdgroupWidth*4, 1, 1)];
 //
-// kernel void simpleMatMul(tensor<device half,  dextents<int32_t, 2>,
-// tensor_handle> A,
-//                          tensor<device half,  dextents<int32_t, 2>,
-//                          tensor_handle> B, tensor<device float,
-//                          dextents<int32_t, 2>, tensor_handle> C, constant
-//                          uint& M, constant uint& N, constant uint& K, uint2
-//                          tgid [[threadgroup_position_in_grid]])
+// kernel void simpleMatMul(tensor<device half,  dextents<int32_t, 2>> A,
+//                          tensor<device half,  dextents<int32_t, 2>> B,
+//                          tensor<device float, dextents<int32_t, 2>> C,
+//                          constant uint& M, constant uint& N, constant uint& K,
+//                          uint2 tgid [[threadgroup_position_in_grid]])
 // {
 //     // descriptor to create matmul operation that does 64x32 times 32x32
-//     producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
-//     //m outer dim of local tile
-//                                                           32, //n outer dim
-//                                                           of local tile
-//                                                            0, //k inner
-//                                                            dimension. 0 means
-//                                                               //operation
-//                                                               will read K
-//                                                               from
-//                                                               //input tensor
-//                                                               //K =
-//                                                               A.extents().extent(0)
-//                                                               or
-//                                                               B.extents().extent(1)
-//                                                               for NN
-//                                                               //K =
-//                                                               A.extents().extent(0)
-//                                                               or
-//                                                               B.extents().extent(0)
-//                                                               for NT
-//                                                               //and so on..
-//                                                        false, //transpse_left
-//                                                        = false for NN and NT
-//                                                        and true for TN and TT
-//                                                        false,
-//                                                        //transpse_right =
-//                                                        false for NN and TN
-//                                                        and true for NT and TT
-//                                                        false,
-//                                                        //relaxed_precision =
-//                                                        false, set it to true
-//                                                        to allow
-//                                                        implementation
-//                                                               //sacrifice
-//                                                               accurancy for
-//                                                               performance.
-//                                                          );
+//     // producing 64x32
+//     constexpr auto matmulDescriptor =
+//         matmul2d_descriptor(64, // m outer dim of local tile
+//                             32, // n outer dim of local tile
+//                             static_cast<int>(dynamic_extent), // k inner dimension. dynamic_extent means the operation will read K from the input tensor
+//                                                               // K = A.extents().extent(0) or B.extents().extent(1) for NN
+//                                                               // K = A.extents().extent(0) or B.extents().extent(0) for NT
+//                                                               // and so on.
+//                             false,  // transpose_left = false for NN and NT and true for TN and TT
+//                             false,  // transpose_right = false for NN and TN and true for NT and TT
+//                             false); // relaxed_precision = false; set it to true to allow the implementation
+//                                     // to sacrifice accuracy for performance.
 //
 //    // create matmul op from above descriptor with 4 SIMD-Groups. All 4
-//    SIMD-Groups in this threadgroup will execute this
+//    // SIMD-Groups in this threadgroup will execute this
 //    // matmul cooperatively. More on this scope below.
-//     matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+//    matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
 //
 //    // Following three lines of code create appropriate slice for this thread
-//    group to work on.
-//    // E.g. A.offset below creates a tensor<device half, dextents<int32_t, 2>,
-//    tensor_offset>
+//    // group to work on. E.g. A.slice below creates a
+//    // tensor<device half, dextents<int32_t, 2>, tensor_offset>
 //    // which has the same extents as the original tensor A but origin shifted to
-//    (0,tgid.y*64) i.e.
-//    // mA[x,y] == A[x,tgid.y*64+y]
+//    // (0,tgid.y*64) i.e. mA[x,y] == A[x,tgid.y*64+y]
 //
-//    auto mA = A.offset(0, tgid.y*64);
-//    auto mB = B.offset(tgid.x*32, 0);
-//    auto mC = C.offset(tgid.x*32, tgid.y*64);
+//    auto mA = A.slice(0, tgid.y*64);
+//    auto mB = B.slice(tgid.x*32, 0);
+//    auto mC = C.slice(tgid.x*32, tgid.y*64);
 //
-//     // execute the operation. Assumes C is is initialized to zero.
-//     op.run(mA, mB, mC);
+//    // execute the operation. Assumes C is initialized to zero.
+//    matmulOp.run(mA, mB, mC);
 // }
 //
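For quick reference, here is the beta 2 example above reassembled as a standalone kernel. This is a sketch, not header text: the `#include` path, the `mpp::tensor_ops` namespace, and the omission of the M/N/K buffer arguments (K is read from the tensor extents) are assumptions.

```metal
#include <metal_stdlib>
#include <metal_tensor>
#include <MetalPerformancePrimitives/MetalPerformancePrimitives.h> // assumed umbrella header

using namespace metal;
using namespace mpp::tensor_ops; // assumed namespace

kernel void simpleMatMul(tensor<device half,  dextents<int32_t, 2>> A,
                         tensor<device half,  dextents<int32_t, 2>> B,
                         tensor<device float, dextents<int32_t, 2>> C,
                         uint2 tgid [[threadgroup_position_in_grid]])
{
    // 64x32 output tile per threadgroup; dynamic_extent makes the op read K from the inputs.
    constexpr auto desc = matmul2d_descriptor(64, 32, static_cast<int>(dynamic_extent));

    // Four SIMD groups cooperate, matching the simdgroupWidth*4 dispatch shown above.
    matmul2d<desc, execution_simdgroups<4>> matmulOp;

    // Shift each operand's origin to this threadgroup's tile.
    auto mA = A.slice(0, tgid.y * 64);
    auto mB = B.slice(tgid.x * 32, 0);
    auto mC = C.slice(tgid.x * 32, tgid.y * 64);

    matmulOp.run(mA, mB, mC); // C = A*B + C; assumes C was zero-initialized
}
```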
 // Above matrix multiplication implementation will do edge checking for all
@@ -117,23 +102,19 @@
 // bounds check. In high performance code we can avoid edge checking for inside
 // thread groups and get better performance
 //
-// kernel void matMul(tensor<device half,  dextents<int32_t, 2>, tensor_handle>
-// A,
-//                    tensor<device half,  dextents<int32_t, 2>, tensor_handle>
-//                    B, tensor<device float, dextents<int32_t, 2>,
-//                    tensor_handle> C, constant uint& M, constant uint& N,
-//                    constant uint& K, uint2 tgid
-//                    [[threadgroup_position_in_grid]])
+// kernel void matMul(tensor<device half,  dextents<int32_t, 2>> A,
+//                    tensor<device half,  dextents<int32_t, 2>> B,
+//                    tensor<device float, dextents<int32_t, 2>> C,
+//                    constant uint& M, constant uint& N,
+//                    constant uint& K, uint2 tgid [[threadgroup_position_in_grid]])
 // {
 //     // descriptor to create matmul operation that does 64x32 times 32x32
-//     producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
+//     // producing 64x32
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(64,
 //                                                           32,
-//                                                            0,
-//                                                        false,
-//                                                        false,
-//                                                        false);
+//                                                           static_cast<int>(dynamic_extent));
 //
-//     matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
 //
 //    // Threadgroup fully inside bounds in both outer dimensions M and N.
 //    if ( tgid.x*64 + 63 < M && tgid.y*32 + 31 < N)
@@ -146,41 +127,32 @@
 //    }
 //    else
 //    {
-//      auto tA = A.offset(0, tgid.y*64);
-//      auto tB = B.offset(tgid.x*32, 0);
-//      auto tC = C.offset(tgid.x*32, tgid.y*64);
+//      auto tA = A.slice(0, tgid.y*64);
+//      auto tB = B.slice(tgid.x*32, 0);
+//      auto tC = C.slice(tgid.x*32, tgid.y*64);
 //
 //      matmulOp.run(tA, tB, tC);
 //    }
 // }
 //
-// User can also take ownership of looping over in reduction or k-dimension by
+// User can also take ownership of looping over the reduction or k-dimension by
 // choosing an appropriate chunk size in k (called k-tile or tilek). For the following
-// example, we choose 16. kernel void matMulKLoop(tensor<device half,
-// dextents<int32_t, 2>, tensor_handle> A,
-//                         tensor<device half,  dextents<int32_t, 2>,
-//                         tensor_handle> B, tensor<device float,
-//                         dextents<int32_t, 2>, tensor_handle> C, constant
-//                         uint& M, constant uint& N, constant uint& K, uint2
-//                         tgid [[threadgroup_position_in_grid]])
+// example, we choose 16.
+// kernel void matMulKLoop(tensor<device half, dextents<int32_t, 2>> A,
+//                         tensor<device half, dextents<int32_t, 2>> B,
+//                         tensor<device float, dextents<int32_t, 2>> C,
+//                         constant uint& M, constant uint& N, constant uint& K,
+//                         uint2 tgid [[threadgroup_position_in_grid]])
 // {
 //     // descriptor to create matmul operation that does 64x32 times 32x32
-//     producing 64x32 constexpr auto matmulDescriptor = matmul2d_descriptor(64,
+//     // producing 64x32
+//     constexpr auto matmulDescriptor = matmul2d_descriptor(64,
 //                                                           32,
-//                                                           16, // tilek = 16,
-//                                                           we loop over K in
-//                                                           chucks of 16
-//                                                               // rather than
-//                                                               letting matmul
-//                                                               op run method
-//                                                               looping over K
-//                                                               // internally
-//                                                               choosing tileK
-//                                                        false,
-//                                                        false,
-//                                                        false);
+//                                                           16); // tilek = 16, we loop over K in chunks of 16 rather than
+//                                                                // letting the matmul op's run method loop over K
+//                                                                // internally and choose tileK itself
 //
-//     matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
 //
 //     constexpr int tilek = 16;
 //
@@ -203,9 +175,9 @@
 //    }
 //    else
 //    {
-//      auto tA = A.offset(0, tgid.y*64);
-//      auto tB = B.offset(tgid.x*32, 0);
-//      auto tC = C.offset(tgid.x*32, tgid.y*64);
+//      auto tA = A.slice(0, tgid.y*64);
+//      auto tB = B.slice(tgid.x*32, 0);
+//      auto tC = C.slice(tgid.x*32, tgid.y*64);
 //
 //      matmulOp.run(tA, tB, tC);
 //    }
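The chunked K loop itself is elided between the hunks above; the comments describe stepping K in tilek-sized chunks and letting each run accumulate into the same destination. A hypothetical reconstruction of that fragment, following the slice convention of the earlier examples (not the header's own text):

```metal
// Hypothetical k-loop fragment: each run() accumulates into the same tile (C = A*B + C).
constexpr int tilek = 16;
for (uint k = 0; k < K; k += tilek) {
    auto kA = A.slice(k, tgid.y * 64);  // advance the A tile's origin along k
    auto kB = B.slice(tgid.x * 32, k);  // advance the B tile's origin along k
    matmulOp.run(kA, kB, mC);           // accumulate this chunk's partial product
}
```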
@@ -219,90 +191,80 @@
 // performance and power. Users can apply post-processing in-register where the GEMM
 // output is computed using cooperative_tensor. Unlike tensor_handle,
 // tensor_offset, and tensor_inline, which are non-owning, meaning they are
-// wrappers around resource in device, threadgroup or thread addressspce,
+// wrappers around a resource in the device, threadgroup, or thread address space,
 // cooperative_tensor owns thread-private data and divides the data for the entire
-// tensor among threads (participating the scope of operation) in implementation
+// tensor among threads (participating in the scope of operation) in an implementation-
 // defined manner. This thread private memory is allocated at construction of
 // cooperative_tensor and deallocated when this cooperative_tensor goes out of
 // scope. The layout of cooperative_tensor depends on operation, data type,
 // number of threads in opscope with which op was created. Note that
 // cooperative_tensor created from an op is only valid for threads that are part
-// of opscope on which op was created. Though the layout of cooperative_tensor
+// of the execution scope on which the op was created. Though the layout of cooperative_tensor
 // is implementation defined, we provide accessor functions as shown in the
 // example below
 //
-// kernel void simpleMatMulCooperative(tensor<device half,  dextents<int32_t,
-// 2>, tensor_handle> A,
-//                          tensor<device half,  dextents<int32_t, 2>,
-//                          tensor_handle> B, tensor<device float,
-//                          dextents<int32_t, 2>, tensor_handle> C,
-//                          tensor<device half, dextents<int32_t, 2>,
-//                          tensor_handle> bias, constant uint& M, constant
-//                          uint& N, constant uint& K, uint2 tgid
-//                          [[threadgroup_position_in_grid]])
+// kernel void simpleMatMulCooperative(tensor<device half, dextents<int32_t, 2>> A,
+//                                     tensor<device half, dextents<int32_t, 2>> B,
+//                                     tensor<device float, dextents<int32_t, 2>> C,
+//                                     tensor<device half, dextents<int32_t, 2>> bias,
+//                                     constant uint& M, constant uint& N, constant uint& K,
+//                                     uint2 tgid [[threadgroup_position_in_grid]])
 // {
 //     constexpr auto matmulDescriptor = matmul2d_descriptor(64,
 //                                                           32,
-//                                                            0,
-//                                                        false,
-//                                                        false,
-//                                                        false);
+//                                                           static_cast<int>(dynamic_extent));
 //
-//     matmul2d<matmulDescriptor, opscope_SIMD-Groups<4>> matmulOp;
+//     matmul2d<matmulDescriptor, execution_simdgroups<4>> matmulOp;
 //
-//    auto mA = A.offset(0, tgid.y*64);
-//    auto mB = B.offset(tgid.x*32, 0);
-//    auto mC = C.offset(tgid.x*32, tgid.y*64);
+//    auto mA = A.slice(0, tgid.y*64);
+//    auto mB = B.slice(tgid.x*32, 0);
+//    auto mC = C.slice(tgid.x*32, tgid.y*64);
 //
 //    // This creates a cooperative destination tensor of float element type.
 //    // Since the matmul op from the above descriptor is created with 4 SIMD-Groups,
-//    coopeartive tensor will divide data among the threads on these
+//    // cooperative tensor will divide data among the threads in these
 //    // 4 SIMD-Groups. The layout of data among lanes is implementation defined
-//    and not all threads and even all elements within a thread need
-//    // be valid. We provide valid element check shown below which developer
-//    should use to guard their access to elements of cooperative_tensor
+//    // and not all threads, nor even all elements within a thread, need
+//    // be valid. Use the valid-element check shown below to guard
+//    // access to elements of cooperative_tensor.
 //
-//    auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA),
-//    decltype(mB), float>();
+//    auto cT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
 //
 //    // Loop over all the cooperative_tensor elements owned
-//    by "this" thread and initialize to zero.
-//    // Its imperative for performance to include "unroll pragma" so compiler
-//    fully unrolls the loop.
+//    // by "this" thread and initialize to zero.
+//    // It is imperative for performance to include the "unroll pragma" so the compiler
+//    // fully unrolls the loop.
 //
 //    #pragma unroll full
-//    for (uint16_t i = 0, i < cT.capacity(); ++i) {
-//
-//      if(cT.mask(i))
+//    for (uint16_t i = 0; i < cT.get_capacity(); ++i) {
+//      if(cT.get_mask(i))
 //        cT[i] = 0;
 //    }
 //
-//    // execute the operation. All threads computes the matmul cooperatively
-//    and results are written to cooperative_tensor. op.run(mA, mB, cT);
+//    // execute the operation. All threads compute the matmul cooperatively
+//    // and results are written to the cooperative_tensor.
+//    matmulOp.run(mA, mB, cT);
 //
-//   // create cooperative bias tensor with same layout as destination
-//   cooperative_tensor of matmul auto biasT =
-//   matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB),
-//   float>();
+//    // create cooperative bias tensor with same layout as destination
+//    // cooperative_tensor of matmul
+//    auto biasT = matmulOp.get_destination_cooperative_tensor<decltype(mA), decltype(mB), float>();
 //
-//   // load data from bias tensor_handle into biasT cooperative_tensor using
-//   layout and distribution of element among threads of scope
-//   // on which matmul was created.
-//   biasT.load(bias);
+//    // load data from the bias tensor_handle into the biasT cooperative_tensor using the
+//    // layout and distribution of elements among the threads of the scope on which the matmul was created.
+//    biasT.load(bias);
 //
 //    #pragma unroll full
-//    for (uint16_t i = 0, i < cT.capacity(); ++i) {
+//    for (uint16_t i = 0; i < cT.get_capacity(); ++i) {
 //
-//      if(cT.mask(i)) {
+//      if(cT.get_mask(i)) {
 //        //add bias
 //        cT[i] += biasT[i];
 //
 //        // get the 2-dimensional local coordinate of this thread's i-th
-//        element in destination local coordinate system (in this example
+//        // element in destination local coordinate system (in this example
 //        // 32 x 64 tile).
-//        auto ids = cT.multidimensional_indices(i);
-//        cT[i] = foo(cT[i], idx); // do some operation based on coordinate
-//        values
+//        auto idx = cT.get_multidimensional_index(i);
+//        cT[i] = foo(cT[i], idx); // do some operation based on coordinate values
 //      }
 //    }
 //
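Once results live in the cooperative_tensor, the same masked loop supports any elementwise epilogue. A hypothetical sketch; the `store()` call is an assumption, mirroring `biasT.load(bias)` above and the `_store_` entry points in the implementation header below:

```metal
// Hypothetical epilogue: ReLU in registers, then write the tile back out.
#pragma unroll full
for (uint16_t i = 0; i < cT.get_capacity(); ++i) {
    if (cT.get_mask(i))
        cT[i] = max(cT[i], 0.0f); // in-register post-processing
}
cT.store(mC); // assumed counterpart of load(): distributes thread-private elements to mC
```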
@@ -313,29 +275,28 @@
 // Note on scope of operation
 // ==========================
 // A tensor operation may be executed on a single thread entirely or
-// cooperatively among a set of SIMD groups. We call these set of threads
-// "execution scop" of the tensor operation. A tensor ops must be created with
-// execution scope provided as template argument. All the threads in this
+// cooperatively among a set of SIMD groups. The set of threads is called the
+// "execution scope" of the tensor operation. A tensor op must be created with
+// an execution scope provided as template argument. All the threads in this
 // execution scope must enter the run method, i.e., calls to the run method must be
 // "execution scope" uniform. Use the following types to configure the execution
-// modes of each operation: metal::execution_thread: the operation will be run on a
-// single thread.
-//                 Fragment shaders only support this execution scope.
-// metal::execution_simdgroup - the operation will be run cooperatively by all threads in
-// this SIMD group.
-//                     May be used for finer control over tiling by slicing
-//                     tensors with SIMD IDs.
-// opscope_SIMD-Groups<N> - the operation will be executed cooperatively by N
-// SIMD groups.
-//                          Must be used when all threads in a threadgroup are
-//                          cooperatively performing the operation.
+// modes of each operation:
+//     metal::execution_thread: The operation will be run on a single thread.
+//                              Fragment shaders only support this execution scope.
+//     metal::execution_simdgroup: The operation will be run cooperatively by all
+//                                 threads in the SIMD group. May be used for finer
+//                                 control over tiling by slicing tensors with SIMD IDs.
+//     metal::execution_simdgroups<N>: The operation will be executed cooperatively by N
+//                                     SIMD groups. Must be used when all threads in a
+//                                     threadgroup are cooperatively performing the operation.
+//
 // It is undefined behavior if the number of SIMD groups dispatched does not
 // match the number of SIMD groups that the operation was configured with.
 //
-// Even though each thread in execution scope can potentially independently
-// enter and exit run method, developer cannot assume that threads in execution
-// scope are working completely independently i.e. tensor operation run
-// implementation may need for (for correctness or performance) synchronize
+// Even though each thread in the execution scope can potentially independently
+// enter and exit the run method, the threads in the execution scope
+// do not necessarily operate completely independently. For example, the tensor
+// operation may need (for correctness or performance) to synchronize
 // among the threads in execution scope it was created with.
 //
 //
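Concretely, the scope is just the second template argument; a sketch of the three beta 2 spellings applied to one descriptor:

```metal
constexpr auto desc = matmul2d_descriptor(64, 32);
matmul2d<desc, execution_thread>        threadOp; // one thread; the only scope fragment shaders support
matmul2d<desc, execution_simdgroup>     simdOp;   // all threads of one SIMD group cooperate
matmul2d<desc, execution_simdgroups<4>> tgOp;     // four SIMD groups; the dispatch must supply exactly four
```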
@@ -383,7 +344,7 @@
   mode matmul_mode;
 
 public:
-  constexpr matmul2d_descriptor(int __m, int __n, int __k = dynamic_length_v<int>,
+  constexpr matmul2d_descriptor(int __m, int __n, int __k = static_cast<int>(metal::dynamic_extent),
                                 bool __transpose_left = false,
                                 bool __transpose_right = false,
                                 bool __relaxed_precision = false,
@@ -404,9 +365,9 @@
 {
   static const constant ElementType sum_identity = (ElementType)0;
   static const constant ElementType max_identity =
-      metal::numeric_limits<ElementType>::lowest;
+      metal::numeric_limits<ElementType>::lowest();
   static const constant ElementType min_identity =
-      metal::numeric_limits<ElementType>::max;
+      metal::numeric_limits<ElementType>::max();
 };
 
 #include "__impl/MPPTensorOpsMatMul2dImpl.h"
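Two behavioral fixes close out this header: the constructor's `__k` default is now `metal::dynamic_extent` rather than the old `dynamic_length_v<int>`, and the reduction identities now invoke `numeric_limits<...>::lowest()`/`::max()` instead of naming the member functions without calling them. In practice the new default means a two-argument descriptor defers K to the operands:

```metal
// With the beta 2 default, these two declarations are equivalent;
// K is taken from the operand tensors at run time.
constexpr auto descA = matmul2d_descriptor(64, 32);
constexpr auto descB = matmul2d_descriptor(64, 32, static_cast<int>(metal::dynamic_extent));
```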
diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	2025-09-17 08:16:34
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsMatMul2dImpl.h	2025-09-26 16:27:14
@@ -156,6 +156,26 @@
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype,
     int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_f16(
@@ -211,6 +231,24 @@
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype,
     int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    int threads);
 
 extern "C" EXTERNALLY_DEFINED_ATTR size_t
 __tensorops_impl_matmul2d_op_cooperative_reduction_destination_data_size(
@@ -359,6 +397,26 @@
     int,
     __tensor_ops_detail::__tensor_ops_datatype leftDataType,
     __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+    __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+    __tensor_ops_detail::__tensor_ops_datatype rightDataType);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_f16(
@@ -420,6 +478,26 @@
     int,
     __tensor_ops_detail::__tensor_ops_datatype leftDataType,
     __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+    __tensor_ops_detail::__tensor_ops_datatype rightDataType);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__tensor_ops_tensor_descriptor_type,
+    int,
+    int,
+    __tensor_ops_detail::__tensor_ops_datatype leftDataType,
+    __tensor_ops_detail::__tensor_ops_datatype rightDataType);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f16(
@@ -448,6 +526,15 @@
     __reduction_operation,
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__thread_void_t,
+    bfloat,
+    __reduction_operation,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f16(
@@ -476,6 +563,15 @@
     __reduction_operation,
     __tensor_ops_detail::__tensor_ops_datatype,
     __tensor_ops_detail::__tensor_ops_datatype);
+extern "C" EXTERNALLY_DEFINED_ATTR void
+__tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_b16(
+    __matmul2d_descriptor,
+    __tensor_ops_detail::__const_thread_void_t,
+    __tensor_ops_detail::__thread_void_t,
+    bfloat,
+    __reduction_operation,
+    __tensor_ops_detail::__tensor_ops_datatype,
+    __tensor_ops_detail::__tensor_ops_datatype);
 
 extern "C" EXTERNALLY_DEFINED_ATTR void
 __tensorops_impl_matmul2d_op_run_dv_f16_dv_f16_dv_f16(
@@ -2640,6 +2736,287 @@
     __tensor_ops_detail::__tensor_ops_tensor_descriptor_type
         destinationDescType);
 
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, int threads);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_b16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+extern "C" EXTERNALLY_DEFINED_ATTR void __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(thread __matmul2d_descriptor &desc, thread void *left, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type leftDescType, thread void *right, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type rightDescType, thread void *destination, __tensor_ops_detail::__tensor_ops_tensor_descriptor_type destinationDescType);
+
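For orientation, a note on the symbol scheme used by the declarations above: each `__tensorops_impl_matmul2d_op_run_single_thread_*` specialization encodes the left, right, and destination operands in order, each as an address-space tag (`dv` = device, `th` = thread; the cooperative load/store helpers later in this diff also use `tg` = threadgroup) followed by an element-type tag (`b16` = bfloat, `f16` = half, `f32` = float, `i8` = int8_t). The sketch below is plain host-side C++, not SDK code; it only composes a name to make the convention concrete.

```cpp
#include <iostream>
#include <string>

// Compose the specialization suffix for a (left, right, destination) triple.
// Tags follow the convention visible above: dv/th for the address space,
// b16/f16/f32/i8 for the element type.
std::string run_symbol(const std::string &lSpace, const std::string &lType,
                       const std::string &rSpace, const std::string &rType,
                       const std::string &dSpace, const std::string &dType) {
  return "__tensorops_impl_matmul2d_op_run_single_thread_" + lSpace + "_" +
         lType + "_" + rSpace + "_" + rType + "_" + dSpace + "_" + dType;
}

int main() {
  // device bfloat A, thread int8_t B, thread float C:
  std::cout << run_symbol("dv", "b16", "th", "i8", "th", "f32") << "\n";
  // -> __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32
}
```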
 template <__matmul2d_descriptor descriptor,
           __matmul2d_cooperative_operand_index operand_index, typename scope,
           typename left_operand, typename right_operand, typename element_type,
@@ -2652,9 +3029,10 @@
                 "only destination can be cooperative tensor");
   static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
                     __tensor_ops_detail::__is_same_v<element_type, half> ||
+                    __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
                     __tensor_ops_detail::__is_same_v<element_type, int32_t>,
                 "cooperative tensor data type can only be one of "
-                "float/half/int32_t");
+                "float/half/bfloat/int32_t");
 
   static constant constexpr __tensor_ops_detail::__rank_t rank = 2;
   using element_t = element_type;
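The hunk above (and its rank-1 counterpart further down) widens the element-type constraint for cooperative destination tensors from float/half/int32_t to also admit bfloat. A minimal sketch of the constraint as it now reads, using hypothetical stand-in types and traits so it compiles outside the SDK:

```cpp
#include <cstdint>
#include <type_traits>

// Hypothetical stand-ins for the Metal half/bfloat types and the SDK's
// __tensor_ops_detail::__is_same_v trait.
struct half_t {};
struct bfloat_t {};
template <class A, class B> constexpr bool is_same_v = std::is_same_v<A, B>;

// The constraint as extended by this hunk: bfloat joins float/half/int32_t.
template <class T>
constexpr bool valid_cooperative_element =
    is_same_v<T, float> || is_same_v<T, half_t> ||
    is_same_v<T, bfloat_t> || is_same_v<T, std::int32_t>;

static_assert(valid_cooperative_element<bfloat_t>, "newly accepted");
static_assert(!valid_cooperative_element<std::int8_t>, "still rejected");

int main() {}
```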
@@ -2827,6 +3205,21 @@
         static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
                       "Unsupported address space");
     }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_dv_b16(
+            desc, storage, source, sourceDescType, sourceRank, leftDataType,
+            rightDataType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_load_tg_b16(
+            desc, storage, source, sourceDescType, sourceRank, leftDataType,
+            rightDataType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
     else
       static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
                     "Unsupported type");
@@ -2913,6 +3306,22 @@
         static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
                       "Unsupported address space");
     }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_dv_b16(
+            desc, storage, destination, destinationDescType, leftDataType,
+            rightDataType, threads);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_destination_tensor_store_tg_b16(
+            desc, storage, destination, destinationDescType, leftDataType,
+            rightDataType, threads);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
     else
       static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
                     "Unsupported type");
@@ -3089,9 +3498,10 @@
 {
   static_assert(__tensor_ops_detail::__is_same_v<element_type, float> ||
                     __tensor_ops_detail::__is_same_v<element_type, half> ||
+                    __tensor_ops_detail::__is_same_v<element_type, bfloat> ||
                     __tensor_ops_detail::__is_same_v<element_type, int32_t>,
                 "cooperative tensor data type can only be one of "
-                "float/half/int32_t");
+                "float/half/bfloat/int32_t");
 
   static constant constexpr __tensor_ops_detail::__rank_t rank = 1;
   using element_t = element_type;
@@ -3259,6 +3669,19 @@
         static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
                       "Unsupported address space");
     }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_dv_b16(
+            desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             sourcePtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_load_tg_b16(
+            desc, storage, source, sourceDescType, reduction_dim, threads, leftDataType, rightDataType);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<sourcePtrType>,
+                      "Unsupported address space");
+    }
     else
       static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
                     "Unsupported type");
@@ -3339,6 +3762,20 @@
         static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
                       "Unsupported address space");
     }
+    else if constexpr (__tensor_ops_detail::__is_same_v<elem_t, bfloat>)
+    {
+      if constexpr (__tensor_ops_detail::__is_device_addrspace_v<
+                        destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_dv_b16(
+            desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType);
+      else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<
+                             destinationPtrType>)
+        __tensorops_impl_matmul2d_op_cooperative_reduction_destination_tensor_store_tg_b16(
+            desc, storage, destination, destinationDescType, reduction_dim, threads, leftDataType, rightDataType);
+      else
+        static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>,
+                      "Unsupported address space");
+    }
     else
       static_assert(__tensor_ops_detail::__assert_false_v<elem_t>,
                     "Unsupported type");
@@ -4460,6 +4897,552 @@
               __tensor_ops_detail::__assert_false_v<destinationPtrType>,
               "Unsupported address space");
       }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f32_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_i8_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f32_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f32_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_i8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_i8_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_dv_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_b16_th_f16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_dv_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_dv_f16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else if constexpr (__tensor_ops_detail::__is_thread_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_thread_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_single_thread_th_f16_th_b16_th_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
       else
         static_assert(
             __tensor_ops_detail::__assert_false_v<destinationValueType>,
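The hunk above adds the new bfloat single-thread dispatch cases: each `else if constexpr` branch matches the (left, right, destination) value types plus their device (dv) or thread (th) address spaces and forwards to the corresponding __tensorops_impl_matmul2d_op_run_single_thread_* entry point. The hunk below adds the matching cooperative cases, whose entry points take an extra `threads` argument and also accept threadgroup (tg) operands. The sketch here shows a kernel that the dispatch above would route to the dv_b16 / dv_i8 / dv_f32 case; it is illustrative only, assuming the matmul2d_descriptor and matmul2d types declared earlier in this header, and the descriptor arguments and the execution_thread tag are inferred from the single-thread entry-point names, not confirmed signatures.

    #include <metal_tensor>
    #include <MetalPerformancePrimitives/MetalPerformancePrimitives.h>
    using namespace metal;
    using namespace mpp::tensor_ops;

    // Hypothetical kernel: bfloat A, int8_t B, float C, all in device memory,
    // so the dispatch in the hunk above would select
    // __tensorops_impl_matmul2d_op_run_single_thread_dv_b16_dv_i8_dv_f32.
    kernel void bf16I8MatMulSketch(tensor<device bfloat, dextents<int32_t, 2>> A,
                                   tensor<device int8_t, dextents<int32_t, 2>> B,
                                   tensor<device float,  dextents<int32_t, 2>> C)
    {
      // 8x8x8 tile sizes are placeholders, not values mandated by this header.
      constexpr matmul2d_descriptor desc(8 /*M*/, 8 /*N*/, 8 /*K*/);
      // execution_thread is assumed to select the single-thread path backed by
      // the run_single_thread_* implementations added above.
      matmul2d<desc, execution_thread> op;
      op.run(A, B, C);
    }

With thread (th) operands instead of device (dv) ones, the same chain would land on a th_* variant; the tg_* variants only appear in the cooperative hunk that follows, since threadgroup operands imply more than one participating thread.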
@@ -5394,6 +6377,552 @@
               __tensor_ops_detail::__assert_false_v<destinationPtrType>,
               "Unsupported address space");
       }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f32_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_i8_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f32_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f32_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f32_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f32_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_i8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_i8_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_dv_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_b16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_b16_tg_f16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_b16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f16(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_dv_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_dv_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_dv_f16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<destinationPtrType>)
+          __tensorops_impl_matmul2d_op_run_tg_f16_tg_b16_tg_f32(desc, left, leftDescType, right, rightDescType, destination, destinationDescType, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
       else
         static_assert(
             __tensor_ops_detail::__assert_false_v<destinationValueType>,
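
The hunks above and below reduce to one mechanical shape: an `if constexpr` chain first matches the (left, right, destination) element-type triple, then a nested chain matches the device/threadgroup placement of each tensor, and every leaf forwards to a precompiled entry point whose name encodes both choices (`dv`/`tg` for the address space; `f16`, `b16`, `i8`, `f32` for the element type). When the destination is a `cooperative_tensor` (the hunk that follows), it carries no address space of its own, so those chains branch only on the two operands: four leaves instead of eight. Below is a minimal host-side C++17 sketch of that shape, for illustration only; `addrspace`, `dispatch`, `dispatch_coop`, and the `run_*` stubs are hypothetical stand-ins, not MetalPerformancePrimitives API.

// Illustration only: a plain C++17 analogue of the header's dispatch.
// addrspace, dispatch, dispatch_coop, and the run_* stubs are hypothetical
// stand-ins; the real leaves are the __tensorops_impl_* symbols in the diff.
#include <cstdint>
#include <cstdio>
#include <type_traits>

struct bfloat16 {};                 // stand-in for the Metal bfloat type

enum class addrspace { device, threadgroup };

void run_dv_b16_dv_i8_dv_f32() { std::puts("b16*i8->f32, all device"); }
void run_tg_b16_dv_i8_dv_f32() { std::puts("b16*i8->f32, left in threadgroup"); }
void run_coop_dv_dv() { std::puts("cooperative dest, both operands device"); }
void run_coop_tg_dv() { std::puts("cooperative dest, left in threadgroup"); }
void run_coop_dv_tg() { std::puts("cooperative dest, right in threadgroup"); }
void run_coop_tg_tg() { std::puts("cooperative dest, both in threadgroup"); }

// Non-cooperative destinations: match the type triple, then all three
// address spaces (2 x 2 x 2 = 8 leaves; only two are shown here).
template <class Lhs, class Rhs, class Dst,
          addrspace LhsAS, addrspace RhsAS, addrspace DstAS>
void dispatch()
{
  if constexpr (std::is_same_v<Lhs, bfloat16> &&
                std::is_same_v<Rhs, int8_t> &&
                std::is_same_v<Dst, float>)
  {
    if constexpr (LhsAS == addrspace::device &&
                  RhsAS == addrspace::device &&
                  DstAS == addrspace::device)
      run_dv_b16_dv_i8_dv_f32();
    else if constexpr (LhsAS == addrspace::threadgroup &&
                       RhsAS == addrspace::device &&
                       DstAS == addrspace::device)
      run_tg_b16_dv_i8_dv_f32();
    // ...the remaining six address-space combinations go here...
  }
  // ...one else-if per supported type triple, then a static_assert...
}

// Cooperative destinations: the destination is per-thread state with no
// address space, so only the two operands are dispatched (2 x 2 = 4 leaves).
template <addrspace LhsAS, addrspace RhsAS>
void dispatch_coop()
{
  if constexpr (LhsAS == addrspace::device && RhsAS == addrspace::device)
    run_coop_dv_dv();
  else if constexpr (LhsAS == addrspace::threadgroup && RhsAS == addrspace::device)
    run_coop_tg_dv();
  else if constexpr (LhsAS == addrspace::device && RhsAS == addrspace::threadgroup)
    run_coop_dv_tg();
  else
    run_coop_tg_tg();
}

int main()
{
  dispatch<bfloat16, int8_t, float,
           addrspace::device, addrspace::device, addrspace::device>();
  dispatch_coop<addrspace::threadgroup, addrspace::device>();
}

Because the branching is `if constexpr`, each instantiation references exactly one entry point, and an unsupported combination fails at compile time via the chain-terminating `static_assert` (the header's `__assert_false_v`) rather than falling through at run time.
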
@@ -5881,6 +7410,272 @@
               __tensor_ops_detail::__assert_false_v<destinationPtrType>,
               "Unsupported address space");
       }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f32_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_i8_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, float> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f32_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f32_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, int8_t> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_i8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_i8_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_dv_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_b16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_b16_tg_f16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, bfloat>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_b16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, half>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f16(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
+      else if constexpr (__tensor_ops_detail::__is_same_v<leftValueType, half> &&
+                         __tensor_ops_detail::__is_same_v<rightValueType, bfloat> &&
+                         __tensor_ops_detail::__is_same_v<destinationValueType, float>)
+      {
+        if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                      __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_device_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_dv_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_device_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_dv_f16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else if constexpr (__tensor_ops_detail::__is_threadgroup_addrspace_v<leftPtrType> &&
+                           __tensor_ops_detail::__is_threadgroup_addrspace_v<rightPtrType>)
+          __tensorops_impl_matmul2d_op_run_cooperative_tg_f16_tg_b16_f32(desc, left, leftDescType, right, rightDescType, destination, threads);
+        else
+          static_assert(__tensor_ops_detail::__assert_false_v<destinationPtrType>, "Unsupported address space");
+      }
       else
         static_assert(
             __tensor_ops_detail::__assert_false_v<destinationValueType>,
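The ladder above repeats for every supported value-type combination; within each combination, the left/right address spaces pick one pre-compiled entry point at compile time. Below is a minimal standard-C++ sketch of that if-constexpr dispatch pattern. The names device_tag, threadgroup_tag, always_false_v, and the run_* functions are hypothetical stand-ins for the Metal address spaces and the __tensorops_impl_matmul2d_op_run_cooperative_* symbols; this is an illustration of the technique, not the framework's implementation.

// Hypothetical stand-ins for the Metal address spaces and entry points.
#include <cstdio>
#include <type_traits>

struct device_tag {};      // stands in for the `device` address space
struct threadgroup_tag {}; // stands in for the `threadgroup` address space

// Dependent-false helper so the static_assert only fires when reached.
template <class...> inline constexpr bool always_false_v = false;

void run_dv_dv() { std::puts("device/device kernel"); }
void run_tg_dv() { std::puts("threadgroup/device kernel"); }
void run_dv_tg() { std::puts("device/threadgroup kernel"); }
void run_tg_tg() { std::puts("threadgroup/threadgroup kernel"); }

template <class LeftSpace, class RightSpace>
void run_matmul2d()
{
    // Each branch is resolved at compile time, so only the selected
    // specialized implementation is instantiated -- the same shape as
    // the if-constexpr ladder in the header above.
    if constexpr (std::is_same_v<LeftSpace, device_tag> &&
                  std::is_same_v<RightSpace, device_tag>)
        run_dv_dv();
    else if constexpr (std::is_same_v<LeftSpace, threadgroup_tag> &&
                       std::is_same_v<RightSpace, device_tag>)
        run_tg_dv();
    else if constexpr (std::is_same_v<LeftSpace, device_tag> &&
                       std::is_same_v<RightSpace, threadgroup_tag>)
        run_dv_tg();
    else if constexpr (std::is_same_v<LeftSpace, threadgroup_tag> &&
                       std::is_same_v<RightSpace, threadgroup_tag>)
        run_tg_tg();
    else
        static_assert(always_false_v<LeftSpace>, "Unsupported address space");
}

int main()
{
    run_matmul2d<device_tag, threadgroup_tag>(); // selects run_dv_tg at compile time
}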
@@ -5938,6 +7733,9 @@
   else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, float>)
     __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_f32(
         desc, src, dst, identity, op, leftDataType, rightDataType);
+  else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, bfloat>)
+    __tensorops_impl_matmul2d_op_cooperative_destination_reduce_rows_b16(
+        desc, src, dst, identity, op, leftDataType, rightDataType);
   else
     static_assert(__tensor_ops_detail::__assert_false_v<ElementType>,
                   "Unsupported type");
@@ -5992,6 +7790,9 @@
   else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, float>)
     __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_f32(
         desc, src, dst, identity, op, leftDataType, rightDataType);
+  else if constexpr (__tensor_ops_detail::__is_same_v<ElementType, bfloat>)
+    __tensorops_impl_matmul2d_op_cooperative_destination_reduce_columns_b16(
+        desc, src, dst, identity, op, leftDataType, rightDataType);
   else
     static_assert(__tensor_ops_detail::__assert_false_v<ElementType>,
                   "Unsupported type");
@@ -5999,8 +7800,8 @@
 
 template <class SrcElementType, class DstElementType, class SrcExtents, class DstExtents, class SrcLayout, class DstLayout>
 inline bool __is_iterator_compatible(
-    thread metal::cooperative_tensor<SrcElementType, SrcExtents, SrcLayout> &sourceT,
-    thread metal::cooperative_tensor<DstElementType, DstExtents, DstLayout> &destT)
+    const thread metal::cooperative_tensor<SrcElementType, SrcExtents, SrcLayout> &sourceT,
+    const thread metal::cooperative_tensor<DstElementType, DstExtents, DstLayout> &destT)
 {
   if (!SrcLayout::is_matmul2d_cooperative_destination_layout ||
       !DstLayout::is_matmul2d_reduction_cooperative_destination_layout ||
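The only change to __is_iterator_compatible is const-qualifying its thread references: the function is a pure layout query, and with const parameters it can now be called with const-qualified cooperative tensors as well as mutable ones. A minimal standard-C++ analogue of the fix, with CoopTensor as a hypothetical stand-in for metal::cooperative_tensor:

struct CoopTensor { int size; };

// Before: a non-const reference cannot bind to a const argument.
bool compatible_old(CoopTensor &a, CoopTensor &b) { return a.size == b.size; }

// After: const references accept const and non-const tensors alike and
// document that the query never mutates its operands.
bool compatible_new(const CoopTensor &a, const CoopTensor &b) { return a.size == b.size; }

int main()
{
    const CoopTensor src{16}, dst{16};
    // compatible_old(src, dst);  // ill-formed: const object, non-const reference
    return compatible_new(src, dst) ? 0 : 1;
}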
diff -ruN /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h
--- /Applications/Xcode_26.1.0-beta.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h	2025-09-08 05:52:50
+++ /Applications/Xcode_26.1.0-beta2.app/Contents/Developer/Platforms/iPhoneOS.platform/Developer/SDKs/iPhoneOS.sdk/System/Library/Frameworks/MetalPerformancePrimitives.framework/Headers/__impl/MPPTensorOpsTypes.h	2025-09-26 17:51:19
@@ -130,7 +130,7 @@
 template <typename T, typename U = __tensor_ops_detail::__enable_if_t<__tensor_ops_detail::__is_integral_v<T>>>
 struct dynamic_length
 {
-  static constexpr constant T value = metal::numeric_limits<T>::max();
+    static constexpr constant T value = metal::numeric_limits<T>::max();
 };
 
 template <typename T, typename U = __tensor_ops_detail::__enable_if_t<__tensor_ops_detail::__is_integral_v<T>>>